xml parsing completely rewritten

This commit is contained in:
Martin Brodbeck 2021-02-11 21:58:10 +01:00
parent 1cf8b008d0
commit 90c2d34c03
4 changed files with 326 additions and 400 deletions

19
Cargo.lock generated
View File

@ -148,10 +148,10 @@ dependencies = [
[[package]] [[package]]
name = "pbdbfixer" name = "pbdbfixer"
version = "0.4.0" version = "0.5.0"
dependencies = [ dependencies = [
"quick-xml",
"rusqlite", "rusqlite",
"xml-rs",
"zip", "zip",
] ]
@ -170,6 +170,15 @@ dependencies = [
"unicode-xid", "unicode-xid",
] ]
[[package]]
name = "quick-xml"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0452695941410a58c8ce4391707ba9bad26a247173bd9886a05a5e8a8babec75"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "quote" name = "quote"
version = "1.0.8" version = "1.0.8"
@ -282,12 +291,6 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "xml-rs"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a"
[[package]] [[package]]
name = "zip" name = "zip"
version = "0.5.9" version = "0.5.9"

View File

@ -1,6 +1,6 @@
[package] [package]
name = "pbdbfixer" name = "pbdbfixer"
version = "0.4.0" version = "0.5.0"
authors = ["Martin Brodbeck <martin@brodbeck-online.de>"] authors = ["Martin Brodbeck <martin@brodbeck-online.de>"]
edition = "2018" edition = "2018"
@ -8,7 +8,7 @@ edition = "2018"
[dependencies] [dependencies]
zip = "0.5" zip = "0.5"
xml-rs = "0.8" quick-xml = "0.21"
[dependencies.rusqlite] [dependencies.rusqlite]
version = "0.24" version = "0.24"

250
src/epub.rs Normal file
View File

@ -0,0 +1,250 @@
use std::{
collections::HashMap,
fs::{self, File},
io::Read,
};
use quick_xml::{events::Event, Reader};
use zip::ZipArchive;
/// One author of an EPUB, as read from the package document.
#[derive(Debug)]
pub struct Author {
    /// Author name, taken from the text of a `creator` element.
    pub name: String,
    /// Sort form of the name (the creator's "file-as" value); may be empty.
    pub firstauthor: String,
}
/// Metadata collected from the package document of one EPUB file.
#[derive(Debug)]
pub struct EpubMetadata {
    /// Authors found in the package document.
    pub authors: Vec<Author>,
    /// Text of a `subject` element; empty when none was found.
    pub genre: String,
}

impl EpubMetadata {
    /// Creates a record with no authors and an empty genre.
    fn new() -> Self {
        Self {
            authors: vec![],
            genre: String::default(),
        }
    }
}
/// Returns the archive-internal path of the OPF package document.
///
/// The path is read from the `full-path` attribute of the first `rootfile`
/// element in `META-INF/container.xml` that carries a readable one; XML
/// escapes in the value are resolved.
///
/// An empty string is returned when the container file is missing, cannot be
/// read as UTF-8, fails to parse, or names no root file. This is the same
/// value a container without any `rootfile` element has always produced.
fn get_rootfile(archive: &mut ZipArchive<File>) -> String {
    let mut xml_str_buffer = String::new();
    match archive.by_name("META-INF/container.xml") {
        Ok(mut container) => {
            if container.read_to_string(&mut xml_str_buffer).is_err() {
                return String::new();
            }
        }
        Err(_) => return String::new(),
    }

    let mut reader = Reader::from_str(&xml_str_buffer);
    reader.trim_text(true);

    let mut buf = Vec::new();
    loop {
        match reader.read_event(&mut buf) {
            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) if e.local_name() == b"rootfile" => {
                // Malformed attributes are skipped instead of aborting the program.
                let full_path = e
                    .attributes()
                    .filter_map(Result::ok)
                    .find(|attr| attr.key == b"full-path")
                    .and_then(|attr| attr.unescape_and_decode_value(&reader).ok());
                if let Some(path) = full_path {
                    return path;
                }
            }
            // Stop on a parse error as well: retrying a broken document could
            // keep yielding errors and never reach `Eof`.
            Ok(Event::Eof) | Err(_) => return String::new(),
            _ => (),
        }
        // The event borrowed `buf`; release its contents before the next read.
        buf.clear();
    }
}
/// Reads author and genre metadata from the EPUB file at `filename`.
///
/// The package document named by the container is scanned once:
///
/// * A `package` element whose `version` attribute starts with `3` switches
///   to EPUB3 handling.
/// * EPUB2: each `creator` element is one author record. Its `*file-as`
///   attribute gives the sort form and its `*role` attribute the role
///   (defaulting to `aut` when absent).
/// * EPUB3: `creator` elements are keyed by `#id`; `meta` elements with a
///   `refines` attribute and a property ending in `file-as` or `role` supply
///   the sort form and role of the record they refine.
/// * The text of a `subject` element becomes the genre (a later one replaces
///   an earlier one).
///
/// Only records with role `aut` and a non-empty name are returned, in the
/// order they first appear in the document. Text is trimmed and XML escapes
/// are resolved. Scanning stops at the first parse error and whatever was
/// collected up to that point is returned.
///
/// Returns `None` when the file cannot be opened, is not a readable zip
/// archive, or the package document is missing or not valid UTF-8.
pub fn get_epub_metadata(filename: &str) -> Option<EpubMetadata> {
    // Creator data gathered while walking the package document.
    #[derive(Debug, Default)]
    struct XmlAut {
        name: String,
        sort: String,
        role: String,
    }

    // Which value the next text event belongs to.
    #[derive(Clone, Copy, PartialEq)]
    enum Capture {
        Nothing,
        Name,
        Sort,
        Role,
        Genre,
    }

    // Returns the record stored under `key`, appending an empty one on first
    // use so that records keep document order.
    fn author_slot<'a>(
        authors: &'a mut Vec<XmlAut>,
        index_by_key: &mut HashMap<String, usize>,
        key: &str,
    ) -> &'a mut XmlAut {
        let next = authors.len();
        let idx = *index_by_key.entry(key.to_string()).or_insert(next);
        if idx == next {
            authors.push(XmlAut::default());
        }
        &mut authors[idx]
    }

    let file = fs::File::open(filename).ok()?;
    let mut archive = ZipArchive::new(file).ok()?;
    let opf_filename = get_rootfile(&mut archive);

    let mut xml_str_buffer = String::new();
    archive
        .by_name(&opf_filename)
        .ok()?
        .read_to_string(&mut xml_str_buffer)
        .ok()?;

    let mut reader = Reader::from_str(&xml_str_buffer);
    // Pretty-printed documents would otherwise leak indentation into names.
    reader.trim_text(true);
    let mut buf = Vec::new();

    let mut epub_meta = EpubMetadata::new();
    let mut xml_authors: Vec<XmlAut> = Vec::new();
    let mut index_by_key: HashMap<String, usize> = HashMap::new();
    let mut curr_key = String::new();
    let mut capture = Capture::Nothing;
    let mut anonymous_creators = 0usize;
    let mut is_epub3 = false;

    loop {
        match reader.read_event(&mut buf) {
            // See if we have EPUB3 or EPUB2
            Ok(Event::Start(ref e)) if e.local_name() == b"package" => {
                if e
                    .attributes()
                    .filter_map(Result::ok)
                    .any(|attr| attr.key == b"version" && attr.value.starts_with(b"3"))
                {
                    is_epub3 = true;
                }
            }
            Ok(Event::Start(ref e)) if e.local_name() == b"creator" => {
                let mut id = None;
                let mut sort = None;
                let mut role = None;
                for attr in e.attributes().filter_map(Result::ok) {
                    if is_epub3 {
                        if attr.key == b"id" {
                            id = attr.unescape_and_decode_value(&reader).ok();
                        }
                    } else if attr.key.ends_with(b"file-as") {
                        sort = attr.unescape_and_decode_value(&reader).ok();
                    } else if attr.key.ends_with(b"role") {
                        role = attr.unescape_and_decode_value(&reader).ok();
                    }
                }

                // Every creator gets its own record. Creators that cannot be
                // referenced by id (all EPUB2 ones, EPUB3 ones without `id`)
                // receive a generated key instead of sharing one.
                curr_key = match id {
                    Some(id) => format!("#{}", id),
                    None => {
                        anonymous_creators += 1;
                        format!("creator-{}", anonymous_creators)
                    }
                };
                let author = author_slot(&mut xml_authors, &mut index_by_key, &curr_key);
                if !is_epub3 {
                    author.role = role.unwrap_or_else(|| "aut".to_string());
                    if let Some(sort) = sort {
                        author.sort = sort;
                    }
                }
                // NOTE(review): an EPUB3 creator without a `role` refinement keeps
                // an empty role and is dropped by the `aut` filter below, as
                // before; confirm whether it should default to author.
                capture = Capture::Name;
            }
            Ok(Event::Start(ref e)) if is_epub3 && e.local_name() == b"meta" => {
                let mut refines = None;
                let mut target = Capture::Nothing;
                for attr in e.attributes().filter_map(Result::ok) {
                    if attr.key == b"refines" {
                        refines = attr.unescape_and_decode_value(&reader).ok();
                    } else if attr.key == b"property" {
                        if attr.value.ends_with(b"file-as") {
                            target = Capture::Sort;
                        } else if attr.value.ends_with(b"role") {
                            target = Capture::Role;
                        }
                    }
                }
                if let Some(key) = refines {
                    if target != Capture::Nothing {
                        curr_key = key;
                        capture = target;
                    }
                }
            }
            Ok(Event::Start(ref e)) if e.local_name() == b"subject" => {
                capture = Capture::Genre;
            }
            Ok(Event::Text(ref e)) if capture != Capture::Nothing => {
                // Fall back to the raw bytes when an escape cannot be resolved.
                let text = e
                    .unescape_and_decode(&reader)
                    .unwrap_or_else(|_| String::from_utf8_lossy(&e.to_vec()).into_owned());
                match capture {
                    Capture::Name => {
                        author_slot(&mut xml_authors, &mut index_by_key, &curr_key).name = text
                    }
                    Capture::Sort => {
                        author_slot(&mut xml_authors, &mut index_by_key, &curr_key).sort = text
                    }
                    Capture::Role => {
                        author_slot(&mut xml_authors, &mut index_by_key, &curr_key).role = text
                    }
                    Capture::Genre => epub_meta.genre = text,
                    Capture::Nothing => (),
                }
                capture = Capture::Nothing;
            }
            // An element that closes without text must not hand its pending
            // capture to the text of some later, unrelated element.
            Ok(Event::End(_)) => capture = Capture::Nothing,
            Ok(Event::Eof) | Err(_) => break,
            _ => (),
        }
        // The event borrowed `buf`; release its contents before the next read.
        buf.clear();
    }

    epub_meta.authors = xml_authors
        .into_iter()
        .filter(|author| author.role == "aut" && !author.name.is_empty())
        .map(|author| Author {
            name: author.name,
            firstauthor: author.sort,
        })
        .collect();

    Some(epub_meta)
}

View File

@ -1,338 +1,8 @@
mod epub;
mod pocketbook; mod pocketbook;
use rusqlite::{named_params, Connection, Result, Transaction, NO_PARAMS}; use rusqlite::{named_params, Connection, Transaction, NO_PARAMS};
use std::{collections::HashMap, fs::File}; use std::usize;
use std::{error::Error, io::Read};
use std::{io::BufReader, usize};
use xml::reader::{EventReader, ParserConfig, XmlEvent};
use zip::{read::ZipFile, ZipArchive};
/// Finds the package document path declared in an EPUB container file.
///
/// `container` is read fully as text, a leading byte-order mark is skipped,
/// and the value of the `full-path` attribute of the first `rootfile` element
/// that has one is returned.
///
/// Returns `Ok(None)` when no such attribute exists.
///
/// # Errors
///
/// Returns the I/O error when `container` cannot be read as UTF-8 text, or the
/// parser error when the XML is malformed.
fn get_root_file(mut container: ZipFile) -> Result<Option<String>, Box<dyn Error>> {
    let mut buf = String::new();
    // Propagate read failures to the caller instead of panicking here.
    container.read_to_string(&mut buf)?;

    // Get rid of the BOM mark, if any (borrowing; no second allocation).
    let xml = buf.strip_prefix("\u{feff}").unwrap_or(&buf);

    let parser = EventReader::new(BufReader::new(xml.as_bytes()));
    for e in parser {
        match e {
            Ok(XmlEvent::StartElement {
                name, attributes, ..
            }) if name.local_name == "rootfile" => {
                if let Some(attr) = attributes
                    .into_iter()
                    .find(|attr| attr.name.local_name == "full-path")
                {
                    return Ok(Some(attr.value));
                }
            }
            Err(e) => {
                return Err(Box::new(e));
            }
            _ => {}
        }
    }

    Ok(None)
}
/// Values collected from `meta` elements that refine one creator id.
struct Refine {
    /// Text of the refinement with property `role`.
    role: String,
    /// Text of the refinement with property `file-as`.
    file_as: String,
}
/// Extracts the author sort string ("file-as") from a package document.
///
/// If any `creator` element carries a `file-as` attribute, the value of the
/// first one encountered is returned immediately. Otherwise, once a `package`
/// element with a `version` starting with "3" has been seen, the ids of
/// `creator` elements are remembered and the text of `meta` elements refining
/// one of those ids is collected per id: the `file-as` text, and the `role`
/// text when it equals "aut".
///
/// With exactly one collected entry its `file_as` is returned as is; with two
/// or more, the `file_as` values of entries whose role is "aut" are joined
/// with " & ". Returns `None` when nothing was collected.
///
/// A parser error ends the scan; whatever was collected so far is used.
fn get_attribute_file_as(opf: ZipFile) -> Option<String> {
let parser = ParserConfig::new()
.trim_whitespace(true)
.ignore_comments(true)
.coalesce_characters(true)
.create_reader(opf);
let mut is_epub3 = false;
// Creator ids, stored with a leading '#' so they compare equal to `refines` values.
let mut creator_ids = Vec::new();
// Set when the next text belongs to a `file-as` refinement.
let mut refines_found = false;
// Set when the next text belongs to a `role` refinement.
let mut role_found = false;
// Refinement values keyed by the refined creator id.
let mut refine_entries = HashMap::new();
let mut curr_id = String::new();
for e in parser {
match e {
// Detect the package version.
Ok(XmlEvent::StartElement {
name, attributes, ..
}) if name.local_name == "package" => {
for attr in attributes {
if attr.name.local_name == "version" {
if attr.value.starts_with("3") == true {
is_epub3 = true;
}
}
}
}
Ok(XmlEvent::StartElement {
name, attributes, ..
}) if name.local_name == "creator" => {
for attr in attributes {
// A `file-as` attribute on the creator ends the search right away.
if attr.name.local_name == "file-as" {
return Some(attr.value);
}
if is_epub3 && attr.name.local_name == "id" {
creator_ids.push("#".to_owned() + attr.value.as_str());
}
}
}
// Only `meta` elements refining a known creator id are of interest.
Ok(XmlEvent::StartElement {
name, attributes, ..
}) if name.local_name == "meta" => {
if attributes.iter().any(|attr| {
attr.name.local_name == "refines" && creator_ids.contains(&attr.value)
}) && attributes
.iter()
.any(|attr| attr.name.local_name == "property" && attr.value == "file-as")
{
refines_found = true;
curr_id = attributes
.iter()
.find(|a| a.name.local_name == "refines")
.unwrap()
.value
.clone();
} else if attributes.iter().any(|attr| {
attr.name.local_name == "refines" && creator_ids.contains(&attr.value)
}) && attributes
.iter()
.any(|attr| attr.name.local_name == "property" && attr.value == "role")
{
role_found = true;
curr_id = attributes
.iter()
.find(|a| a.name.local_name == "refines")
.unwrap()
.value
.clone();
}
}
// Text is assigned to the pending refinement; a pending role wins over
// a pending file-as.
Ok(XmlEvent::Characters(value)) => {
if role_found == true {
// Roles other than "aut" are not recorded.
if value == "aut" {
let entry = refine_entries.entry(curr_id.clone()).or_insert(Refine {
role: "".to_string(),
file_as: "".to_string(),
});
entry.role = value;
}
role_found = false;
} else if refines_found == true {
let entry = refine_entries.entry(curr_id.clone()).or_insert(Refine {
role: "".to_string(),
file_as: "".to_string(),
});
entry.file_as = value;
refines_found = false;
}
}
// Any other start element cancels a pending file-as capture
// (a pending role capture is left untouched).
Ok(XmlEvent::StartElement { .. }) => {
if refines_found == true {
refines_found = false;
}
}
// Stop scanning on the first parser error.
Err(_e) => {
break;
}
_ => {}
}
}
if refine_entries.len() == 1 {
// A single entry is returned without checking its role.
return Some(refine_entries.values().next().unwrap().file_as.clone());
} else if refine_entries.len() >= 2 {
return Some(
refine_entries
.values()
.into_iter()
.filter(|v| v.role == "aut")
.map(|v| v.file_as.clone())
.collect::<Vec<String>>()
.join(" & "),
);
}
None
}
/// A `creator` entry: the role assigned to it and its name.
struct Creator {
    /// Role code of the creator; entries with "aut" are treated as authors.
    role: String,
    /// Text content of the `creator` element.
    name: String,
}
/// Extracts the author names from a package document, joined with ", ".
///
/// When the `package` version does not start with "3", every `creator`
/// element becomes an entry whose role is its `role` attribute (or "aut" when
/// there is none) and whose name is the text that follows. Otherwise creator
/// names are stored per `#id`, and roles come from `meta` elements that refine
/// such an id with property `role`.
///
/// Only entries with role "aut" contribute to the result. Returns `None` when
/// no entry was collected at all; if entries exist but none has role "aut",
/// the result is `Some` of an empty string.
///
/// A parser error is printed and ends the scan; whatever was collected so far
/// is used.
fn get_attribute_creator(opf: ZipFile) -> Option<String> {
let parser = ParserConfig::new()
.trim_whitespace(true)
.ignore_comments(true)
.coalesce_characters(true)
.create_reader(opf);
let mut is_epub3 = false;
// Starts out true; the catch-all start-element arm below clears it.
let mut creator_found = true;
// Creator ids, stored with a leading '#' so they compare equal to `refines` values.
let mut creator_ids = Vec::new();
let mut role_found = false;
// Entries keyed by creator id (version 3 packages).
let mut creator_entries = HashMap::new();
// Entries in document order (all other packages).
let mut epub2_creator_entries = Vec::new();
let mut curr_id = String::new();
for e in parser {
match e {
// Detect the package version.
Ok(XmlEvent::StartElement {
name, attributes, ..
}) if name.local_name == "package" => {
for attr in attributes {
if attr.name.local_name == "version" {
if attr.value.starts_with("3") == true {
is_epub3 = true;
}
}
}
}
Ok(XmlEvent::StartElement {
name, attributes, ..
}) if name.local_name == "creator" => {
creator_found = true;
if !is_epub3 {
// The role comes from the attribute; a creator without one counts as "aut".
match attributes
.iter()
.find(|attr| attr.name.local_name == "role")
{
Some(attr) => {
epub2_creator_entries.push(Creator {
role: attr.value.clone(),
name: "".to_string(),
});
}
None => {
epub2_creator_entries.push(Creator {
role: "aut".to_string(),
name: "".to_string(),
});
}
}
}
for attr in attributes {
if is_epub3 && attr.name.local_name == "id" {
creator_ids.push("#".to_owned() + attr.value.as_str());
//creator_entries.insert(attr.value.clone(), Creator{role: "".to_string(), name: "".to_string()});
curr_id = "#".to_owned() + attr.value.as_str();
}
}
}
// A `meta` element refining a known creator id with property `role`
// announces that creator's role text.
Ok(XmlEvent::StartElement {
name, attributes, ..
}) if name.local_name == "meta" => {
if attributes.iter().any(|attr| {
attr.name.local_name == "refines" && creator_ids.contains(&attr.value)
}) && attributes
.iter()
.any(|attr| attr.name.local_name == "property" && attr.value == "role")
{
role_found = true;
curr_id = attributes
.iter()
.find(|a| a.name.local_name == "refines")
.unwrap()
.value
.clone();
}
}
Ok(XmlEvent::Characters(value)) => {
if creator_found && is_epub3 == false {
// `unwrap` panics if no creator entry has been pushed yet.
// `creator_found` is not cleared in this branch.
epub2_creator_entries.last_mut().unwrap().name = value.clone();
} else if creator_found && is_epub3 == true {
let entry = creator_entries.entry(curr_id.clone()).or_insert(Creator {
role: "".to_string(),
name: "".to_string(),
});
entry.name = value;
creator_found = false;
} else if role_found == true {
let entry = creator_entries.entry(curr_id.clone()).or_insert(Creator {
role: "".to_string(),
name: "".to_string(),
});
entry.role = value;
role_found = false;
}
}
// Any other start element ends a pending creator-name capture.
Ok(XmlEvent::StartElement { .. }) => {
if creator_found == true {
creator_found = false;
}
}
// Report the parser error and stop scanning.
Err(e) => {
println!("{}", e);
break;
}
_ => {}
}
}
if !is_epub3 && epub2_creator_entries.len() >= 1 {
return Some(
epub2_creator_entries
.into_iter()
.filter(|v| v.role == "aut")
.map(|v| v.name.clone())
.collect::<Vec<String>>()
.join(", "),
);
} else if creator_entries.len() >= 1 {
return Some(
creator_entries
.values()
.into_iter()
.filter(|v| v.role == "aut")
.map(|v| v.name.clone())
.collect::<Vec<String>>()
.join(", "),
);
}
None
}
/// Returns the text that directly follows the first `subject` start tag.
///
/// Yields `None` when no such text exists. A parser error is printed and
/// ends the scan.
fn get_attribute_genre(opf: ZipFile) -> Option<String> {
    let events = ParserConfig::new()
        .trim_whitespace(true)
        .ignore_comments(true)
        .coalesce_characters(true)
        .create_reader(opf);

    // True only while the most recent start tag was a `subject`.
    let mut inside_subject = false;

    for event in events {
        match event {
            Ok(XmlEvent::StartElement { name, .. }) => {
                inside_subject = name.local_name == "subject";
            }
            Ok(XmlEvent::Characters(text)) if inside_subject => return Some(text),
            Err(err) => {
                println!("{}", err);
                break;
            }
            _ => {}
        }
    }

    None
}
struct BookEntry { struct BookEntry {
id: i32, id: i32,
@ -460,67 +130,70 @@ fn fix_db_entries(tx: &Transaction, book_entries: &Vec<BookEntry>) -> Statistics
continue; continue;
} }
let file = File::open(entry.filepath.as_str()); if let Some(epub_metadata) = epub::get_epub_metadata(&entry.filepath) {
let file = match file { let authors = epub_metadata
Err(_) => continue, .authors
Ok(file) => file, .iter()
}; .filter(|aut| aut.firstauthor.len() > 0)
.collect::<Vec<_>>();
let mut archive = ZipArchive::new(BufReader::new(file)).unwrap(); // Fix firstauthor…
let firstauthors = authors
.iter()
.map(|aut| aut.firstauthor.clone())
.collect::<Vec<_>>();
if !firstauthors.iter().all(|s| entry.firstauthor.contains(s)) {
let mut stmt = tx
.prepare("UPDATE books_impl SET firstauthor = :file_as WHERE id = :book_id")
.unwrap();
stmt.execute_named(
named_params![":file_as": firstauthors.join(" & "), ":book_id": entry.id],
)
.unwrap();
stat.authors_fixed = stat.authors_fixed + 1;
let container = archive.by_name("META-INF/container.xml").unwrap(); println!("{}", firstauthors.join(" & "));
if let Some(opf_file) = get_root_file(container).unwrap() {
let opf = archive.by_name(opf_file.as_str()).unwrap();
// firstauthor…
if let Some(file_as) = get_attribute_file_as(opf) {
if !file_as.split(" & ").all(|s| entry.firstauthor.contains(s)) {
let mut stmt = tx
.prepare("UPDATE books_impl SET firstauthor = :file_as WHERE id = :book_id")
.unwrap();
stmt.execute_named(named_params![":file_as": file_as, ":book_id": entry.id])
.unwrap();
stat.authors_fixed = stat.authors_fixed + 1;
}
} }
let opf = archive.by_name(opf_file.as_str()).unwrap();
// author… // Fix author names…
if let Some(creator) = get_attribute_creator(opf) { let authornames = authors
if !creator.split(", ").all(|s| entry.author.contains(s)) .iter()
|| creator.len() < entry.author.len() .map(|aut| aut.name.clone())
{ .collect::<Vec<_>>();
let mut stmt = tx if !authornames.iter().all(|s| entry.author.contains(s)) {
.prepare("UPDATE books_impl SET author = :creator WHERE id = :book_id") let mut stmt = tx
.unwrap(); .prepare("UPDATE books_impl SET author = :authors WHERE id = :book_id")
stmt.execute_named(named_params![":creator": creator, ":book_id": entry.id]) .unwrap();
.unwrap(); stmt.execute_named(
stat.authors_fixed = stat.authors_fixed + 1; named_params![":authors": authornames.join(", "), ":book_id": entry.id],
} )
.unwrap();
stat.authors_fixed = stat.authors_fixed + 1;
println!("{}", authornames.join(" & "));
} }
// genre…
if entry.genre.is_empty() { if entry.genre.is_empty() && epub_metadata.genre.len() > 0 {
let opf = archive.by_name(opf_file.as_str()).unwrap(); let mut stmt = tx
if let Some(genre) = get_attribute_genre(opf) { .prepare(r#"INSERT INTO genres (name) SELECT :genre ON CONFLICT DO NOTHING"#)
let mut stmt = tx .unwrap();
.prepare( stmt.execute_named(named_params![":genre": &epub_metadata.genre])
r#"INSERT INTO genres (name) SELECT :genre ON CONFLICT DO NOTHING"#, .unwrap();
) let mut stmt = tx
.unwrap(); .prepare(
stmt.execute_named(named_params![":genre": &genre]).unwrap(); r#"
let mut stmt = tx INSERT INTO booktogenre (bookid, genreid)
.prepare( VALUES (:bookid,
r#" (SELECT id FROM genres WHERE name = :genre)
INSERT INTO booktogenre (bookid, genreid) )
VALUES (:bookid, ON CONFLICT DO NOTHING"#,
(SELECT id FROM genres WHERE name = :genre) )
) .unwrap();
ON CONFLICT DO NOTHING"#, stmt.execute_named(
) named_params![":bookid": &entry.id, ":genre": &epub_metadata.genre],
.unwrap(); )
stmt.execute_named(named_params![":bookid": &entry.id, ":genre": &genre]) .unwrap();
.unwrap(); stat.genres_fixed = stat.genres_fixed + 1;
stat.genres_fixed = stat.genres_fixed + 1;
}
} }
} }
} }