From 90c2d34c03f57625c0ffbbd205a4ec573b525772 Mon Sep 17 00:00:00 2001 From: Martin Brodbeck Date: Thu, 11 Feb 2021 21:58:10 +0100 Subject: [PATCH] xml parsing completely rewritten --- Cargo.lock | 19 ++- Cargo.toml | 4 +- src/epub.rs | 250 +++++++++++++++++++++++++++++ src/main.rs | 453 ++++++++-------------------------------------------- 4 files changed, 326 insertions(+), 400 deletions(-) create mode 100644 src/epub.rs diff --git a/Cargo.lock b/Cargo.lock index 679cb60..4d848cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -148,10 +148,10 @@ dependencies = [ [[package]] name = "pbdbfixer" -version = "0.4.0" +version = "0.5.0" dependencies = [ + "quick-xml", "rusqlite", - "xml-rs", "zip", ] @@ -170,6 +170,15 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "quick-xml" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0452695941410a58c8ce4391707ba9bad26a247173bd9886a05a5e8a8babec75" +dependencies = [ + "memchr", +] + [[package]] name = "quote" version = "1.0.8" @@ -282,12 +291,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" -[[package]] -name = "xml-rs" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a" - [[package]] name = "zip" version = "0.5.9" diff --git a/Cargo.toml b/Cargo.toml index 8700c1f..f7b9656 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pbdbfixer" -version = "0.4.0" +version = "0.5.0" authors = ["Martin Brodbeck "] edition = "2018" @@ -8,7 +8,7 @@ edition = "2018" [dependencies] zip = "0.5" -xml-rs = "0.8" +quick-xml = "0.21" [dependencies.rusqlite] version = "0.24" diff --git a/src/epub.rs b/src/epub.rs new file mode 100644 index 0000000..a640d47 --- /dev/null +++ b/src/epub.rs @@ -0,0 +1,250 @@ +use std::{ + collections::HashMap, + fs::{self, File}, + io::Read, +}; + +use quick_xml::{events::Event, Reader}; +use zip::ZipArchive; + +#[derive(Debug)] +pub struct Author { + pub name: String, + pub firstauthor: String, +} + +#[derive(Debug)] +pub struct EpubMetadata { + pub authors: Vec, + pub genre: String, +} + +impl EpubMetadata { + fn new() -> Self { + EpubMetadata { + authors: Vec::new(), + genre: String::new(), + } + } +} + +fn get_rootfile(archive: &mut ZipArchive) -> String { + let mut container = archive.by_name("META-INF/container.xml").unwrap(); + let mut xml_str_buffer = String::new(); + + container.read_to_string(&mut xml_str_buffer).unwrap(); + + let mut reader = Reader::from_str(&xml_str_buffer); + reader.trim_text(true); + + let mut buf = Vec::new(); + let mut opf_filename = String::new(); + + loop { + match reader.read_event(&mut buf) { + Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) if e.local_name() == b"rootfile" => { + opf_filename = String::from_utf8( + e.attributes() + .filter(|attr| attr.as_ref().unwrap().key == b"full-path") + .next() + .unwrap() + .unwrap() + .value + .to_vec(), + ) + .unwrap(); + break; + } + Ok(Event::Eof) => break, + _ => (), + } + } + opf_filename +} + +pub fn get_epub_metadata(filename: &str) -> Option { + let mut epub_meta = EpubMetadata::new(); + let file = fs::File::open(&filename); + + let file = match file { + Err(_) => return None, + Ok(file) => file, + }; + + let mut archive = ZipArchive::new(file).unwrap(); + + let opf_filename = get_rootfile(&mut archive); + + let mut xml_str_buffer = String::new(); + let mut opf = archive.by_name(&opf_filename).unwrap(); + opf.read_to_string(&mut xml_str_buffer).unwrap(); + + let mut reader = Reader::from_str(&xml_str_buffer); + let mut buf = Vec::new(); + + let mut curr_id = String::new(); + let mut creator_found = false; + let mut file_as_found = false; + let mut role_found = false; + let mut genre_found = false; + let mut is_epub3 = false; + + #[derive(Debug)] + struct XmlAut { + name: String, + sort: String, + role: String, + } + + let mut xml_authors = HashMap::new(); + + loop { + match reader.read_event(&mut buf) { + // See if we have EPUB3 or EPUB2 + Ok(Event::Start(ref e)) if e.local_name() == b"package" => { + if e.attributes().any(|attr| { + attr.as_ref().unwrap().key == b"version" + && attr.as_ref().unwrap().value.starts_with(b"3") + }) { + is_epub3 = true; + } + } + Ok(Event::Start(ref e)) if e.local_name() == b"creator" => { + creator_found = true; + if is_epub3 { + if let Some(idval) = e + .attributes() + .filter(|attr| attr.as_ref().unwrap().key == b"id") + .next() + { + curr_id = "#".to_string() + + String::from_utf8(idval.unwrap().value.to_vec()) + .unwrap() + .as_str(); + xml_authors.insert( + curr_id.clone(), + XmlAut { + name: "".to_string(), + sort: "".to_string(), + role: "".to_string(), + }, + ); + } + } else { + if let Some(file_as_val) = e + .attributes() + .filter(|attr| attr.as_ref().unwrap().key.ends_with(b"file-as")) + .next() + { + let ns = + String::from_utf8(file_as_val.as_ref().unwrap().key.to_vec()).unwrap(); + curr_id = "none".to_string() + ns.split(':').collect::>()[0]; + let entry = xml_authors.entry(curr_id.clone()).or_insert(XmlAut { + name: "".to_string(), + sort: "".to_string(), + role: "".to_string(), + }); + entry.sort = file_as_val + .unwrap() + .unescape_and_decode_value(&reader) + .unwrap_or_default(); + entry.role = "aut".to_string(); + } + if let Some(role_val) = e + .attributes() + .filter(|attr| attr.as_ref().unwrap().key.ends_with(b"role")) + .next() + { + let ns = + String::from_utf8(role_val.as_ref().unwrap().key.to_vec()).unwrap(); + curr_id = "none".to_string() + ns.split(':').collect::>()[0]; + } + } + } + Ok(Event::Text(ref e)) if creator_found => { + if is_epub3 { + let entry = xml_authors.entry(curr_id.clone()).or_insert(XmlAut { + name: "".to_string(), + sort: "".to_string(), + role: "".to_string(), + }); + entry.name = String::from_utf8(e.to_vec()).unwrap(); + } else { + let entry = xml_authors.entry(curr_id.clone()).or_insert(XmlAut { + name: "".to_string(), + sort: "".to_string(), + role: "".to_string(), + }); + entry.name = String::from_utf8(e.to_vec()).unwrap(); + entry.role = "aut".to_string(); + } + + creator_found = false; + } + Ok(Event::Start(ref e)) if e.local_name() == b"meta" && is_epub3 => { + if let Some(refines) = e + .attributes() + .filter(|attr| attr.as_ref().unwrap().key == b"refines") + .next() + { + if e.attributes().any(|attr| { + attr.as_ref().unwrap().key == b"property" + && attr.as_ref().unwrap().value.ends_with(b"file-as") + }) { + curr_id = String::from_utf8(refines.unwrap().value.to_vec()).unwrap(); + file_as_found = true; + } else if e.attributes().any(|attr| { + attr.as_ref().unwrap().key == b"property" + && attr.as_ref().unwrap().value.ends_with(b"role") + }) { + curr_id = String::from_utf8(refines.unwrap().value.to_vec()).unwrap(); + role_found = true; + } + } + } + Ok(Event::Text(ref e)) if file_as_found && is_epub3 => { + let entry = xml_authors.entry(curr_id.clone()).or_insert(XmlAut { + name: "".to_string(), + sort: "".to_string(), + role: "".to_string(), + }); + entry.sort = String::from_utf8(e.to_vec()).unwrap(); + + file_as_found = false; + } + Ok(Event::Text(ref e)) if role_found && is_epub3 => { + let entry = xml_authors.entry(curr_id.clone()).or_insert(XmlAut { + name: "".to_string(), + sort: "".to_string(), + role: "".to_string(), + }); + entry.role = String::from_utf8(e.to_vec()).unwrap(); + + role_found = false; + } + Ok(Event::Start(ref e)) if e.local_name() == b"subject" => { + genre_found = true; + } + Ok(Event::Text(ref e)) if genre_found => { + //epub_meta.genre = String::from_utf8(e.to_vec()).unwrap(); + epub_meta.genre = e.unescape_and_decode(&reader).unwrap(); + genre_found = false; + } + Ok(Event::Eof) => break, + _ => (), + } + } + + //println!("{:?}", &xml_authors); + + epub_meta.authors = xml_authors + .into_iter() + .filter(|&(_, ref xml_author)| &xml_author.role == "aut" && &xml_author.name.len() > &0) + .map(|(_key, value)| Author { + name: value.name, + firstauthor: value.sort, + }) + .collect(); + + Some(epub_meta) +} diff --git a/src/main.rs b/src/main.rs index 1bb944d..06d25f3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,338 +1,8 @@ +mod epub; mod pocketbook; -use rusqlite::{named_params, Connection, Result, Transaction, NO_PARAMS}; -use std::{collections::HashMap, fs::File}; -use std::{error::Error, io::Read}; -use std::{io::BufReader, usize}; -use xml::reader::{EventReader, ParserConfig, XmlEvent}; -use zip::{read::ZipFile, ZipArchive}; - -fn get_root_file(mut container: ZipFile) -> Result, Box> { - let mut buf = String::new(); - container.read_to_string(&mut buf).unwrap(); - - // Get rid of the BOM mark, if any - if buf.starts_with("\u{feff}") { - buf = buf.strip_prefix("\u{feff}").unwrap().to_owned(); - } - - let parser = EventReader::new(BufReader::new(buf.as_bytes())); - - for e in parser { - match e { - Ok(XmlEvent::StartElement { - name, attributes, .. - }) if name.local_name == "rootfile" => { - for attr in attributes { - if attr.name.local_name == "full-path" { - return Ok(Some(attr.value)); - } - } - } - Err(e) => { - return Err(Box::new(e)); - } - _ => {} - } - } - Ok(None) -} - -struct Refine { - role: String, - file_as: String, -} - -fn get_attribute_file_as(opf: ZipFile) -> Option { - let parser = ParserConfig::new() - .trim_whitespace(true) - .ignore_comments(true) - .coalesce_characters(true) - .create_reader(opf); - - let mut is_epub3 = false; - let mut creator_ids = Vec::new(); - let mut refines_found = false; - let mut role_found = false; - let mut refine_entries = HashMap::new(); - let mut curr_id = String::new(); - - for e in parser { - match e { - Ok(XmlEvent::StartElement { - name, attributes, .. - }) if name.local_name == "package" => { - for attr in attributes { - if attr.name.local_name == "version" { - if attr.value.starts_with("3") == true { - is_epub3 = true; - } - } - } - } - Ok(XmlEvent::StartElement { - name, attributes, .. - }) if name.local_name == "creator" => { - for attr in attributes { - if attr.name.local_name == "file-as" { - return Some(attr.value); - } - if is_epub3 && attr.name.local_name == "id" { - creator_ids.push("#".to_owned() + attr.value.as_str()); - } - } - } - Ok(XmlEvent::StartElement { - name, attributes, .. - }) if name.local_name == "meta" => { - if attributes.iter().any(|attr| { - attr.name.local_name == "refines" && creator_ids.contains(&attr.value) - }) && attributes - .iter() - .any(|attr| attr.name.local_name == "property" && attr.value == "file-as") - { - refines_found = true; - curr_id = attributes - .iter() - .find(|a| a.name.local_name == "refines") - .unwrap() - .value - .clone(); - } else if attributes.iter().any(|attr| { - attr.name.local_name == "refines" && creator_ids.contains(&attr.value) - }) && attributes - .iter() - .any(|attr| attr.name.local_name == "property" && attr.value == "role") - { - role_found = true; - curr_id = attributes - .iter() - .find(|a| a.name.local_name == "refines") - .unwrap() - .value - .clone(); - } - } - Ok(XmlEvent::Characters(value)) => { - if role_found == true { - if value == "aut" { - let entry = refine_entries.entry(curr_id.clone()).or_insert(Refine { - role: "".to_string(), - file_as: "".to_string(), - }); - entry.role = value; - } - role_found = false; - } else if refines_found == true { - let entry = refine_entries.entry(curr_id.clone()).or_insert(Refine { - role: "".to_string(), - file_as: "".to_string(), - }); - entry.file_as = value; - refines_found = false; - } - } - Ok(XmlEvent::StartElement { .. }) => { - if refines_found == true { - refines_found = false; - } - } - Err(_e) => { - break; - } - _ => {} - } - } - - if refine_entries.len() == 1 { - return Some(refine_entries.values().next().unwrap().file_as.clone()); - } else if refine_entries.len() >= 2 { - return Some( - refine_entries - .values() - .into_iter() - .filter(|v| v.role == "aut") - .map(|v| v.file_as.clone()) - .collect::>() - .join(" & "), - ); - } - - None -} - -struct Creator { - role: String, - name: String, -} - -fn get_attribute_creator(opf: ZipFile) -> Option { - let parser = ParserConfig::new() - .trim_whitespace(true) - .ignore_comments(true) - .coalesce_characters(true) - .create_reader(opf); - - let mut is_epub3 = false; - let mut creator_found = true; - let mut creator_ids = Vec::new(); - let mut role_found = false; - let mut creator_entries = HashMap::new(); - let mut epub2_creator_entries = Vec::new(); - let mut curr_id = String::new(); - - for e in parser { - match e { - Ok(XmlEvent::StartElement { - name, attributes, .. - }) if name.local_name == "package" => { - for attr in attributes { - if attr.name.local_name == "version" { - if attr.value.starts_with("3") == true { - is_epub3 = true; - } - } - } - } - Ok(XmlEvent::StartElement { - name, attributes, .. - }) if name.local_name == "creator" => { - creator_found = true; - if !is_epub3 { - match attributes - .iter() - .find(|attr| attr.name.local_name == "role") - { - Some(attr) => { - epub2_creator_entries.push(Creator { - role: attr.value.clone(), - name: "".to_string(), - }); - } - None => { - epub2_creator_entries.push(Creator { - role: "aut".to_string(), - name: "".to_string(), - }); - } - } - } - for attr in attributes { - if is_epub3 && attr.name.local_name == "id" { - creator_ids.push("#".to_owned() + attr.value.as_str()); - //creator_entries.insert(attr.value.clone(), Creator{role: "".to_string(), name: "".to_string()}); - curr_id = "#".to_owned() + attr.value.as_str(); - } - } - } - Ok(XmlEvent::StartElement { - name, attributes, .. - }) if name.local_name == "meta" => { - if attributes.iter().any(|attr| { - attr.name.local_name == "refines" && creator_ids.contains(&attr.value) - }) && attributes - .iter() - .any(|attr| attr.name.local_name == "property" && attr.value == "role") - { - role_found = true; - curr_id = attributes - .iter() - .find(|a| a.name.local_name == "refines") - .unwrap() - .value - .clone(); - } - } - Ok(XmlEvent::Characters(value)) => { - if creator_found && is_epub3 == false { - epub2_creator_entries.last_mut().unwrap().name = value.clone(); - } else if creator_found && is_epub3 == true { - let entry = creator_entries.entry(curr_id.clone()).or_insert(Creator { - role: "".to_string(), - name: "".to_string(), - }); - entry.name = value; - creator_found = false; - } else if role_found == true { - let entry = creator_entries.entry(curr_id.clone()).or_insert(Creator { - role: "".to_string(), - name: "".to_string(), - }); - entry.role = value; - role_found = false; - } - } - Ok(XmlEvent::StartElement { .. }) => { - if creator_found == true { - creator_found = false; - } - } - Err(e) => { - println!("{}", e); - break; - } - _ => {} - } - } - - if !is_epub3 && epub2_creator_entries.len() >= 1 { - return Some( - epub2_creator_entries - .into_iter() - .filter(|v| v.role == "aut") - .map(|v| v.name.clone()) - .collect::>() - .join(", "), - ); - } else if creator_entries.len() >= 1 { - return Some( - creator_entries - .values() - .into_iter() - .filter(|v| v.role == "aut") - .map(|v| v.name.clone()) - .collect::>() - .join(", "), - ); - } - - None -} - -fn get_attribute_genre(opf: ZipFile) -> Option { - let parser = ParserConfig::new() - .trim_whitespace(true) - .ignore_comments(true) - .coalesce_characters(true) - .create_reader(opf); - - let mut genre_found = false; - - for e in parser { - match e { - Ok(XmlEvent::StartElement { name, .. }) if name.local_name == "subject" => { - genre_found = true; - } - Ok(XmlEvent::Characters(value)) => { - if genre_found { - return Some(value); - } - } - Ok(XmlEvent::StartElement { .. }) => { - if genre_found == true { - genre_found = false; - } - } - Err(e) => { - println!("{}", e); - break; - } - _ => {} - } - } - - None -} +use rusqlite::{named_params, Connection, Transaction, NO_PARAMS}; +use std::usize; struct BookEntry { id: i32, @@ -460,67 +130,70 @@ fn fix_db_entries(tx: &Transaction, book_entries: &Vec) -> Statistics continue; } - let file = File::open(entry.filepath.as_str()); - let file = match file { - Err(_) => continue, - Ok(file) => file, - }; + if let Some(epub_metadata) = epub::get_epub_metadata(&entry.filepath) { + let authors = epub_metadata + .authors + .iter() + .filter(|aut| aut.firstauthor.len() > 0) + .collect::>(); - let mut archive = ZipArchive::new(BufReader::new(file)).unwrap(); + // Fix firstauthor… + let firstauthors = authors + .iter() + .map(|aut| aut.firstauthor.clone()) + .collect::>(); + if !firstauthors.iter().all(|s| entry.firstauthor.contains(s)) { + let mut stmt = tx + .prepare("UPDATE books_impl SET firstauthor = :file_as WHERE id = :book_id") + .unwrap(); + stmt.execute_named( + named_params![":file_as": firstauthors.join(" & "), ":book_id": entry.id], + ) + .unwrap(); + stat.authors_fixed = stat.authors_fixed + 1; - let container = archive.by_name("META-INF/container.xml").unwrap(); - - if let Some(opf_file) = get_root_file(container).unwrap() { - let opf = archive.by_name(opf_file.as_str()).unwrap(); - // firstauthor… - if let Some(file_as) = get_attribute_file_as(opf) { - if !file_as.split(" & ").all(|s| entry.firstauthor.contains(s)) { - let mut stmt = tx - .prepare("UPDATE books_impl SET firstauthor = :file_as WHERE id = :book_id") - .unwrap(); - stmt.execute_named(named_params![":file_as": file_as, ":book_id": entry.id]) - .unwrap(); - stat.authors_fixed = stat.authors_fixed + 1; - } + println!("{}", firstauthors.join(" & ")); } - let opf = archive.by_name(opf_file.as_str()).unwrap(); - // author… - if let Some(creator) = get_attribute_creator(opf) { - if !creator.split(", ").all(|s| entry.author.contains(s)) - || creator.len() < entry.author.len() - { - let mut stmt = tx - .prepare("UPDATE books_impl SET author = :creator WHERE id = :book_id") - .unwrap(); - stmt.execute_named(named_params![":creator": creator, ":book_id": entry.id]) - .unwrap(); - stat.authors_fixed = stat.authors_fixed + 1; - } + + // Fix author names… + let authornames = authors + .iter() + .map(|aut| aut.name.clone()) + .collect::>(); + if !authornames.iter().all(|s| entry.author.contains(s)) { + let mut stmt = tx + .prepare("UPDATE books_impl SET author = :authors WHERE id = :book_id") + .unwrap(); + stmt.execute_named( + named_params![":authors": authornames.join(", "), ":book_id": entry.id], + ) + .unwrap(); + stat.authors_fixed = stat.authors_fixed + 1; + + println!("{}", authornames.join(" & ")); } - // genre… - if entry.genre.is_empty() { - let opf = archive.by_name(opf_file.as_str()).unwrap(); - if let Some(genre) = get_attribute_genre(opf) { - let mut stmt = tx - .prepare( - r#"INSERT INTO genres (name) SELECT :genre ON CONFLICT DO NOTHING"#, - ) - .unwrap(); - stmt.execute_named(named_params![":genre": &genre]).unwrap(); - let mut stmt = tx - .prepare( - r#" - INSERT INTO booktogenre (bookid, genreid) - VALUES (:bookid, - (SELECT id FROM genres WHERE name = :genre) - ) - ON CONFLICT DO NOTHING"#, - ) - .unwrap(); - stmt.execute_named(named_params![":bookid": &entry.id, ":genre": &genre]) - .unwrap(); - stat.genres_fixed = stat.genres_fixed + 1; - } + + if entry.genre.is_empty() && epub_metadata.genre.len() > 0 { + let mut stmt = tx + .prepare(r#"INSERT INTO genres (name) SELECT :genre ON CONFLICT DO NOTHING"#) + .unwrap(); + stmt.execute_named(named_params![":genre": &epub_metadata.genre]) + .unwrap(); + let mut stmt = tx + .prepare( + r#" + INSERT INTO booktogenre (bookid, genreid) + VALUES (:bookid, + (SELECT id FROM genres WHERE name = :genre) + ) + ON CONFLICT DO NOTHING"#, + ) + .unwrap(); + stmt.execute_named( + named_params![":bookid": &entry.id, ":genre": &epub_metadata.genre], + ) + .unwrap(); + stat.genres_fixed = stat.genres_fixed + 1; } } }