diff --git a/Cargo.lock b/Cargo.lock index 9dc1fcb..4d0a957 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -107,6 +107,37 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "fuzzt" version = "0.3.1" @@ -119,6 +150,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -131,12 +168,28 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760" +[[package]] +name = "libc" +version = "0.2.174" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" + [[package]] name = "memchr" version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "once_cell_polyfill" version = "1.70.1" @@ -161,6 +214,26 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "regex" version = "1.11.1" @@ -214,6 +287,8 @@ dependencies = [ "clap", "fuzzt", "levenshtein", + "num_cpus", + "rayon", "regex", ] diff --git a/Cargo.toml b/Cargo.toml index 68ab315..29ac387 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,4 +7,6 @@ edition = "2021" clap = { version = "4.5.41", features = ["derive"] } fuzzt = "0.3.1" levenshtein = "1.0.5" +num_cpus = "1.17.0" +rayon = "1.10.0" regex = "1.11.1" diff --git a/output.txt b/output.txt new file mode 100644 index 0000000..53044fc --- /dev/null +++ b/output.txt @@ -0,0 +1,32 @@ +Vorrede, + +Ein schönes Wort jenes Wort des Propheten: Thuet +Salz darein!“ + +Als zu dem Propheten Elisa die Männer von Jericho kam +amen und klagten, daß das Wasser der Stadt böse und das Land +unfruchtbar sey, sprach er: Bringet mir her eine neue Schaale +und thut Salz darein!“ und sie brachten es ihm Hz da ging +er hinaus zu der Wasserquelle und warf das Salz hinein und +machte sie mit dem Worte des Herrn gesund*). + +unser theurer Krummacher hat es am lebhaftesten gefühlt, +daß dieses prophetische runder symbolische Bedeutung hat und +geistlich zu allen Zeiten in der Gemeinde wiederholt werden muß. +wenigstens müssen wir beständig um die geistliche Erneuerung +desselben flehen. Wirt sehnen uns danach, wir erflehen sie. auch +unsere Brunnen sind abgestanden, faul, vergiftet und hauchen +den Tod aus, der nicht eine Stadt und Gegend, sondern eine +Welt zu verderben droht, und in unser Tagens schrecklicher als +jemals vorher wütet. Gene vergifteten Brunnen sind die bis- +sesshaften, die in schrecklicher Loßgebundenheit von allem +göttlichen mit ihren selbstgemachten Gesetzen das gesamte Unis- +versus zu umschließen sich anmaßen die Künste, die ihrem +ursprünglichen Beruf, Weissagerinnen zu seyn vom jenseits, +hohnlachend Chalet gegeben haben, um die Sünde mit dem Glanze +der Verklärung zu umwerben Hz; eine Theologie, die aus dem +Eignens redet, wie der Vater der Lügens, und die inwendig ca- +kanaanitisch gesinnt ficht, den Leviten AA Rob heuchlerisch) umgeworfen +hat; eine Philosophie, welche das Nichte- des Allee- + +Hegel übe. Kunst au. Re. AA diff --git a/src/lib.rs b/src/lib.rs index d1902bc..638794c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,12 +1,14 @@ pub mod utils { + use std::cell::OnceCell; + use std::cmp::Ordering; use fuzzt::algorithms::{levenshtein, sorensen_dice}; use regex::Regex; use std::fs::File; use std::io::{self, BufRead, Result, Write}; use std::path::Path; - use std::process::exit; + use rayon::prelude::*; - pub fn correct(word: String, list_path: String) -> String { + pub fn correct(word: String, list_path: &str) -> String { let mut list_correct_words: Vec = Vec::new(); if let Ok(lines) = read_lines(list_path) { lines.map_while(Result::ok).for_each(|line| { @@ -14,37 +16,79 @@ pub mod utils { }); } let list_iter = list_correct_words.iter(); - let mut has_match: bool = false; - for correct_word in list_iter.clone() { - if word.eq(correct_word) || word.to_lowercase().eq(correct_word) { - has_match = true; - } - } - if has_match { - return word; - } else { - let mut closest: &str = ""; - let mut closest_dist: usize = 10000; - let mut closest_dist_sorensen: f64 = 0.0f64; - for correct_word in list_iter { - //println!("Checking {}, dist: {}, current closest: {}, (damerau: {}, Sørensen-Dice: {}, Jaro-Winkler: {})", correct_word.as_str(), levenshtein(word.as_str(), correct_word.as_str()), closest_dist, damerau_levenshtein(word.as_str(), correct_word.as_str()), sorensen_dice(word.as_str(), correct_word.as_str()), jaro_winkler(word.as_str(), correct_word.as_str())); - let dist = levenshtein(word.as_str(), correct_word.as_str()); - if dist < closest_dist { + let mut closest: &str = ""; + let mut closest_dist: usize = 10000; + let mut closest_dist_sorensen: f64 = 0.0f64; + for correct_word in list_iter { + let dist = levenshtein(word.as_str(), correct_word.as_str()); + if dist < closest_dist { + closest_dist = dist; + closest = correct_word; + closest_dist_sorensen = sorensen_dice(word.as_str(), correct_word.as_str()); + } else if dist == closest_dist { + if sorensen_dice(word.as_str(), correct_word.as_str()) > closest_dist_sorensen { closest_dist = dist; closest = correct_word; - closest_dist_sorensen = sorensen_dice(word.as_str(), correct_word.as_str()); - } else if dist == closest_dist { - if sorensen_dice(word.as_str(), correct_word.as_str()) > closest_dist_sorensen { - closest_dist = dist; - closest = correct_word; - } } } + } + if closest_dist == 0 { + return String::from(word); + } else { return String::from(closest); } } - - pub fn correct_file(file_path: String, list_path: String, output_path: String) { + + struct Distances<'a> { + target: &'a str, + candidate: &'a str, + + levenshtein: usize, + sorensen: OnceCell, + } + impl<'a> Distances<'a> { + fn new(target: &'a str, candidate: &'a str) -> Self { + Self { + target, + candidate, + levenshtein: levenshtein(target, candidate), + sorensen: OnceCell::new(), + } + } + fn calc_sorensen(&self) -> &f64 { + self.sorensen + .get_or_init(|| sorensen_dice(self.target, self.candidate)) + } + fn cmp(&self, other: &Self) -> Ordering { + assert_eq!(self.target, other.target); + match usize::cmp(&self.levenshtein, &other.levenshtein) { + Ordering::Less => Ordering::Less, + Ordering::Equal => { + // intentionally reverse order here + f64::total_cmp(other.calc_sorensen(), self.calc_sorensen()) + } + Ordering::Greater => Ordering::Greater, + } + } + } + fn find<'a>(target: &'a str, candidates: &'a [String]) -> Option> { + candidates + .par_iter() // or .iter() for single-threaded + .map(|candidate| Distances::new(target, candidate)) + .min_by(Distances::cmp) + } + + pub fn correct_concurr(word: String, word_list: &[String]) -> String { + let distances = find(&word,word_list).unwrap(); + if distances.levenshtein == 0 { + return String::from(word); + } else { + return String::from(distances.candidate); + } + + } + + pub fn correct_file(file_path: String, list_path: &str, output_path: String) { let re: Regex = Regex::new(r"[\w]+[\W]+").unwrap(); let re2: Regex = Regex::new(r"(?P[\W&&[^\s]&&[^\n]]*)(?P[\w]+)(?P[\W&&[^\s]&&[^\n]]*)").unwrap(); let mut input_file_lines: Vec = Vec::new(); @@ -53,6 +97,7 @@ pub mod utils { input_file_lines.push(line); }); } + let mut input_words_by_line: Vec> = Vec::new(); for line in input_file_lines { let line_iter = line.split_whitespace(); @@ -87,12 +132,12 @@ pub mod utils { let mut out_str: String = String::from(""); out_str = out_str + special_chars_front - + correct(String::from(text), list_path.clone()).as_str() + + correct(String::from(text), list_path).as_str() + special_chars_back; output.push_str(out_str.as_str()); output.push_str(" "); } else { - output.push_str(correct(String::from(word), list_path.clone()).as_str()); + output.push_str(correct(String::from(word), list_path).as_str()); output.push_str(" "); } } @@ -103,7 +148,77 @@ pub mod utils { Err(e) => panic!("Cannot write output file: {}", e), }; match write!(output_file, "{}", output) { - Ok(_) => exit(1), + Ok(_) => (), + Err(e) => panic!("Cannot write output file: {}", e), + }; + } + + pub fn correct_file_concurr(file_path: String, list_path: &str, output_path: String) { + let re: Regex = Regex::new(r"[\w]+[\W]+").unwrap(); + let re2: Regex = Regex::new(r"(?P[\W&&[^\s]&&[^\n]]*)(?P[\w]+)(?P[\W&&[^\s]&&[^\n]]*)").unwrap(); + let mut input_file_lines: Vec = Vec::new(); + if let Ok(lines) = read_lines(file_path) { + lines.map_while(Result::ok).for_each(|line| { + input_file_lines.push(line); + }); + } + let mut input_words_by_line: Vec> = Vec::new(); + for line in input_file_lines { + let line_iter = line.split_whitespace(); + let mut words: Vec = Vec::new(); + for word in line_iter { + if re.is_match(word) { + let mut word_buffer: String; + word_buffer = + word.replace(&['(', '/', '\"', '\\', '<', '>', '»'], ""); + word_buffer = word_buffer.replace("ſ", "s"); + words.push(word_buffer); + } else { + let push_string = word.replace("ſ", "s"); + words.push(String::from(push_string)); + } + } + input_words_by_line.push(words); + } + let mut list_correct_words: Vec = Vec::new(); + if let Ok(lines) = read_lines(list_path) { + lines.map_while(Result::ok).for_each(|line| { + list_correct_words.push(line); + }); + } + let mut output: String = String::new(); + let input_line_iter = input_words_by_line.iter(); + for line in input_line_iter { + let input_word_iter = line.iter(); + for word in input_word_iter { + let special_chars_front: &str; + let special_chars_back: &str; + let text: &str; + if re.is_match(&word) { + let captures = re2.captures(&word).unwrap(); + special_chars_front = &captures["specialfront"]; + special_chars_back = &captures["specialback"]; + text = &captures["text"]; + let mut out_str: String = String::from(""); + out_str = out_str + + special_chars_front + + correct_concurr(String::from(text), &list_correct_words).as_str() + + special_chars_back; + output.push_str(out_str.as_str()); + output.push_str(" "); + } else { + output.push_str(correct_concurr(String::from(word), &list_correct_words).as_str()); + output.push_str(" "); + } + } + output.push('\n'); + } + let mut output_file: File = match File::create_new(output_path) { + Ok(f) => f, + Err(e) => panic!("Cannot write output file: {}", e), + }; + match write!(output_file, "{}", output) { + Ok(_) => (), Err(e) => panic!("Cannot write output file: {}", e), }; } diff --git a/src/main.rs b/src/main.rs index a82cac2..e151a3f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,7 @@ use clap::{Args, Parser, Subcommand}; use text_correction::utils; +use std::time::SystemTime; +use std::process::*; #[derive(Parser)] #[command(name = "german word corrector")] @@ -13,7 +15,8 @@ struct Cli { #[derive(Subcommand)] enum Commands { CorrectWord(WordArgs), - CorrectFile(FileArgs) + CorrectFile(FileArgs), + BenchFile(FileArgs) } #[derive(Args)] @@ -39,11 +42,35 @@ fn main() { match &cli.command { Commands::CorrectWord(args) => { - let out: String = utils::correct(args.input.clone(), args.list_path.clone()); + let out: String = utils::correct(args.input.clone(), args.list_path.as_str()); println!("{}", out); }, Commands::CorrectFile(args) => { - utils::correct_file(args.input.clone(), args.list_path.clone(), args.output.clone()) + utils::correct_file_concurr(args.input.clone(), args.list_path.as_str(), args.output.clone()) + }, + Commands::BenchFile(args) => { + let start_par = SystemTime::now(); + utils::correct_file_concurr(args.input.clone(), args.list_path.as_str(), args.output.clone()); + let stop_par = match start_par.elapsed() { + Ok(elapsed) => elapsed.as_millis(), + Err(e) => { + println!("Error: {e:?}"); + exit(1); + } + }; + println!("Parallel processing took: {stop_par:?} ms"); + std::fs::remove_file(args.output.clone()).unwrap(); + let start = SystemTime::now(); + utils::correct_file(args.input.clone(), args.list_path.as_str(), args.output.clone()); + let stop = match start.elapsed() { + Ok(elapsed) => elapsed.as_millis(), + Err(e) => { + println!("Error: {e:?}"); + exit(1); + } + }; + println!("Single-thread processing took: {stop:?} ms"); } + } }