Introduce parallel processing of the candidate iterator with rayon
This commit is contained in:
		
							
								
								
									
										75
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										75
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -107,6 +107,37 @@ version = "1.0.4" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" | ||||
|  | ||||
| [[package]] | ||||
| name = "crossbeam-deque" | ||||
| version = "0.8.6" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" | ||||
| dependencies = [ | ||||
|  "crossbeam-epoch", | ||||
|  "crossbeam-utils", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "crossbeam-epoch" | ||||
| version = "0.9.18" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" | ||||
| dependencies = [ | ||||
|  "crossbeam-utils", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "crossbeam-utils" | ||||
| version = "0.8.21" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" | ||||
|  | ||||
| [[package]] | ||||
| name = "either" | ||||
| version = "1.15.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" | ||||
|  | ||||
| [[package]] | ||||
| name = "fuzzt" | ||||
| version = "0.3.1" | ||||
| @@ -119,6 +150,12 @@ version = "0.5.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" | ||||
|  | ||||
| [[package]] | ||||
| name = "hermit-abi" | ||||
| version = "0.5.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" | ||||
|  | ||||
| [[package]] | ||||
| name = "is_terminal_polyfill" | ||||
| version = "1.70.1" | ||||
| @@ -131,12 +168,28 @@ version = "1.0.5" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760" | ||||
|  | ||||
| [[package]] | ||||
| name = "libc" | ||||
| version = "0.2.174" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" | ||||
|  | ||||
| [[package]] | ||||
| name = "memchr" | ||||
| version = "2.7.5" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" | ||||
|  | ||||
| [[package]] | ||||
| name = "num_cpus" | ||||
| version = "1.17.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" | ||||
| dependencies = [ | ||||
|  "hermit-abi", | ||||
|  "libc", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "once_cell_polyfill" | ||||
| version = "1.70.1" | ||||
| @@ -161,6 +214,26 @@ dependencies = [ | ||||
|  "proc-macro2", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "rayon" | ||||
| version = "1.10.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" | ||||
| dependencies = [ | ||||
|  "either", | ||||
|  "rayon-core", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "rayon-core" | ||||
| version = "1.12.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" | ||||
| dependencies = [ | ||||
|  "crossbeam-deque", | ||||
|  "crossbeam-utils", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "regex" | ||||
| version = "1.11.1" | ||||
| @@ -214,6 +287,8 @@ dependencies = [ | ||||
|  "clap", | ||||
|  "fuzzt", | ||||
|  "levenshtein", | ||||
|  "num_cpus", | ||||
|  "rayon", | ||||
|  "regex", | ||||
| ] | ||||
|  | ||||
|   | ||||
| @@ -7,4 +7,6 @@ edition = "2021" | ||||
| clap = { version = "4.5.41", features = ["derive"] } | ||||
| fuzzt = "0.3.1" | ||||
| levenshtein = "1.0.5" | ||||
| num_cpus = "1.17.0" | ||||
| rayon = "1.10.0" | ||||
| regex = "1.11.1" | ||||
|   | ||||
							
								
								
									
										32
									
								
								output.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32
									
								
								output.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,32 @@ | ||||
| Vorrede,  | ||||
|  | ||||
| Ein schönes Wort jenes Wort des Propheten: Thuet  | ||||
| Salz darein!“  | ||||
|  | ||||
| Als zu dem Propheten Elisa die Männer von Jericho kam  | ||||
| amen und klagten, daß das Wasser der Stadt böse und das Land  | ||||
| unfruchtbar sey, sprach er: Bringet mir her eine neue Schaale  | ||||
| und thut Salz darein!“ und sie brachten es ihm Hz da ging  | ||||
| er hinaus zu der Wasserquelle und warf das Salz hinein und  | ||||
| machte sie mit dem Worte des Herrn gesund*).  | ||||
|  | ||||
| unser theurer Krummacher hat es am lebhaftesten gefühlt,  | ||||
| daß dieses prophetische runder symbolische Bedeutung hat und  | ||||
| geistlich zu allen Zeiten in der Gemeinde wiederholt werden muß.  | ||||
| wenigstens müssen wir beständig um die geistliche Erneuerung  | ||||
| desselben flehen. Wirt sehnen uns danach, wir erflehen sie. auch  | ||||
| unsere Brunnen sind abgestanden, faul, vergiftet und hauchen  | ||||
| den Tod aus, der nicht eine Stadt und Gegend, sondern eine  | ||||
| Welt zu verderben droht, und in unser Tagens schrecklicher als  | ||||
| jemals vorher wütet. Gene vergifteten Brunnen sind die bis-  | ||||
| sesshaften, die in schrecklicher Loßgebundenheit von allem  | ||||
| göttlichen mit ihren selbstgemachten Gesetzen das gesamte Unis-  | ||||
| versus zu umschließen sich anmaßen die Künste, die ihrem  | ||||
| ursprünglichen Beruf, Weissagerinnen zu seyn vom jenseits,  | ||||
| hohnlachend Chalet gegeben haben, um die Sünde mit dem Glanze  | ||||
| der Verklärung zu umwerben Hz; eine Theologie, die aus dem  | ||||
| Eignens redet, wie der Vater der Lügens, und die inwendig ca-  | ||||
| kanaanitisch gesinnt ficht, den Leviten AA Rob heuchlerisch) umgeworfen  | ||||
| hat; eine Philosophie, welche das Nichte- des Allee-  | ||||
|  | ||||
| Hegel übe. Kunst au. Re. AA  | ||||
							
								
								
									
										173
									
								
								src/lib.rs
									
									
									
									
									
								
							
							
						
						
									
										173
									
								
								src/lib.rs
									
									
									
									
									
								
							| @@ -1,12 +1,14 @@ | ||||
| pub mod utils { | ||||
|     use std::cell::OnceCell; | ||||
|     use std::cmp::Ordering; | ||||
|     use fuzzt::algorithms::{levenshtein, sorensen_dice}; | ||||
|     use regex::Regex; | ||||
|     use std::fs::File; | ||||
|     use std::io::{self, BufRead, Result, Write}; | ||||
|     use std::path::Path; | ||||
|     use std::process::exit; | ||||
|     use rayon::prelude::*; | ||||
|  | ||||
|     pub fn correct(word: String, list_path: String) -> String { | ||||
|     pub fn correct(word: String, list_path: &str) -> String { | ||||
|         let mut list_correct_words: Vec<String> = Vec::new(); | ||||
|         if let Ok(lines) = read_lines(list_path) { | ||||
|             lines.map_while(Result::ok).for_each(|line| { | ||||
| @@ -14,37 +16,79 @@ pub mod utils { | ||||
|             }); | ||||
|         } | ||||
|         let list_iter = list_correct_words.iter(); | ||||
|         let mut has_match: bool = false; | ||||
|         for correct_word in list_iter.clone() { | ||||
|             if word.eq(correct_word) || word.to_lowercase().eq(correct_word) { | ||||
|                 has_match = true; | ||||
|             } | ||||
|         } | ||||
|         if has_match { | ||||
|             return word; | ||||
|         } else { | ||||
|             let mut closest: &str = ""; | ||||
|             let mut closest_dist: usize = 10000; | ||||
|             let mut closest_dist_sorensen: f64 = 0.0f64; | ||||
|             for correct_word in list_iter { | ||||
|                 //println!("Checking {}, dist: {}, current closest: {}, (damerau: {}, Sørensen-Dice: {}, Jaro-Winkler: {})", correct_word.as_str(), levenshtein(word.as_str(), correct_word.as_str()), closest_dist, damerau_levenshtein(word.as_str(), correct_word.as_str()), sorensen_dice(word.as_str(), correct_word.as_str()), jaro_winkler(word.as_str(), correct_word.as_str())); | ||||
|                 let dist = levenshtein(word.as_str(), correct_word.as_str()); | ||||
|                 if dist < closest_dist { | ||||
|         let mut closest: &str = ""; | ||||
|         let mut closest_dist: usize = 10000; | ||||
|         let mut closest_dist_sorensen: f64 = 0.0f64; | ||||
|         for correct_word in list_iter { | ||||
|             let dist = levenshtein(word.as_str(), correct_word.as_str()); | ||||
|             if dist < closest_dist { | ||||
|                 closest_dist = dist; | ||||
|                 closest = correct_word; | ||||
|                 closest_dist_sorensen = sorensen_dice(word.as_str(), correct_word.as_str()); | ||||
|             } else if dist == closest_dist { | ||||
|                 if sorensen_dice(word.as_str(), correct_word.as_str()) > closest_dist_sorensen { | ||||
|                     closest_dist = dist; | ||||
|                     closest = correct_word; | ||||
|                     closest_dist_sorensen = sorensen_dice(word.as_str(), correct_word.as_str()); | ||||
|                 } else if dist == closest_dist { | ||||
|                     if sorensen_dice(word.as_str(), correct_word.as_str()) > closest_dist_sorensen { | ||||
|                         closest_dist = dist; | ||||
|                         closest = correct_word; | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|         if closest_dist == 0 { | ||||
|             return String::from(word); | ||||
|         } else { | ||||
|             return String::from(closest); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn correct_file(file_path: String, list_path: String, output_path: String) { | ||||
|      | ||||
|     /// Distance metrics between a `target` word and one `candidate` word. | ||||
|     /// | ||||
|     /// The Levenshtein distance is computed eagerly in [`Distances::new`]; the | ||||
|     /// Sørensen-Dice score is computed lazily and cached, because it is only | ||||
|     /// needed to break ties between equal Levenshtein distances. | ||||
|     #[derive(Debug)] | ||||
|     struct Distances<'a> { | ||||
|         /// Word being corrected. | ||||
|         target: &'a str, | ||||
|         /// Entry that `target` is compared against. | ||||
|         candidate: &'a str, | ||||
|  | ||||
|         /// Levenshtein distance between `target` and `candidate`. | ||||
|         levenshtein: usize, | ||||
|         /// Cached Sørensen-Dice score; empty until first requested. | ||||
|         sorensen: OnceCell<f64>, | ||||
|     } | ||||
|  | ||||
|     impl<'a> Distances<'a> { | ||||
|         /// Pairs `target` with `candidate` and computes their Levenshtein distance. | ||||
|         fn new(target: &'a str, candidate: &'a str) -> Self { | ||||
|             Self { | ||||
|                 target, | ||||
|                 candidate, | ||||
|                 levenshtein: levenshtein(target, candidate), | ||||
|                 sorensen: OnceCell::new(), | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         /// Returns the Sørensen-Dice score, computing and caching it on first use. | ||||
|         fn calc_sorensen(&self) -> f64 { | ||||
|             *self | ||||
|                 .sorensen | ||||
|                 .get_or_init(|| sorensen_dice(self.target, self.candidate)) | ||||
|         } | ||||
|  | ||||
|         /// Orders two candidates for the same target, best match first. | ||||
|         /// | ||||
|         /// A smaller Levenshtein distance ranks first; on a tie, the higher | ||||
|         /// Sørensen-Dice score ranks first. | ||||
|         fn cmp(&self, other: &Self) -> Ordering { | ||||
|             // Internal invariant only; checked in debug builds so the hot | ||||
|             // comparison path does no extra string comparison in release. | ||||
|             debug_assert_eq!(self.target, other.target); | ||||
|             self.levenshtein | ||||
|                 .cmp(&other.levenshtein) | ||||
|                 // Operands intentionally swapped: a higher score must sort first. | ||||
|                 .then_with(|| other.calc_sorensen().total_cmp(&self.calc_sorensen())) | ||||
|         } | ||||
|     } | ||||
|     /// Scores every entry of `candidates` against `target` in parallel and | ||||
|     /// returns the best-ranked [`Distances`] according to [`Distances::cmp`]. | ||||
|     /// | ||||
|     /// Returns `None` when `candidates` is empty. | ||||
|     fn find<'a>(target: &'a str, candidates: &'a [String]) -> Option<Distances<'a>> { | ||||
|         // Swap `par_iter` for `iter` to run the same search on a single thread. | ||||
|         let scored = candidates | ||||
|             .par_iter() | ||||
|             .map(|entry| Distances::new(target, entry)); | ||||
|         scored.min_by(|left, right| Distances::cmp(left, right)) | ||||
|     } | ||||
|      | ||||
|     /// Returns the entry of `word_list` closest to `word`, searched in parallel. | ||||
|     /// | ||||
|     /// `word` is returned unchanged when it is already contained in the list | ||||
|     /// (Levenshtein distance 0) or when `word_list` is empty, so an unreadable | ||||
|     /// or empty word list no longer causes a panic. | ||||
|     pub fn correct_concurr(word: String, word_list: &[String]) -> String { | ||||
|         // Materialise the replacement as an owned `String` first so that the | ||||
|         // borrow of `word` has ended before `word` is moved out as the fallback. | ||||
|         let replacement = find(&word, word_list) | ||||
|             .filter(|best| best.levenshtein != 0) | ||||
|             .map(|best| String::from(best.candidate)); | ||||
|         replacement.unwrap_or(word) | ||||
|     } | ||||
|      | ||||
|     pub fn correct_file(file_path: String, list_path: &str, output_path: String) { | ||||
|         let re: Regex = Regex::new(r"[\w]+[\W]+").unwrap(); | ||||
|         let re2: Regex = Regex::new(r"(?P<specialfront>[\W&&[^\s]&&[^\n]]*)(?P<text>[\w]+)(?P<specialback>[\W&&[^\s]&&[^\n]]*)").unwrap(); | ||||
|         let mut input_file_lines: Vec<String> = Vec::new(); | ||||
| @@ -53,6 +97,7 @@ pub mod utils { | ||||
|                 input_file_lines.push(line); | ||||
|             }); | ||||
|         } | ||||
|          | ||||
|         let mut input_words_by_line: Vec<Vec<String>> = Vec::new(); | ||||
|         for line in input_file_lines { | ||||
|             let line_iter = line.split_whitespace(); | ||||
| @@ -87,12 +132,12 @@ pub mod utils { | ||||
|                     let mut out_str: String = String::from(""); | ||||
|                     out_str = out_str | ||||
|                         + special_chars_front | ||||
|                         + correct(String::from(text), list_path.clone()).as_str() | ||||
|                         + correct(String::from(text), list_path).as_str() | ||||
|                         + special_chars_back; | ||||
|                     output.push_str(out_str.as_str()); | ||||
|                     output.push_str(" "); | ||||
|                 } else { | ||||
|                     output.push_str(correct(String::from(word), list_path.clone()).as_str()); | ||||
|                     output.push_str(correct(String::from(word), list_path).as_str()); | ||||
|                     output.push_str(" "); | ||||
|                 } | ||||
|             } | ||||
| @@ -103,7 +148,77 @@ pub mod utils { | ||||
|             Err(e) => panic!("Cannot write output file: {}", e), | ||||
|         }; | ||||
|         match write!(output_file, "{}", output) { | ||||
|             Ok(_) => exit(1), | ||||
|             Ok(_) => (), | ||||
|             Err(e) => panic!("Cannot write output file: {}", e), | ||||
|         }; | ||||
|     } | ||||
|      | ||||
|     /// Corrects every word of the file at `file_path` and writes the result to | ||||
|     /// a newly created file at `output_path`. | ||||
|     /// | ||||
|     /// The word list at `list_path` is loaded once and every word is looked up | ||||
|     /// with [`correct_concurr`]. A word that is followed by non-word characters | ||||
|     /// is split into leading specials, text and trailing specials, and only the | ||||
|     /// text is corrected. The long s (`ſ`) is replaced by `s` beforehand. Words | ||||
|     /// are written separated by a single space, one output line per input line. | ||||
|     /// | ||||
|     /// An input file or word list that cannot be read is treated as empty. | ||||
|     /// | ||||
|     /// # Panics | ||||
|     /// | ||||
|     /// Panics if the output file already exists, cannot be created, or cannot | ||||
|     /// be written. | ||||
|     pub fn correct_file_concurr(file_path: String, list_path: &str, output_path: String) { | ||||
|         let re: Regex = Regex::new(r"[\w]+[\W]+").unwrap(); | ||||
|         let re2: Regex = Regex::new(r"(?P<specialfront>[\W&&[^\s]&&[^\n]]*)(?P<text>[\w]+)(?P<specialback>[\W&&[^\s]&&[^\n]]*)").unwrap(); | ||||
|  | ||||
|         // Pass 1: read the input and normalise each whitespace-separated token. | ||||
|         let mut input_words_by_line: Vec<Vec<String>> = Vec::new(); | ||||
|         if let Ok(lines) = read_lines(file_path) { | ||||
|             for line in lines.map_while(Result::ok) { | ||||
|                 let words: Vec<String> = line | ||||
|                     .split_whitespace() | ||||
|                     .map(|token| { | ||||
|                         if re.is_match(token) { | ||||
|                             token | ||||
|                                 .replace(&['(', '/', '\"', '\\', '<', '>', '»'], "") | ||||
|                                 .replace("ſ", "s") | ||||
|                         } else { | ||||
|                             token.replace("ſ", "s") | ||||
|                         } | ||||
|                     }) | ||||
|                     .collect(); | ||||
|                 input_words_by_line.push(words); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // Load the word list once; it is shared by every lookup below. | ||||
|         let mut list_correct_words: Vec<String> = Vec::new(); | ||||
|         if let Ok(lines) = read_lines(list_path) { | ||||
|             list_correct_words.extend(lines.map_while(Result::ok)); | ||||
|         } | ||||
|  | ||||
|         // Pass 2: correct each token and assemble the output text. | ||||
|         let mut output: String = String::new(); | ||||
|         for words in &input_words_by_line { | ||||
|             for token in words { | ||||
|                 if re.is_match(token) { | ||||
|                     let captures = re2.captures(token).unwrap(); | ||||
|                     let corrected = | ||||
|                         correct_concurr(String::from(&captures["text"]), &list_correct_words); | ||||
|                     output.push_str(&captures["specialfront"]); | ||||
|                     output.push_str(&corrected); | ||||
|                     output.push_str(&captures["specialback"]); | ||||
|                 } else { | ||||
|                     output.push_str(&correct_concurr(token.clone(), &list_correct_words)); | ||||
|                 } | ||||
|                 output.push(' '); | ||||
|             } | ||||
|             output.push('\n'); | ||||
|         } | ||||
|  | ||||
|         let mut output_file: File = File::create_new(output_path) | ||||
|             .unwrap_or_else(|e| panic!("Cannot write output file: {}", e)); | ||||
|         if let Err(e) = write!(output_file, "{}", output) { | ||||
|             panic!("Cannot write output file: {}", e); | ||||
|         } | ||||
|     } | ||||
|   | ||||
							
								
								
									
										33
									
								
								src/main.rs
									
									
									
									
									
								
							
							
						
						
									
										33
									
								
								src/main.rs
									
									
									
									
									
								
							| @@ -1,5 +1,7 @@ | ||||
| use clap::{Args, Parser, Subcommand}; | ||||
| use text_correction::utils; | ||||
| use std::time::SystemTime; | ||||
| use std::process::*; | ||||
|  | ||||
| #[derive(Parser)] | ||||
| #[command(name = "german word corrector")] | ||||
| @@ -13,7 +15,8 @@ struct Cli { | ||||
| #[derive(Subcommand)] | ||||
| enum Commands { | ||||
|     CorrectWord(WordArgs), | ||||
|     CorrectFile(FileArgs) | ||||
|     CorrectFile(FileArgs), | ||||
|     BenchFile(FileArgs) | ||||
| } | ||||
|  | ||||
| #[derive(Args)] | ||||
| @@ -39,11 +42,35 @@ fn main() { | ||||
|      | ||||
|     match &cli.command { | ||||
|         Commands::CorrectWord(args) => { | ||||
|             let out: String = utils::correct(args.input.clone(), args.list_path.clone()); | ||||
|             let out: String = utils::correct(args.input.clone(), args.list_path.as_str()); | ||||
|             println!("{}", out); | ||||
|         }, | ||||
|         Commands::CorrectFile(args) => { | ||||
|             utils::correct_file(args.input.clone(), args.list_path.clone(), args.output.clone()) | ||||
|             utils::correct_file_concurr(args.input.clone(), args.list_path.as_str(), args.output.clone()) | ||||
|         }, | ||||
|         Commands::BenchFile(args) => { | ||||
|             let start_par = SystemTime::now(); | ||||
|             utils::correct_file_concurr(args.input.clone(), args.list_path.as_str(), args.output.clone()); | ||||
|             let stop_par = match start_par.elapsed() { | ||||
|                 Ok(elapsed) => elapsed.as_millis(), | ||||
|                 Err(e) => { | ||||
|                     println!("Error: {e:?}"); | ||||
|                     exit(1); | ||||
|                 } | ||||
|             }; | ||||
|             println!("Parallel processing took: {stop_par:?} ms"); | ||||
|             std::fs::remove_file(args.output.clone()).unwrap(); | ||||
|             let start = SystemTime::now(); | ||||
|             utils::correct_file(args.input.clone(), args.list_path.as_str(), args.output.clone()); | ||||
|             let stop = match start.elapsed() { | ||||
|                 Ok(elapsed) => elapsed.as_millis(), | ||||
|                 Err(e) => { | ||||
|                     println!("Error: {e:?}"); | ||||
|                     exit(1); | ||||
|                 } | ||||
|             }; | ||||
|             println!("Single-thread processing took: {stop:?} ms"); | ||||
|         } | ||||
|          | ||||
|     } | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user