introduced parallel processing of iterator with rayon
This commit is contained in:
75
Cargo.lock
generated
75
Cargo.lock
generated
@@ -107,6 +107,37 @@ version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
||||
dependencies = [
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
|
||||
|
||||
[[package]]
|
||||
name = "fuzzt"
|
||||
version = "0.3.1"
|
||||
@@ -119,6 +150,12 @@ version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
|
||||
|
||||
[[package]]
|
||||
name = "is_terminal_polyfill"
|
||||
version = "1.70.1"
|
||||
@@ -131,12 +168,28 @@ version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.174"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"
|
||||
|
||||
[[package]]
|
||||
name = "num_cpus"
|
||||
version = "1.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell_polyfill"
|
||||
version = "1.70.1"
|
||||
@@ -161,6 +214,26 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
|
||||
dependencies = [
|
||||
"either",
|
||||
"rayon-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon-core"
|
||||
version = "1.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
|
||||
dependencies = [
|
||||
"crossbeam-deque",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.11.1"
|
||||
@@ -214,6 +287,8 @@ dependencies = [
|
||||
"clap",
|
||||
"fuzzt",
|
||||
"levenshtein",
|
||||
"num_cpus",
|
||||
"rayon",
|
||||
"regex",
|
||||
]
|
||||
|
||||
|
@@ -7,4 +7,6 @@ edition = "2021"
|
||||
clap = { version = "4.5.41", features = ["derive"] }
|
||||
fuzzt = "0.3.1"
|
||||
levenshtein = "1.0.5"
|
||||
num_cpus = "1.17.0"
|
||||
rayon = "1.10.0"
|
||||
regex = "1.11.1"
|
||||
|
32
output.txt
Normal file
32
output.txt
Normal file
@@ -0,0 +1,32 @@
|
||||
Vorrede,
|
||||
|
||||
Ein schönes Wort jenes Wort des Propheten: Thuet
|
||||
Salz darein!“
|
||||
|
||||
Als zu dem Propheten Elisa die Männer von Jericho kam
|
||||
amen und klagten, daß das Wasser der Stadt böse und das Land
|
||||
unfruchtbar sey, sprach er: Bringet mir her eine neue Schaale
|
||||
und thut Salz darein!“ und sie brachten es ihm Hz da ging
|
||||
er hinaus zu der Wasserquelle und warf das Salz hinein und
|
||||
machte sie mit dem Worte des Herrn gesund*).
|
||||
|
||||
unser theurer Krummacher hat es am lebhaftesten gefühlt,
|
||||
daß dieses prophetische runder symbolische Bedeutung hat und
|
||||
geistlich zu allen Zeiten in der Gemeinde wiederholt werden muß.
|
||||
wenigstens müssen wir beständig um die geistliche Erneuerung
|
||||
desselben flehen. Wirt sehnen uns danach, wir erflehen sie. auch
|
||||
unsere Brunnen sind abgestanden, faul, vergiftet und hauchen
|
||||
den Tod aus, der nicht eine Stadt und Gegend, sondern eine
|
||||
Welt zu verderben droht, und in unser Tagens schrecklicher als
|
||||
jemals vorher wütet. Gene vergifteten Brunnen sind die bis-
|
||||
sesshaften, die in schrecklicher Loßgebundenheit von allem
|
||||
göttlichen mit ihren selbstgemachten Gesetzen das gesamte Unis-
|
||||
versus zu umschließen sich anmaßen die Künste, die ihrem
|
||||
ursprünglichen Beruf, Weissagerinnen zu seyn vom jenseits,
|
||||
hohnlachend Chalet gegeben haben, um die Sünde mit dem Glanze
|
||||
der Verklärung zu umwerben Hz; eine Theologie, die aus dem
|
||||
Eignens redet, wie der Vater der Lügens, und die inwendig ca-
|
||||
kanaanitisch gesinnt ficht, den Leviten AA Rob heuchlerisch) umgeworfen
|
||||
hat; eine Philosophie, welche das Nichte- des Allee-
|
||||
|
||||
Hegel übe. Kunst au. Re. AA
|
173
src/lib.rs
173
src/lib.rs
@@ -1,12 +1,14 @@
|
||||
pub mod utils {
|
||||
use std::cell::OnceCell;
|
||||
use std::cmp::Ordering;
|
||||
use fuzzt::algorithms::{levenshtein, sorensen_dice};
|
||||
use regex::Regex;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufRead, Result, Write};
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use rayon::prelude::*;
|
||||
|
||||
pub fn correct(word: String, list_path: String) -> String {
|
||||
pub fn correct(word: String, list_path: &str) -> String {
|
||||
let mut list_correct_words: Vec<String> = Vec::new();
|
||||
if let Ok(lines) = read_lines(list_path) {
|
||||
lines.map_while(Result::ok).for_each(|line| {
|
||||
@@ -14,37 +16,79 @@ pub mod utils {
|
||||
});
|
||||
}
|
||||
let list_iter = list_correct_words.iter();
|
||||
let mut has_match: bool = false;
|
||||
for correct_word in list_iter.clone() {
|
||||
if word.eq(correct_word) || word.to_lowercase().eq(correct_word) {
|
||||
has_match = true;
|
||||
}
|
||||
}
|
||||
if has_match {
|
||||
return word;
|
||||
} else {
|
||||
let mut closest: &str = "";
|
||||
let mut closest_dist: usize = 10000;
|
||||
let mut closest_dist_sorensen: f64 = 0.0f64;
|
||||
for correct_word in list_iter {
|
||||
//println!("Checking {}, dist: {}, current closest: {}, (damerau: {}, Sørensen-Dice: {}, Jaro-Winkler: {})", correct_word.as_str(), levenshtein(word.as_str(), correct_word.as_str()), closest_dist, damerau_levenshtein(word.as_str(), correct_word.as_str()), sorensen_dice(word.as_str(), correct_word.as_str()), jaro_winkler(word.as_str(), correct_word.as_str()));
|
||||
let dist = levenshtein(word.as_str(), correct_word.as_str());
|
||||
if dist < closest_dist {
|
||||
let mut closest: &str = "";
|
||||
let mut closest_dist: usize = 10000;
|
||||
let mut closest_dist_sorensen: f64 = 0.0f64;
|
||||
for correct_word in list_iter {
|
||||
let dist = levenshtein(word.as_str(), correct_word.as_str());
|
||||
if dist < closest_dist {
|
||||
closest_dist = dist;
|
||||
closest = correct_word;
|
||||
closest_dist_sorensen = sorensen_dice(word.as_str(), correct_word.as_str());
|
||||
} else if dist == closest_dist {
|
||||
if sorensen_dice(word.as_str(), correct_word.as_str()) > closest_dist_sorensen {
|
||||
closest_dist = dist;
|
||||
closest = correct_word;
|
||||
closest_dist_sorensen = sorensen_dice(word.as_str(), correct_word.as_str());
|
||||
} else if dist == closest_dist {
|
||||
if sorensen_dice(word.as_str(), correct_word.as_str()) > closest_dist_sorensen {
|
||||
closest_dist = dist;
|
||||
closest = correct_word;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if closest_dist == 0 {
|
||||
return String::from(word);
|
||||
} else {
|
||||
return String::from(closest);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn correct_file(file_path: String, list_path: String, output_path: String) {
|
||||
|
||||
struct Distances<'a> {
|
||||
target: &'a str,
|
||||
candidate: &'a str,
|
||||
|
||||
levenshtein: usize,
|
||||
sorensen: OnceCell<f64>,
|
||||
}
|
||||
impl<'a> Distances<'a> {
|
||||
fn new(target: &'a str, candidate: &'a str) -> Self {
|
||||
Self {
|
||||
target,
|
||||
candidate,
|
||||
levenshtein: levenshtein(target, candidate),
|
||||
sorensen: OnceCell::new(),
|
||||
}
|
||||
}
|
||||
fn calc_sorensen(&self) -> &f64 {
|
||||
self.sorensen
|
||||
.get_or_init(|| sorensen_dice(self.target, self.candidate))
|
||||
}
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
assert_eq!(self.target, other.target);
|
||||
match usize::cmp(&self.levenshtein, &other.levenshtein) {
|
||||
Ordering::Less => Ordering::Less,
|
||||
Ordering::Equal => {
|
||||
// intentionally reverse order here
|
||||
f64::total_cmp(other.calc_sorensen(), self.calc_sorensen())
|
||||
}
|
||||
Ordering::Greater => Ordering::Greater,
|
||||
}
|
||||
}
|
||||
}
|
||||
fn find<'a>(target: &'a str, candidates: &'a [String]) -> Option<Distances<'a>> {
|
||||
candidates
|
||||
.par_iter() // or .iter() for single-threaded
|
||||
.map(|candidate| Distances::new(target, candidate))
|
||||
.min_by(Distances::cmp)
|
||||
}
|
||||
|
||||
pub fn correct_concurr(word: String, word_list: &[String]) -> String {
|
||||
let distances = find(&word,word_list).unwrap();
|
||||
if distances.levenshtein == 0 {
|
||||
return String::from(word);
|
||||
} else {
|
||||
return String::from(distances.candidate);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
pub fn correct_file(file_path: String, list_path: &str, output_path: String) {
|
||||
let re: Regex = Regex::new(r"[\w]+[\W]+").unwrap();
|
||||
let re2: Regex = Regex::new(r"(?P<specialfront>[\W&&[^\s]&&[^\n]]*)(?P<text>[\w]+)(?P<specialback>[\W&&[^\s]&&[^\n]]*)").unwrap();
|
||||
let mut input_file_lines: Vec<String> = Vec::new();
|
||||
@@ -53,6 +97,7 @@ pub mod utils {
|
||||
input_file_lines.push(line);
|
||||
});
|
||||
}
|
||||
|
||||
let mut input_words_by_line: Vec<Vec<String>> = Vec::new();
|
||||
for line in input_file_lines {
|
||||
let line_iter = line.split_whitespace();
|
||||
@@ -87,12 +132,12 @@ pub mod utils {
|
||||
let mut out_str: String = String::from("");
|
||||
out_str = out_str
|
||||
+ special_chars_front
|
||||
+ correct(String::from(text), list_path.clone()).as_str()
|
||||
+ correct(String::from(text), list_path).as_str()
|
||||
+ special_chars_back;
|
||||
output.push_str(out_str.as_str());
|
||||
output.push_str(" ");
|
||||
} else {
|
||||
output.push_str(correct(String::from(word), list_path.clone()).as_str());
|
||||
output.push_str(correct(String::from(word), list_path).as_str());
|
||||
output.push_str(" ");
|
||||
}
|
||||
}
|
||||
@@ -103,7 +148,77 @@ pub mod utils {
|
||||
Err(e) => panic!("Cannot write output file: {}", e),
|
||||
};
|
||||
match write!(output_file, "{}", output) {
|
||||
Ok(_) => exit(1),
|
||||
Ok(_) => (),
|
||||
Err(e) => panic!("Cannot write output file: {}", e),
|
||||
};
|
||||
}
|
||||
|
||||
pub fn correct_file_concurr(file_path: String, list_path: &str, output_path: String) {
|
||||
let re: Regex = Regex::new(r"[\w]+[\W]+").unwrap();
|
||||
let re2: Regex = Regex::new(r"(?P<specialfront>[\W&&[^\s]&&[^\n]]*)(?P<text>[\w]+)(?P<specialback>[\W&&[^\s]&&[^\n]]*)").unwrap();
|
||||
let mut input_file_lines: Vec<String> = Vec::new();
|
||||
if let Ok(lines) = read_lines(file_path) {
|
||||
lines.map_while(Result::ok).for_each(|line| {
|
||||
input_file_lines.push(line);
|
||||
});
|
||||
}
|
||||
let mut input_words_by_line: Vec<Vec<String>> = Vec::new();
|
||||
for line in input_file_lines {
|
||||
let line_iter = line.split_whitespace();
|
||||
let mut words: Vec<String> = Vec::new();
|
||||
for word in line_iter {
|
||||
if re.is_match(word) {
|
||||
let mut word_buffer: String;
|
||||
word_buffer =
|
||||
word.replace(&['(', '/', '\"', '\\', '<', '>', '»'], "");
|
||||
word_buffer = word_buffer.replace("ſ", "s");
|
||||
words.push(word_buffer);
|
||||
} else {
|
||||
let push_string = word.replace("ſ", "s");
|
||||
words.push(String::from(push_string));
|
||||
}
|
||||
}
|
||||
input_words_by_line.push(words);
|
||||
}
|
||||
let mut list_correct_words: Vec<String> = Vec::new();
|
||||
if let Ok(lines) = read_lines(list_path) {
|
||||
lines.map_while(Result::ok).for_each(|line| {
|
||||
list_correct_words.push(line);
|
||||
});
|
||||
}
|
||||
let mut output: String = String::new();
|
||||
let input_line_iter = input_words_by_line.iter();
|
||||
for line in input_line_iter {
|
||||
let input_word_iter = line.iter();
|
||||
for word in input_word_iter {
|
||||
let special_chars_front: &str;
|
||||
let special_chars_back: &str;
|
||||
let text: &str;
|
||||
if re.is_match(&word) {
|
||||
let captures = re2.captures(&word).unwrap();
|
||||
special_chars_front = &captures["specialfront"];
|
||||
special_chars_back = &captures["specialback"];
|
||||
text = &captures["text"];
|
||||
let mut out_str: String = String::from("");
|
||||
out_str = out_str
|
||||
+ special_chars_front
|
||||
+ correct_concurr(String::from(text), &list_correct_words).as_str()
|
||||
+ special_chars_back;
|
||||
output.push_str(out_str.as_str());
|
||||
output.push_str(" ");
|
||||
} else {
|
||||
output.push_str(correct_concurr(String::from(word), &list_correct_words).as_str());
|
||||
output.push_str(" ");
|
||||
}
|
||||
}
|
||||
output.push('\n');
|
||||
}
|
||||
let mut output_file: File = match File::create_new(output_path) {
|
||||
Ok(f) => f,
|
||||
Err(e) => panic!("Cannot write output file: {}", e),
|
||||
};
|
||||
match write!(output_file, "{}", output) {
|
||||
Ok(_) => (),
|
||||
Err(e) => panic!("Cannot write output file: {}", e),
|
||||
};
|
||||
}
|
||||
|
33
src/main.rs
33
src/main.rs
@@ -1,5 +1,7 @@
|
||||
use clap::{Args, Parser, Subcommand};
|
||||
use text_correction::utils;
|
||||
use std::time::SystemTime;
|
||||
use std::process::*;
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(name = "german word corrector")]
|
||||
@@ -13,7 +15,8 @@ struct Cli {
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
CorrectWord(WordArgs),
|
||||
CorrectFile(FileArgs)
|
||||
CorrectFile(FileArgs),
|
||||
BenchFile(FileArgs)
|
||||
}
|
||||
|
||||
#[derive(Args)]
|
||||
@@ -39,11 +42,35 @@ fn main() {
|
||||
|
||||
match &cli.command {
|
||||
Commands::CorrectWord(args) => {
|
||||
let out: String = utils::correct(args.input.clone(), args.list_path.clone());
|
||||
let out: String = utils::correct(args.input.clone(), args.list_path.as_str());
|
||||
println!("{}", out);
|
||||
},
|
||||
Commands::CorrectFile(args) => {
|
||||
utils::correct_file(args.input.clone(), args.list_path.clone(), args.output.clone())
|
||||
utils::correct_file_concurr(args.input.clone(), args.list_path.as_str(), args.output.clone())
|
||||
},
|
||||
Commands::BenchFile(args) => {
|
||||
let start_par = SystemTime::now();
|
||||
utils::correct_file_concurr(args.input.clone(), args.list_path.as_str(), args.output.clone());
|
||||
let stop_par = match start_par.elapsed() {
|
||||
Ok(elapsed) => elapsed.as_millis(),
|
||||
Err(e) => {
|
||||
println!("Error: {e:?}");
|
||||
exit(1);
|
||||
}
|
||||
};
|
||||
println!("Parallel processing took: {stop_par:?} ms");
|
||||
std::fs::remove_file(args.output.clone()).unwrap();
|
||||
let start = SystemTime::now();
|
||||
utils::correct_file(args.input.clone(), args.list_path.as_str(), args.output.clone());
|
||||
let stop = match start.elapsed() {
|
||||
Ok(elapsed) => elapsed.as_millis(),
|
||||
Err(e) => {
|
||||
println!("Error: {e:?}");
|
||||
exit(1);
|
||||
}
|
||||
};
|
||||
println!("Single-thread processing took: {stop:?} ms");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user