2 Commits
beta ... main

SHA1 Message Date
167036e144 minor changes 2025-07-16 03:08:16 +02:00
e1555e1432 introduced parallel processing of iterator with rayon 2025-07-16 02:35:57 +02:00
5 changed files with 394 additions and 38 deletions
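For orientation: the core of commit e1555e1432 is replacing the sequential scan over the candidate word list with a rayon parallel iterator that keeps the candidate with the smallest Levenshtein distance. A condensed sketch of that pattern, simplified from the `find`/`correct_concurr` helpers added in the diff below (the Sørensen-Dice tie-break of the full version is omitted here):

use fuzzt::algorithms::levenshtein;
use rayon::prelude::*;

/// Condensed form of the new lookup: compute the edit distance to every
/// candidate in parallel and keep the closest one.
fn closest<'a>(target: &str, candidates: &'a [String]) -> Option<&'a str> {
    candidates
        .par_iter()
        .min_by_key(|candidate| levenshtein(target, candidate.as_str()))
        .map(String::as_str)
}

fn main() {
    let list = vec!["Wort".to_string(), "Beispiel".to_string()];
    assert_eq!(closest("Wrt", &list), Some("Wort"));
}

Unlike `min_by_key` above, the real `Distances::cmp` in the diff falls back to a Sørensen-Dice comparison when two candidates share the same Levenshtein distance.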

Cargo.lock (generated)

@@ -89,10 +89,10 @@ version = "4.5.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef4f52386a59ca4c860f7393bcf8abd8dfd91ecccc0f774635ff68e92eeef491"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn",
"syn 2.0.104",
]
[[package]]
@@ -107,36 +107,113 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "either"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
[[package]]
name = "fuzzt"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a15f3d0fa42283a765e5fb609683ddab4ee4ff245d8db66a24d926c05e518c6"
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "lazy_static"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "levenshtein"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
[[package]]
name = "libc"
version = "0.2.174"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776"
[[package]]
name = "log"
version = "0.4.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
[[package]]
name = "maplit"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
[[package]]
name = "memchr"
version = "2.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"
[[package]]
name = "num_cpus"
version = "1.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
dependencies = [
"hermit-abi",
"libc",
]
[[package]]
name = "once_cell_polyfill"
version = "1.70.1"
@@ -161,6 +238,26 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rayon"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
]
[[package]]
name = "regex"
version = "1.11.1"
@@ -190,12 +287,62 @@ version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "rustversion"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d"
[[package]]
name = "spinners"
version = "4.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0ef947f358b9c238923f764c72a4a9d42f2d637c46e059dbd319d6e7cfb4f82"
dependencies = [
"lazy_static",
"maplit",
"strum",
]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "strum"
version = "0.24.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f"
dependencies = [
"strum_macros",
]
[[package]]
name = "strum_macros"
version = "0.24.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
dependencies = [
"heck 0.4.1",
"proc-macro2",
"quote",
"rustversion",
"syn 1.0.109",
]
[[package]]
name = "syn"
version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "syn"
version = "2.0.104"
@@ -209,12 +356,16 @@ dependencies = [
[[package]]
name = "text-correction"
version = "0.1.0"
version = "0.1.1"
dependencies = [
"clap",
"fuzzt",
"levenshtein",
"log",
"num_cpus",
"rayon",
"regex",
"spinners",
]
[[package]]

Cargo.toml

@@ -1,10 +1,23 @@
[package]
name = "text-correction"
version = "0.1.0"
version = "0.1.1"
edition = "2021"
license-file = "LICENSE"
readme = "README.md"
description = "Small CLI tool that corrects a given input file based on a word list given as input."
authors = ["nihil carcosa <nihil@valhrafnaz.gay>"]
homepage = "https://valhrafnaz.gay"
repository = "https://git.valhrafnaz.gay/valhrafnaz/text-correction"
categories = ["command-line-utilities"]
keywords = ["text-processing"]
publish = ["gitea"]
[dependencies]
clap = { version = "4.5.41", features = ["derive"] }
fuzzt = "0.3.1"
levenshtein = "1.0.5"
log = "0.4.27"
num_cpus = "1.17.0"
rayon = "1.10.0"
regex = "1.11.1"
spinners = "4.1.1"
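The new manifest entries line up with the code changes: rayon drives the parallel iterators, spinners provides the progress indicator used in the CLI, and num_cpus is declared but not visibly used in this diff. One common reason to pair num_cpus with rayon is to size the global thread pool explicitly; a purely hypothetical sketch of that (rayon detects the core count on its own, so the crate may not need this at all):

// Hypothetical: size rayon's global pool from the detected core count.
// Only useful if the default detection is not what you want.
fn init_thread_pool() -> Result<(), rayon::ThreadPoolBuildError> {
    rayon::ThreadPoolBuilder::new()
        .num_threads(num_cpus::get())
        .build_global()
}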

output.txt (new file)

@@ -0,0 +1,32 @@
Vorrede,
Ein schönes Wort jenes Wort des Propheten: Thuet
Salz darein!“
Als zu dem Propheten Elisa die Männer von Jericho kam
amen und klagten, daß das Wasser der Stadt böse und das Land
unfruchtbar sey, sprach er: Bringet mir her eine neue Schaale
und thut Salz darein!“ und sie brachten es ihm Hz da ging
er hinaus zu der Wasserquelle und warf das Salz hinein und
machte sie mit dem Worte des Herrn gesund*).
unser theurer Krummacher hat es am lebhaftesten gefühlt,
daß dieses prophetische runder symbolische Bedeutung hat und
geistlich zu allen Zeiten in der Gemeinde wiederholt werden muß.
wenigstens müssen wir beständig um die geistliche Erneuerung
desselben flehen. Wirt sehnen uns danach, wir erflehen sie. auch
unsere Brunnen sind abgestanden, faul, vergiftet und hauchen
den Tod aus, der nicht eine Stadt und Gegend, sondern eine
Welt zu verderben droht, und in unser Tagens schrecklicher als
jemals vorher wütet. Gene vergifteten Brunnen sind die bis-
sesshaften, die in schrecklicher Loßgebundenheit von allem
göttlichen mit ihren selbstgemachten Gesetzen das gesamte Unis-
versus zu umschließen sich anmaßen die Künste, die ihrem
ursprünglichen Beruf, Weissagerinnen zu seyn vom jenseits,
hohnlachend Chalet gegeben haben, um die Sünde mit dem Glanze
der Verklärung zu umwerben Hz; eine Theologie, die aus dem
Eignens redet, wie der Vater der Lügens, und die inwendig ca-
kanaanitisch gesinnt ficht, den Leviten AA Rob heuchlerisch) umgeworfen
hat; eine Philosophie, welche das Nichte- des Allee-
Hegel übe. Kunst au. Re. AA

src/lib.rs

@@ -1,12 +1,14 @@
pub mod utils {
use std::cell::OnceCell;
use std::cmp::Ordering;
use fuzzt::algorithms::{levenshtein, sorensen_dice};
use regex::Regex;
use std::fs::File;
use std::io::{self, BufRead, Result, Write};
use std::path::Path;
use std::process::exit;
use rayon::prelude::*;
pub fn correct(word: String, list_path: String) -> String {
pub fn correct(word: String, list_path: &str) -> String {
let mut list_correct_words: Vec<String> = Vec::new();
if let Ok(lines) = read_lines(list_path) {
lines.map_while(Result::ok).for_each(|line| {
@@ -14,20 +16,10 @@ pub mod utils {
});
}
let list_iter = list_correct_words.iter();
let mut has_match: bool = false;
for correct_word in list_iter.clone() {
if word.eq(correct_word) || word.to_lowercase().eq(correct_word) {
has_match = true;
}
}
if has_match {
return word;
} else {
let mut closest: &str = "";
let mut closest_dist: usize = 10000;
let mut closest_dist_sorensen: f64 = 0.0f64;
for correct_word in list_iter {
//println!("Checking {}, dist: {}, current closest: {}, (damerau: {}, Sørensen-Dice: {}, Jaro-Winkler: {})", correct_word.as_str(), levenshtein(word.as_str(), correct_word.as_str()), closest_dist, damerau_levenshtein(word.as_str(), correct_word.as_str()), sorensen_dice(word.as_str(), correct_word.as_str()), jaro_winkler(word.as_str(), correct_word.as_str()));
let dist = levenshtein(word.as_str(), correct_word.as_str());
if dist < closest_dist {
closest_dist = dist;
@@ -40,11 +32,63 @@ pub mod utils {
}
}
}
if closest_dist == 0 {
return String::from(word);
} else {
return String::from(closest);
}
}
pub fn correct_file(file_path: String, list_path: String, output_path: String) {
struct Distances<'a> {
target: &'a str,
candidate: &'a str,
levenshtein: usize,
sorensen: OnceCell<f64>,
}
impl<'a> Distances<'a> {
fn new(target: &'a str, candidate: &'a str) -> Self {
Self {
target,
candidate,
levenshtein: levenshtein(target, candidate),
sorensen: OnceCell::new(),
}
}
fn calc_sorensen(&self) -> &f64 {
self.sorensen
.get_or_init(|| sorensen_dice(self.target, self.candidate))
}
fn cmp(&self, other: &Self) -> Ordering {
assert_eq!(self.target, other.target);
match usize::cmp(&self.levenshtein, &other.levenshtein) {
Ordering::Less => Ordering::Less,
Ordering::Equal => {
// intentionally reverse order here
f64::total_cmp(other.calc_sorensen(), self.calc_sorensen())
}
Ordering::Greater => Ordering::Greater,
}
}
}
fn find<'a>(target: &'a str, candidates: &'a [String]) -> Option<Distances<'a>> {
candidates
.par_iter() // or .iter() for single-threaded
.map(|candidate| Distances::new(target, candidate))
.min_by(Distances::cmp)
}
pub fn correct_concurr(word: String, word_list: &[String]) -> String {
let distances = find(&word, word_list).unwrap();
if distances.levenshtein == 0 {
return String::from(word);
} else {
return String::from(distances.candidate);
}
}
pub fn correct_file(file_path: String, list_path: &str, output_path: String) {
let re: Regex = Regex::new(r"[\w]+[\W]+").unwrap();
let re2: Regex = Regex::new(r"(?P<specialfront>[\W&&[^\s]&&[^\n]]*)(?P<text>[\w]+)(?P<specialback>[\W&&[^\s]&&[^\n]]*)").unwrap();
let mut input_file_lines: Vec<String> = Vec::new();
@@ -53,6 +97,7 @@ pub mod utils {
input_file_lines.push(line);
});
}
let mut input_words_by_line: Vec<Vec<String>> = Vec::new();
for line in input_file_lines {
let line_iter = line.split_whitespace();
@@ -87,12 +132,12 @@ pub mod utils {
let mut out_str: String = String::from("");
out_str = out_str
+ special_chars_front
+ correct(String::from(text), list_path.clone()).as_str()
+ correct(String::from(text), list_path).as_str()
+ special_chars_back;
output.push_str(out_str.as_str());
output.push_str(" ");
} else {
output.push_str(correct(String::from(word), list_path.clone()).as_str());
output.push_str(correct(String::from(word), list_path).as_str());
output.push_str(" ");
}
}
@@ -103,7 +148,77 @@ pub mod utils {
Err(e) => panic!("Cannot write output file: {}", e),
};
match write!(output_file, "{}", output) {
Ok(_) => exit(1),
Ok(_) => (),
Err(e) => panic!("Cannot write output file: {}", e),
};
}
pub fn correct_file_concurr(file_path: String, list_path: &str, output_path: String) {
let re: Regex = Regex::new(r"[\w]+[\W]+").unwrap();
let re2: Regex = Regex::new(r"(?P<specialfront>[\W&&[^\s]&&[^\n]]*)(?P<text>[\w]+)(?P<specialback>[\W&&[^\s]&&[^\n]]*)").unwrap();
let mut input_file_lines: Vec<String> = Vec::new();
if let Ok(lines) = read_lines(file_path) {
lines.map_while(Result::ok).for_each(|line| {
input_file_lines.push(line);
});
}
let mut input_words_by_line: Vec<Vec<String>> = Vec::new();
for line in input_file_lines {
let line_iter = line.split_whitespace();
let mut words: Vec<String> = Vec::new();
for word in line_iter {
if re.is_match(word) {
let mut word_buffer: String;
word_buffer =
word.replace(&['(', '/', '\"', '\\', '<', '>', '»'], "");
word_buffer = word_buffer.replace("ſ", "s");
words.push(word_buffer);
} else {
let push_string = word.replace("ſ", "s");
words.push(String::from(push_string));
}
}
input_words_by_line.push(words);
}
let mut list_correct_words: Vec<String> = Vec::new();
if let Ok(lines) = read_lines(list_path) {
lines.map_while(Result::ok).for_each(|line| {
list_correct_words.push(line);
});
}
let mut output: String = String::new();
let input_line_iter = input_words_by_line.iter();
for line in input_line_iter {
let input_word_iter = line.iter();
for word in input_word_iter {
let special_chars_front: &str;
let special_chars_back: &str;
let text: &str;
if re.is_match(&word) {
let captures = re2.captures(&word).unwrap();
special_chars_front = &captures["specialfront"];
special_chars_back = &captures["specialback"];
text = &captures["text"];
let mut out_str: String = String::from("");
out_str = out_str
+ special_chars_front
+ correct_concurr(String::from(text), &list_correct_words).as_str()
+ special_chars_back;
output.push_str(out_str.as_str());
output.push_str(" ");
} else {
output.push_str(correct_concurr(String::from(word), &list_correct_words).as_str());
output.push_str(" ");
}
}
output.push('\n');
}
let mut output_file: File = match File::create_new(output_path) {
Ok(f) => f,
Err(e) => panic!("Cannot write output file: {}", e),
};
match write!(output_file, "{}", output) {
Ok(_) => (),
Err(e) => panic!("Cannot write output file: {}", e),
};
}
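A quick usage sketch of the new parallel entry point: unlike correct(), which re-reads the word-list file on every call, correct_concurr() takes the list as an in-memory slice (the sample words here are made up):

use text_correction::utils;

fn main() {
    // Word list loaded once up front; in the tool this comes from the list file.
    let list = vec!["Wort".to_string(), "Beispiel".to_string()];

    // Correct a single word against the preloaded list.
    let fixed = utils::correct_concurr("Wrt".to_string(), &list);
    assert_eq!(fixed, "Wort");
}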

src/main.rs

@@ -1,8 +1,13 @@
use clap::{Args, Parser, Subcommand};
use text_correction::utils;
use std::time::SystemTime;
use std::process::*;
use std::path::Path;
use spinners::{Spinner, Spinners};
use log::{error, trace};
#[derive(Parser)]
#[command(name = "german word corrector")]
#[command(name = "word corrector")]
#[command(version, about, long_about = None)]
#[command(next_line_help = true)]
struct Cli {
@@ -13,7 +18,8 @@ struct Cli {
#[derive(Subcommand)]
enum Commands {
CorrectWord(WordArgs),
CorrectFile(FileArgs)
CorrectFile(FileArgs),
BenchFile(FileArgs)
}
#[derive(Args)]
@@ -31,7 +37,9 @@ struct FileArgs {
#[arg(short,long)]
list_path: String,
#[arg(short,long)]
output: String
output: String,
#[arg(long)] // long-only: a default short '-o' would clash with output's '-o'
overwrite: bool
}
fn main() {
@@ -39,11 +47,48 @@ fn main() {
match &cli.command {
Commands::CorrectWord(args) => {
let out: String = utils::correct(args.input.clone(), args.list_path.clone());
let out: String = utils::correct(args.input.clone(), args.list_path.as_str());
println!("{}", out);
},
Commands::CorrectFile(args) => {
utils::correct_file(args.input.clone(), args.list_path.clone(), args.output.clone())
if args.overwrite == true {
trace!("Overwriting old file due to cli argument.");
std::fs::remove_file(args.output.clone()).unwrap();
} else {
trace!("Checking whether destination is writable.");
let path = "./".to_owned() + args.output.as_str();
if Path::new(path.as_str()).exists() {
error!("File already exists!");
exit(1);
}
}
let mut sp = Spinner::new(Spinners::Dots, "Processing file...".into());
utils::correct_file_concurr(args.input.clone(), args.list_path.as_str(), args.output.clone());
sp.stop_with_message("".into());
},
Commands::BenchFile(args) => {
let start_par = SystemTime::now();
utils::correct_file_concurr(args.input.clone(), args.list_path.as_str(), args.output.clone());
let stop_par = match start_par.elapsed() {
Ok(elapsed) => elapsed.as_millis(),
Err(e) => {
println!("Error: {e:?}");
exit(1);
}
};
println!("Parallel processing took: {stop_par:?} ms");
std::fs::remove_file(args.output.clone()).unwrap();
let start = SystemTime::now();
utils::correct_file(args.input.clone(), args.list_path.as_str(), args.output.clone());
let stop = match start.elapsed() {
Ok(elapsed) => elapsed.as_millis(),
Err(e) => {
println!("Error: {e:?}");
exit(1);
}
};
println!("Single-thread processing took: {stop:?} ms");
}
}
}
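One remark on the new overwrite handling above: the non-overwrite branch checks for an existing file by prepending "./" to the output argument, which only behaves as intended for paths relative to the current directory, and the overwrite branch calls remove_file(...).unwrap(), which panics when the file does not exist yet. A hypothetical, slightly more forgiving variant of that guard (not part of the diff):

use std::path::Path;
use std::process::exit;

// Hypothetical replacement for the pre-flight check in Commands::CorrectFile:
// treat the output argument as a path directly and tolerate a missing file.
fn ensure_writable(output: &str, overwrite: bool) {
    if overwrite {
        // Ignore the error if there is nothing to remove yet.
        let _ = std::fs::remove_file(output);
    } else if Path::new(output).exists() {
        eprintln!("File already exists: {output}");
        exit(1);
    }
}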