initial commit

This commit is contained in:
2025-07-13 20:01:05 +02:00
commit 2ea546e037
8 changed files with 3818060 additions and 0 deletions

81
src/lib.rs Normal file
View File

@@ -0,0 +1,81 @@
pub mod utils {
    use std::fs::File;
    use std::io::{self, BufRead, Result};
    use std::path::Path;
    use fuzzt::algorithms::{levenshtein, sorensen_dice};
    use regex::Regex;

    /// Returns the best correction for `word` using the newline-separated
    /// word list at `list_path`.
    ///
    /// If `word` occurs verbatim in the list it is returned unchanged.
    /// Otherwise the list entry with the smallest Levenshtein distance to
    /// `word` is returned; ties are broken by the higher Sørensen–Dice
    /// similarity. If the list is empty or the file cannot be read, an
    /// empty string is returned (preserves original behavior).
    pub fn correct(word: String, list_path: String) -> String {
        let mut list_correct_words: Vec<String> = Vec::new();
        if let Ok(lines) = read_lines(list_path) {
            lines.map_while(Result::ok).for_each(|line| {
                list_correct_words.push(line);
            });
        }
        // Exact hit: nothing to correct — return early instead of
        // scanning the whole list into a flag first.
        if list_correct_words.iter().any(|w| w == &word) {
            return word;
        }
        let mut closest: &str = "";
        // usize::MAX instead of the arbitrary sentinel 10000, so the first
        // candidate always wins even for pathologically long inputs.
        let mut closest_dist: usize = usize::MAX;
        let mut closest_dist_sorensen: f64 = 0.0;
        for correct_word in &list_correct_words {
            let dist = levenshtein(word.as_str(), correct_word.as_str());
            if dist < closest_dist {
                closest_dist = dist;
                closest = correct_word;
                closest_dist_sorensen = sorensen_dice(word.as_str(), correct_word.as_str());
            } else if dist == closest_dist {
                let sorensen = sorensen_dice(word.as_str(), correct_word.as_str());
                if sorensen > closest_dist_sorensen {
                    closest = correct_word;
                    // BUG FIX: keep the tie-break score in sync with the new
                    // `closest`. Previously this was never updated here, so
                    // later equal-distance candidates were compared against a
                    // stale Sørensen–Dice value and could wrongly win or lose.
                    closest_dist_sorensen = sorensen;
                }
            }
        }
        String::from(closest)
    }

    /// Reads `file_path` line by line, splits each line on whitespace, and
    /// cleans every token that matches the word+punctuation pattern:
    /// strips the characters ()/"\<>* and normalizes the long s (ſ) to 's'.
    ///
    /// NOTE(review): the cleaned words are accumulated and then discarded;
    /// `list_path` and `output_path` are never used. Correcting the words
    /// and writing the result still appears to be TODO — confirm intent.
    pub fn correct_file(file_path: String, list_path: String, output_path: String) {
        // Token must contain at least one word character followed by at
        // least one non-word character to be considered for cleaning.
        let re: Regex = Regex::new(r"[\w]+[\W]+").unwrap();
        let mut input_file_lines: Vec<String> = Vec::new();
        if let Ok(lines) = read_lines(file_path) {
            lines.map_while(Result::ok).for_each(|line| {
                input_file_lines.push(line);
            });
        }
        let mut input_file_words: Vec<Vec<String>> = Vec::new();
        for line in input_file_lines {
            let mut words: Vec<String> = Vec::new();
            for word in line.split_whitespace() {
                if re.is_match(word) {
                    // Drop typographic noise, then map the historical long s
                    // (U+017F) to a plain 's' so list lookups can match.
                    let cleaned = word
                        .replace(&['(', ')', '/', '\"', '\\', '<', '>', '*'], "")
                        .replace("ſ", "s");
                    words.push(cleaned);
                } else {
                    words.push(String::from(word));
                }
            }
            input_file_words.push(words);
        }
        // Unused for now; kept to preserve the public signature.
        let _ = (list_path, output_path);
    }

    /// Opens `filename` and returns a buffered line iterator.
    /// Errors from `File::open` are propagated to the caller.
    fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
    where
        P: AsRef<Path>,
    {
        let file = File::open(filename)?;
        Ok(io::BufReader::new(file).lines())
    }
}

34
src/main.rs Normal file
View File

@@ -0,0 +1,34 @@
use clap::{Args, Parser, Subcommand};
use text_correction::utils;
/// Top-level command-line interface for the German word corrector.
/// Parsed by clap's derive API; every invocation must name a subcommand.
#[derive(Parser)]
#[command(name = "german word corrector")]
#[command(version, about, long_about = None)]
#[command(next_line_help = true)]
struct Cli {
    // The subcommand selected by the user (see `Commands`).
    #[command(subcommand)]
    command: Commands
}
/// Available subcommands. Currently only single-word correction is exposed.
#[derive(Subcommand)]
enum Commands {
    // `correct-word`: correct one word against a word list (see `WordArgs`).
    CorrectWord(WordArgs)
}
/// Arguments for the `correct-word` subcommand.
#[derive(Args)]
struct WordArgs {
    // The word to be corrected, given as `-i`/`--input`.
    #[arg(short,long)]
    input: String,
    // Path to the newline-separated word list.
    // NOTE(review): unlike `input`, this field has no #[arg] attribute, so
    // clap treats it as a *positional* argument — confirm that is intended.
    list_path: String
}
fn main() {
let cli: Cli = Cli::parse();
match &cli.command {
Commands::CorrectWord(args) => {
let out: String = utils::correct(args.input.clone(), args.list_path.clone());
println!("{}", out);
}
}
}

1908815
src/wordlist-german.txt Normal file

File diff suppressed because it is too large Load Diff