initial commit
This commit is contained in:
81
src/lib.rs
Normal file
81
src/lib.rs
Normal file
@@ -0,0 +1,81 @@
|
||||
pub mod utils {
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufRead, Result};
|
||||
use std::path::Path;
|
||||
use fuzzt::algorithms::{ levenshtein, sorensen_dice};
|
||||
use regex::Regex;
|
||||
|
||||
pub fn correct(word: String, list_path: String) -> String {
|
||||
let mut list_correct_words: Vec<String> = Vec::new();
|
||||
if let Ok(lines) = read_lines(list_path) {
|
||||
lines.map_while(Result::ok).for_each(|line| {
|
||||
list_correct_words.push(line);
|
||||
});
|
||||
}
|
||||
let list_iter = list_correct_words.iter();
|
||||
let mut has_match: bool = false;
|
||||
for correct_word in list_iter.clone() {
|
||||
if word.eq(correct_word) {
|
||||
has_match = true;
|
||||
}
|
||||
}
|
||||
if has_match {
|
||||
return word;
|
||||
} else {
|
||||
let mut closest: &str = "";
|
||||
let mut closest_dist: usize = 10000;
|
||||
let mut closest_dist_sorensen: f64 = 0.0f64;
|
||||
for correct_word in list_iter {
|
||||
//println!("Checking {}, dist: {}, current closest: {}, (damerau: {}, Sørensen-Dice: {}, Jaro-Winkler: {})", correct_word.as_str(), levenshtein(word.as_str(), correct_word.as_str()), closest_dist, damerau_levenshtein(word.as_str(), correct_word.as_str()), sorensen_dice(word.as_str(), correct_word.as_str()), jaro_winkler(word.as_str(), correct_word.as_str()));
|
||||
let dist = levenshtein(word.as_str(), correct_word.as_str());
|
||||
if dist < closest_dist {
|
||||
closest_dist = dist;
|
||||
closest = correct_word;
|
||||
closest_dist_sorensen = sorensen_dice(word.as_str(), correct_word.as_str());
|
||||
} else if dist == closest_dist {
|
||||
if sorensen_dice(word.as_str(), correct_word.as_str()) > closest_dist_sorensen {
|
||||
closest_dist = dist;
|
||||
closest = correct_word;
|
||||
}
|
||||
}
|
||||
}
|
||||
return String::from(closest);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn correct_file (file_path: String, list_path: String, output_path: String) {
|
||||
let re : Regex = Regex::new(r"[\w]+[\W]+").unwrap();
|
||||
let mut input_file_lines: Vec<String> = Vec::new();
|
||||
if let Ok(lines) = read_lines(file_path) {
|
||||
lines.map_while(Result::ok).for_each(|line| {
|
||||
input_file_lines.push(line);
|
||||
});
|
||||
}
|
||||
let mut input_file_words: Vec<Vec<String>> = Vec::new();
|
||||
for line in input_file_lines {
|
||||
let line_iter = line.split_whitespace();
|
||||
let mut words: Vec<String> = Vec::new();
|
||||
for word in line_iter {
|
||||
|
||||
if re.is_match(word) {
|
||||
let mut word_buffer: String;
|
||||
word_buffer = word.replace(&['(',')', '/', '\"', '\\', '<', '>', '*'], "");
|
||||
word_buffer = word_buffer.replace("ſ", "s");
|
||||
|
||||
|
||||
words.push(word_buffer);
|
||||
} else {
|
||||
words.push(String::from(word));
|
||||
}
|
||||
}
|
||||
input_file_words.push(words);
|
||||
}
|
||||
}
|
||||
|
||||
/// Opens `filename` and returns a buffered iterator over its lines.
/// Propagates the `io::Error` if the file cannot be opened.
fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
where P: AsRef<Path>, {
    // map the opened handle straight into a buffered line iterator
    File::open(filename).map(|file| io::BufReader::new(file).lines())
}
|
||||
}
|
||||
|
34
src/main.rs
Normal file
34
src/main.rs
Normal file
@@ -0,0 +1,34 @@
|
||||
use clap::{Args, Parser, Subcommand};
|
||||
use text_correction::utils;
|
||||
|
||||
/// Top-level command-line interface for the German word corrector.
/// Parsed by clap's derive API; help text for each option is rendered
/// on its own line (`next_line_help`).
#[derive(Parser)]
#[command(name = "german word corrector")]
#[command(version, about, long_about = None)]
#[command(next_line_help = true)]
struct Cli {
    // The single required subcommand selecting what to do.
    #[command(subcommand)]
    command: Commands
}
|
||||
|
||||
/// Subcommands understood by the CLI.
#[derive(Subcommand)]
enum Commands {
    // Correct a single word against a word list (see `WordArgs`).
    CorrectWord(WordArgs)
}
|
||||
|
||||
/// Arguments for the `correct-word` subcommand.
#[derive(Args)]
struct WordArgs {
    // The word to correct (flag: -i / --input).
    #[arg(short,long)]
    input: String,
    // Path to the word-list file.
    // NOTE(review): no #[arg] attribute here, so clap treats this as a
    // positional argument — inconsistent with `input`; confirm intended.
    list_path: String
}
|
||||
|
||||
fn main() {
|
||||
let cli: Cli = Cli::parse();
|
||||
|
||||
match &cli.command {
|
||||
Commands::CorrectWord(args) => {
|
||||
let out: String = utils::correct(args.input.clone(), args.list_path.clone());
|
||||
println!("{}", out);
|
||||
}
|
||||
}
|
||||
}
|
1908815
src/wordlist-german.txt
Normal file
1908815
src/wordlist-german.txt
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user