initial commit
This commit is contained in:
		
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | ||||
| /target | ||||
							
								
								
									
										303
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
										Normal file
									
								
							
							
						
						
									
										303
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,303 @@ | ||||
| # This file is automatically @generated by Cargo. | ||||
| # It is not intended for manual editing. | ||||
| version = 3 | ||||
|  | ||||
| [[package]] | ||||
| name = "aho-corasick" | ||||
| version = "1.1.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" | ||||
| dependencies = [ | ||||
|  "memchr", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "anstream" | ||||
| version = "0.6.19" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "301af1932e46185686725e0fad2f8f2aa7da69dd70bf6ecc44d6b703844a3933" | ||||
| dependencies = [ | ||||
|  "anstyle", | ||||
|  "anstyle-parse", | ||||
|  "anstyle-query", | ||||
|  "anstyle-wincon", | ||||
|  "colorchoice", | ||||
|  "is_terminal_polyfill", | ||||
|  "utf8parse", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "anstyle" | ||||
| version = "1.0.11" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" | ||||
|  | ||||
| [[package]] | ||||
| name = "anstyle-parse" | ||||
| version = "0.2.7" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" | ||||
| dependencies = [ | ||||
|  "utf8parse", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "anstyle-query" | ||||
| version = "1.1.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" | ||||
| dependencies = [ | ||||
|  "windows-sys", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "anstyle-wincon" | ||||
| version = "3.0.9" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" | ||||
| dependencies = [ | ||||
|  "anstyle", | ||||
|  "once_cell_polyfill", | ||||
|  "windows-sys", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "clap" | ||||
| version = "4.5.41" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "be92d32e80243a54711e5d7ce823c35c41c9d929dc4ab58e1276f625841aadf9" | ||||
| dependencies = [ | ||||
|  "clap_builder", | ||||
|  "clap_derive", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "clap_builder" | ||||
| version = "4.5.41" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "707eab41e9622f9139419d573eca0900137718000c517d47da73045f54331c3d" | ||||
| dependencies = [ | ||||
|  "anstream", | ||||
|  "anstyle", | ||||
|  "clap_lex", | ||||
|  "strsim", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "clap_derive" | ||||
| version = "4.5.41" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "ef4f52386a59ca4c860f7393bcf8abd8dfd91ecccc0f774635ff68e92eeef491" | ||||
| dependencies = [ | ||||
|  "heck", | ||||
|  "proc-macro2", | ||||
|  "quote", | ||||
|  "syn", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "clap_lex" | ||||
| version = "0.7.5" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" | ||||
|  | ||||
| [[package]] | ||||
| name = "colorchoice" | ||||
| version = "1.0.4" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" | ||||
|  | ||||
| [[package]] | ||||
| name = "fuzzt" | ||||
| version = "0.3.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "8a15f3d0fa42283a765e5fb609683ddab4ee4ff245d8db66a24d926c05e518c6" | ||||
|  | ||||
| [[package]] | ||||
| name = "heck" | ||||
| version = "0.5.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" | ||||
|  | ||||
| [[package]] | ||||
| name = "is_terminal_polyfill" | ||||
| version = "1.70.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" | ||||
|  | ||||
| [[package]] | ||||
| name = "levenshtein" | ||||
| version = "1.0.5" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760" | ||||
|  | ||||
| [[package]] | ||||
| name = "memchr" | ||||
| version = "2.7.5" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" | ||||
|  | ||||
| [[package]] | ||||
| name = "once_cell_polyfill" | ||||
| version = "1.70.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" | ||||
|  | ||||
| [[package]] | ||||
| name = "proc-macro2" | ||||
| version = "1.0.95" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" | ||||
| dependencies = [ | ||||
|  "unicode-ident", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "quote" | ||||
| version = "1.0.40" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" | ||||
| dependencies = [ | ||||
|  "proc-macro2", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "regex" | ||||
| version = "1.11.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" | ||||
| dependencies = [ | ||||
|  "aho-corasick", | ||||
|  "memchr", | ||||
|  "regex-automata", | ||||
|  "regex-syntax", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "regex-automata" | ||||
| version = "0.4.9" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" | ||||
| dependencies = [ | ||||
|  "aho-corasick", | ||||
|  "memchr", | ||||
|  "regex-syntax", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "regex-syntax" | ||||
| version = "0.8.5" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" | ||||
|  | ||||
| [[package]] | ||||
| name = "strsim" | ||||
| version = "0.11.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" | ||||
|  | ||||
| [[package]] | ||||
| name = "syn" | ||||
| version = "2.0.104" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" | ||||
| dependencies = [ | ||||
|  "proc-macro2", | ||||
|  "quote", | ||||
|  "unicode-ident", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "text-correction" | ||||
| version = "0.1.0" | ||||
| dependencies = [ | ||||
|  "clap", | ||||
|  "fuzzt", | ||||
|  "levenshtein", | ||||
|  "regex", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "unicode-ident" | ||||
| version = "1.0.18" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" | ||||
|  | ||||
| [[package]] | ||||
| name = "utf8parse" | ||||
| version = "0.2.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" | ||||
|  | ||||
| [[package]] | ||||
| name = "windows-sys" | ||||
| version = "0.59.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" | ||||
| dependencies = [ | ||||
|  "windows-targets", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "windows-targets" | ||||
| version = "0.52.6" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" | ||||
| dependencies = [ | ||||
|  "windows_aarch64_gnullvm", | ||||
|  "windows_aarch64_msvc", | ||||
|  "windows_i686_gnu", | ||||
|  "windows_i686_gnullvm", | ||||
|  "windows_i686_msvc", | ||||
|  "windows_x86_64_gnu", | ||||
|  "windows_x86_64_gnullvm", | ||||
|  "windows_x86_64_msvc", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "windows_aarch64_gnullvm" | ||||
| version = "0.52.6" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" | ||||
|  | ||||
| [[package]] | ||||
| name = "windows_aarch64_msvc" | ||||
| version = "0.52.6" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" | ||||
|  | ||||
| [[package]] | ||||
| name = "windows_i686_gnu" | ||||
| version = "0.52.6" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" | ||||
|  | ||||
| [[package]] | ||||
| name = "windows_i686_gnullvm" | ||||
| version = "0.52.6" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" | ||||
|  | ||||
| [[package]] | ||||
| name = "windows_i686_msvc" | ||||
| version = "0.52.6" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" | ||||
|  | ||||
| [[package]] | ||||
| name = "windows_x86_64_gnu" | ||||
| version = "0.52.6" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" | ||||
|  | ||||
| [[package]] | ||||
| name = "windows_x86_64_gnullvm" | ||||
| version = "0.52.6" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" | ||||
|  | ||||
| [[package]] | ||||
| name = "windows_x86_64_msvc" | ||||
| version = "0.52.6" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" | ||||
							
								
								
									
										10
									
								
								Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								Cargo.toml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,10 @@ | ||||
| [package] | ||||
| name = "text-correction" | ||||
| version = "0.1.0" | ||||
| edition = "2021" | ||||
|  | ||||
| [dependencies] | ||||
| clap = { version = "4.5.41", features = ["derive"] } | ||||
| fuzzt = "0.3.1" | ||||
| levenshtein = "1.0.5" | ||||
| regex = "1.11.1" | ||||
							
								
								
									
										81
									
								
								src/lib.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										81
									
								
								src/lib.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,81 @@ | ||||
| pub mod utils { | ||||
|     use std::fs::File; | ||||
|     use std::io::{self, BufRead, Result}; | ||||
|     use std::path::Path; | ||||
|     use fuzzt::algorithms::{ levenshtein, sorensen_dice}; | ||||
|     use regex::Regex; | ||||
|      | ||||
|     pub fn correct(word: String, list_path: String) -> String { | ||||
|         let mut list_correct_words: Vec<String> = Vec::new(); | ||||
|         if let Ok(lines) = read_lines(list_path) { | ||||
|             lines.map_while(Result::ok).for_each(|line| { | ||||
|                 list_correct_words.push(line); | ||||
|             }); | ||||
|         } | ||||
|         let list_iter = list_correct_words.iter(); | ||||
|         let mut has_match: bool = false; | ||||
|         for correct_word in list_iter.clone() { | ||||
|             if word.eq(correct_word) { | ||||
|                 has_match = true; | ||||
|             } | ||||
|         } | ||||
|         if has_match { | ||||
|             return word; | ||||
|         } else { | ||||
|             let mut closest: &str = ""; | ||||
|             let mut closest_dist: usize = 10000; | ||||
|             let mut closest_dist_sorensen: f64 = 0.0f64; | ||||
|             for correct_word in list_iter { | ||||
|                 //println!("Checking {}, dist: {}, current closest: {}, (damerau: {}, Sørensen-Dice: {}, Jaro-Winkler: {})", correct_word.as_str(), levenshtein(word.as_str(), correct_word.as_str()), closest_dist, damerau_levenshtein(word.as_str(), correct_word.as_str()), sorensen_dice(word.as_str(), correct_word.as_str()), jaro_winkler(word.as_str(), correct_word.as_str())); | ||||
|                 let dist = levenshtein(word.as_str(), correct_word.as_str()); | ||||
|                 if dist < closest_dist { | ||||
|                     closest_dist = dist; | ||||
|                     closest = correct_word; | ||||
|                     closest_dist_sorensen = sorensen_dice(word.as_str(), correct_word.as_str()); | ||||
|                 } else if dist == closest_dist { | ||||
|                     if sorensen_dice(word.as_str(), correct_word.as_str()) > closest_dist_sorensen { | ||||
|                         closest_dist = dist; | ||||
|                         closest = correct_word; | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|             return String::from(closest); | ||||
|         } | ||||
|     } | ||||
|      | ||||
|     pub fn correct_file (file_path: String, list_path: String, output_path: String)  { | ||||
|         let re : Regex = Regex::new(r"[\w]+[\W]+").unwrap(); | ||||
|         let mut input_file_lines: Vec<String> = Vec::new(); | ||||
|         if let Ok(lines) = read_lines(file_path) { | ||||
|             lines.map_while(Result::ok).for_each(|line| { | ||||
|                 input_file_lines.push(line); | ||||
|             }); | ||||
|         } | ||||
|         let mut input_file_words: Vec<Vec<String>> = Vec::new(); | ||||
|         for line in input_file_lines { | ||||
|             let line_iter = line.split_whitespace(); | ||||
|             let mut words: Vec<String> = Vec::new(); | ||||
|             for word in line_iter { | ||||
|                  | ||||
|                 if re.is_match(word) { | ||||
|                     let mut word_buffer: String; | ||||
|                     word_buffer = word.replace(&['(',')', '/', '\"', '\\', '<', '>', '*'], ""); | ||||
|                     word_buffer = word_buffer.replace("ſ", "s"); | ||||
|                      | ||||
|                      | ||||
|                     words.push(word_buffer); | ||||
|                 } else { | ||||
|                     words.push(String::from(word)); | ||||
|                 } | ||||
|             } | ||||
|             input_file_words.push(words); | ||||
|         } | ||||
|     } | ||||
|      | ||||
|     fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>> | ||||
|     where P: AsRef<Path>, { | ||||
|         let file = File::open(filename)?; | ||||
|         Ok(io::BufReader::new(file).lines()) | ||||
|     } | ||||
| } | ||||
|  | ||||
							
								
								
									
										34
									
								
								src/main.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								src/main.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,34 @@ | ||||
| use clap::{Args, Parser, Subcommand}; | ||||
| use text_correction::utils; | ||||
|  | ||||
| #[derive(Parser)] | ||||
| #[command(name = "german word corrector")] | ||||
| #[command(version, about, long_about = None)] | ||||
| #[command(next_line_help = true)] | ||||
| struct Cli { | ||||
|     #[command(subcommand)] | ||||
|     command: Commands | ||||
| } | ||||
|  | ||||
| #[derive(Subcommand)] | ||||
| enum Commands { | ||||
|     CorrectWord(WordArgs) | ||||
| } | ||||
|  | ||||
| #[derive(Args)] | ||||
| struct WordArgs { | ||||
|     #[arg(short,long)] | ||||
|     input: String, | ||||
|     list_path: String | ||||
| } | ||||
|  | ||||
| fn main() { | ||||
|     let cli: Cli = Cli::parse(); | ||||
|      | ||||
|     match &cli.command { | ||||
|         Commands::CorrectWord(args) => { | ||||
|             let out: String = utils::correct(args.input.clone(), args.list_path.clone()); | ||||
|             println!("{}", out); | ||||
|         } | ||||
|     } | ||||
| } | ||||
							
								
								
									
										1908815
									
								
								src/wordlist-german.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1908815
									
								
								src/wordlist-german.txt
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										1908816
									
								
								wordlist-german.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1908816
									
								
								wordlist-german.txt
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
		Reference in New Issue
	
	Block a user