initial commit
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
/target
|
303
Cargo.lock
generated
Normal file
303
Cargo.lock
generated
Normal file
@@ -0,0 +1,303 @@
|
|||||||
|
# This file is automatically @generated by Cargo.
|
||||||
|
# It is not intended for manual editing.
|
||||||
|
version = 3
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "1.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstream"
|
||||||
|
version = "0.6.19"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "301af1932e46185686725e0fad2f8f2aa7da69dd70bf6ecc44d6b703844a3933"
|
||||||
|
dependencies = [
|
||||||
|
"anstyle",
|
||||||
|
"anstyle-parse",
|
||||||
|
"anstyle-query",
|
||||||
|
"anstyle-wincon",
|
||||||
|
"colorchoice",
|
||||||
|
"is_terminal_polyfill",
|
||||||
|
"utf8parse",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle"
|
||||||
|
version = "1.0.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-parse"
|
||||||
|
version = "0.2.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2"
|
||||||
|
dependencies = [
|
||||||
|
"utf8parse",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-query"
|
||||||
|
version = "1.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9"
|
||||||
|
dependencies = [
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-wincon"
|
||||||
|
version = "3.0.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882"
|
||||||
|
dependencies = [
|
||||||
|
"anstyle",
|
||||||
|
"once_cell_polyfill",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap"
|
||||||
|
version = "4.5.41"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "be92d32e80243a54711e5d7ce823c35c41c9d929dc4ab58e1276f625841aadf9"
|
||||||
|
dependencies = [
|
||||||
|
"clap_builder",
|
||||||
|
"clap_derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap_builder"
|
||||||
|
version = "4.5.41"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "707eab41e9622f9139419d573eca0900137718000c517d47da73045f54331c3d"
|
||||||
|
dependencies = [
|
||||||
|
"anstream",
|
||||||
|
"anstyle",
|
||||||
|
"clap_lex",
|
||||||
|
"strsim",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap_derive"
|
||||||
|
version = "4.5.41"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ef4f52386a59ca4c860f7393bcf8abd8dfd91ecccc0f774635ff68e92eeef491"
|
||||||
|
dependencies = [
|
||||||
|
"heck",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap_lex"
|
||||||
|
version = "0.7.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "colorchoice"
|
||||||
|
version = "1.0.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fuzzt"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8a15f3d0fa42283a765e5fb609683ddab4ee4ff245d8db66a24d926c05e518c6"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "heck"
|
||||||
|
version = "0.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "is_terminal_polyfill"
|
||||||
|
version = "1.70.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "levenshtein"
|
||||||
|
version = "1.0.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "db13adb97ab515a3691f56e4dbab09283d0b86cb45abd991d8634a9d6f501760"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memchr"
|
||||||
|
version = "2.7.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "once_cell_polyfill"
|
||||||
|
version = "1.70.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proc-macro2"
|
||||||
|
version = "1.0.95"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-ident",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "quote"
|
||||||
|
version = "1.0.40"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex"
|
||||||
|
version = "1.11.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-automata",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-automata"
|
||||||
|
version = "0.4.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-syntax"
|
||||||
|
version = "0.8.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "strsim"
|
||||||
|
version = "0.11.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "syn"
|
||||||
|
version = "2.0.104"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"unicode-ident",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "text-correction"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"clap",
|
||||||
|
"fuzzt",
|
||||||
|
"levenshtein",
|
||||||
|
"regex",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-ident"
|
||||||
|
version = "1.0.18"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utf8parse"
|
||||||
|
version = "0.2.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-sys"
|
||||||
|
version = "0.59.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
|
||||||
|
dependencies = [
|
||||||
|
"windows-targets",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-targets"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
||||||
|
dependencies = [
|
||||||
|
"windows_aarch64_gnullvm",
|
||||||
|
"windows_aarch64_msvc",
|
||||||
|
"windows_i686_gnu",
|
||||||
|
"windows_i686_gnullvm",
|
||||||
|
"windows_i686_msvc",
|
||||||
|
"windows_x86_64_gnu",
|
||||||
|
"windows_x86_64_gnullvm",
|
||||||
|
"windows_x86_64_msvc",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_aarch64_gnullvm"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_aarch64_msvc"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_gnu"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_gnullvm"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_msvc"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_gnu"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_gnullvm"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_msvc"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
10
Cargo.toml
Normal file
10
Cargo.toml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
[package]
|
||||||
|
name = "text-correction"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
clap = { version = "4.5.41", features = ["derive"] }
|
||||||
|
fuzzt = "0.3.1"
|
||||||
|
levenshtein = "1.0.5"
|
||||||
|
regex = "1.11.1"
|
81
src/lib.rs
Normal file
81
src/lib.rs
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
pub mod utils {
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{self, BufRead, Result};
|
||||||
|
use std::path::Path;
|
||||||
|
use fuzzt::algorithms::{ levenshtein, sorensen_dice};
|
||||||
|
use regex::Regex;
|
||||||
|
|
||||||
|
pub fn correct(word: String, list_path: String) -> String {
|
||||||
|
let mut list_correct_words: Vec<String> = Vec::new();
|
||||||
|
if let Ok(lines) = read_lines(list_path) {
|
||||||
|
lines.map_while(Result::ok).for_each(|line| {
|
||||||
|
list_correct_words.push(line);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
let list_iter = list_correct_words.iter();
|
||||||
|
let mut has_match: bool = false;
|
||||||
|
for correct_word in list_iter.clone() {
|
||||||
|
if word.eq(correct_word) {
|
||||||
|
has_match = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if has_match {
|
||||||
|
return word;
|
||||||
|
} else {
|
||||||
|
let mut closest: &str = "";
|
||||||
|
let mut closest_dist: usize = 10000;
|
||||||
|
let mut closest_dist_sorensen: f64 = 0.0f64;
|
||||||
|
for correct_word in list_iter {
|
||||||
|
//println!("Checking {}, dist: {}, current closest: {}, (damerau: {}, Sørensen-Dice: {}, Jaro-Winkler: {})", correct_word.as_str(), levenshtein(word.as_str(), correct_word.as_str()), closest_dist, damerau_levenshtein(word.as_str(), correct_word.as_str()), sorensen_dice(word.as_str(), correct_word.as_str()), jaro_winkler(word.as_str(), correct_word.as_str()));
|
||||||
|
let dist = levenshtein(word.as_str(), correct_word.as_str());
|
||||||
|
if dist < closest_dist {
|
||||||
|
closest_dist = dist;
|
||||||
|
closest = correct_word;
|
||||||
|
closest_dist_sorensen = sorensen_dice(word.as_str(), correct_word.as_str());
|
||||||
|
} else if dist == closest_dist {
|
||||||
|
if sorensen_dice(word.as_str(), correct_word.as_str()) > closest_dist_sorensen {
|
||||||
|
closest_dist = dist;
|
||||||
|
closest = correct_word;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return String::from(closest);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn correct_file (file_path: String, list_path: String, output_path: String) {
|
||||||
|
let re : Regex = Regex::new(r"[\w]+[\W]+").unwrap();
|
||||||
|
let mut input_file_lines: Vec<String> = Vec::new();
|
||||||
|
if let Ok(lines) = read_lines(file_path) {
|
||||||
|
lines.map_while(Result::ok).for_each(|line| {
|
||||||
|
input_file_lines.push(line);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
let mut input_file_words: Vec<Vec<String>> = Vec::new();
|
||||||
|
for line in input_file_lines {
|
||||||
|
let line_iter = line.split_whitespace();
|
||||||
|
let mut words: Vec<String> = Vec::new();
|
||||||
|
for word in line_iter {
|
||||||
|
|
||||||
|
if re.is_match(word) {
|
||||||
|
let mut word_buffer: String;
|
||||||
|
word_buffer = word.replace(&['(',')', '/', '\"', '\\', '<', '>', '*'], "");
|
||||||
|
word_buffer = word_buffer.replace("ſ", "s");
|
||||||
|
|
||||||
|
|
||||||
|
words.push(word_buffer);
|
||||||
|
} else {
|
||||||
|
words.push(String::from(word));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
input_file_words.push(words);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
|
||||||
|
where P: AsRef<Path>, {
|
||||||
|
let file = File::open(filename)?;
|
||||||
|
Ok(io::BufReader::new(file).lines())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
34
src/main.rs
Normal file
34
src/main.rs
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
use clap::{Args, Parser, Subcommand};
|
||||||
|
use text_correction::utils;
|
||||||
|
|
||||||
|
#[derive(Parser)]
|
||||||
|
#[command(name = "german word corrector")]
|
||||||
|
#[command(version, about, long_about = None)]
|
||||||
|
#[command(next_line_help = true)]
|
||||||
|
struct Cli {
|
||||||
|
#[command(subcommand)]
|
||||||
|
command: Commands
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Subcommand)]
|
||||||
|
enum Commands {
|
||||||
|
CorrectWord(WordArgs)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Args)]
|
||||||
|
struct WordArgs {
|
||||||
|
#[arg(short,long)]
|
||||||
|
input: String,
|
||||||
|
list_path: String
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let cli: Cli = Cli::parse();
|
||||||
|
|
||||||
|
match &cli.command {
|
||||||
|
Commands::CorrectWord(args) => {
|
||||||
|
let out: String = utils::correct(args.input.clone(), args.list_path.clone());
|
||||||
|
println!("{}", out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
1908815
src/wordlist-german.txt
Normal file
1908815
src/wordlist-german.txt
Normal file
File diff suppressed because it is too large
Load Diff
1908816
wordlist-german.txt
Normal file
1908816
wordlist-german.txt
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user