initial commit
This commit is contained in:
81
src/lib.rs
Normal file
81
src/lib.rs
Normal file
@@ -0,0 +1,81 @@
|
||||
pub mod utils {
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufRead, Result};
|
||||
use std::path::Path;
|
||||
use fuzzt::algorithms::{ levenshtein, sorensen_dice};
|
||||
use regex::Regex;
|
||||
|
||||
pub fn correct(word: String, list_path: String) -> String {
|
||||
let mut list_correct_words: Vec<String> = Vec::new();
|
||||
if let Ok(lines) = read_lines(list_path) {
|
||||
lines.map_while(Result::ok).for_each(|line| {
|
||||
list_correct_words.push(line);
|
||||
});
|
||||
}
|
||||
let list_iter = list_correct_words.iter();
|
||||
let mut has_match: bool = false;
|
||||
for correct_word in list_iter.clone() {
|
||||
if word.eq(correct_word) {
|
||||
has_match = true;
|
||||
}
|
||||
}
|
||||
if has_match {
|
||||
return word;
|
||||
} else {
|
||||
let mut closest: &str = "";
|
||||
let mut closest_dist: usize = 10000;
|
||||
let mut closest_dist_sorensen: f64 = 0.0f64;
|
||||
for correct_word in list_iter {
|
||||
//println!("Checking {}, dist: {}, current closest: {}, (damerau: {}, Sørensen-Dice: {}, Jaro-Winkler: {})", correct_word.as_str(), levenshtein(word.as_str(), correct_word.as_str()), closest_dist, damerau_levenshtein(word.as_str(), correct_word.as_str()), sorensen_dice(word.as_str(), correct_word.as_str()), jaro_winkler(word.as_str(), correct_word.as_str()));
|
||||
let dist = levenshtein(word.as_str(), correct_word.as_str());
|
||||
if dist < closest_dist {
|
||||
closest_dist = dist;
|
||||
closest = correct_word;
|
||||
closest_dist_sorensen = sorensen_dice(word.as_str(), correct_word.as_str());
|
||||
} else if dist == closest_dist {
|
||||
if sorensen_dice(word.as_str(), correct_word.as_str()) > closest_dist_sorensen {
|
||||
closest_dist = dist;
|
||||
closest = correct_word;
|
||||
}
|
||||
}
|
||||
}
|
||||
return String::from(closest);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn correct_file (file_path: String, list_path: String, output_path: String) {
|
||||
let re : Regex = Regex::new(r"[\w]+[\W]+").unwrap();
|
||||
let mut input_file_lines: Vec<String> = Vec::new();
|
||||
if let Ok(lines) = read_lines(file_path) {
|
||||
lines.map_while(Result::ok).for_each(|line| {
|
||||
input_file_lines.push(line);
|
||||
});
|
||||
}
|
||||
let mut input_file_words: Vec<Vec<String>> = Vec::new();
|
||||
for line in input_file_lines {
|
||||
let line_iter = line.split_whitespace();
|
||||
let mut words: Vec<String> = Vec::new();
|
||||
for word in line_iter {
|
||||
|
||||
if re.is_match(word) {
|
||||
let mut word_buffer: String;
|
||||
word_buffer = word.replace(&['(',')', '/', '\"', '\\', '<', '>', '*'], "");
|
||||
word_buffer = word_buffer.replace("ſ", "s");
|
||||
|
||||
|
||||
words.push(word_buffer);
|
||||
} else {
|
||||
words.push(String::from(word));
|
||||
}
|
||||
}
|
||||
input_file_words.push(words);
|
||||
}
|
||||
}
|
||||
|
||||
/// Opens `filename` and returns a buffered iterator over its lines.
/// Propagates the `io::Error` if the file cannot be opened.
fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
where P: AsRef<Path>, {
    // map the opened handle straight into a buffered line iterator
    File::open(filename).map(|file| io::BufReader::new(file).lines())
}
|
||||
}
|
||||
|
34
src/main.rs
Normal file
34
src/main.rs
Normal file
@@ -0,0 +1,34 @@
|
||||
use clap::{Args, Parser, Subcommand};
|
||||
use text_correction::utils;
|
||||
|
||||
/// Top-level command-line interface for the German word corrector.
/// Parsed by clap's derive API; help text for each option is rendered
/// on its own line (`next_line_help`).
#[derive(Parser)]
#[command(name = "german word corrector")]
#[command(version, about, long_about = None)]
#[command(next_line_help = true)]
struct Cli {
    // The single required subcommand selecting what to do.
    #[command(subcommand)]
    command: Commands
}
|
||||
|
||||
/// Subcommands understood by the CLI.
#[derive(Subcommand)]
enum Commands {
    // Correct a single word against a word list (see `WordArgs`).
    CorrectWord(WordArgs)
}
|
||||
|
||||
/// Arguments for the `correct-word` subcommand.
#[derive(Args)]
struct WordArgs {
    // The word to correct (flag: -i / --input).
    #[arg(short,long)]
    input: String,
    // Path to the word-list file.
    // NOTE(review): no #[arg] attribute here, so clap treats this as a
    // positional argument — inconsistent with `input`; confirm intended.
    list_path: String
}
|
||||
|
||||
fn main() {
|
||||
let cli: Cli = Cli::parse();
|
||||
|
||||
match &cli.command {
|
||||
Commands::CorrectWord(args) => {
|
||||
let out: String = utils::correct(args.input.clone(), args.list_path.clone());
|
||||
println!("{}", out);
|
||||
}
|
||||
}
|
||||
}
|
1908815
src/wordlist-german.txt
Normal file
1908815
src/wordlist-german.txt
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user