I recently released another R package on CRAN - fuzzywuzzyR - which ports the fuzzywuzzy Python library to R. “fuzzywuzzy does fuzzy string matching by using the Levenshtein Distance to calculate the differences between sequences (of character strings).”
The fuzzywuzzyR package includes R6-classes / functions for string matching,
| FuzzExtract | FuzzMatcher | FuzzUtils | SequenceMatcher |
|---|---|---|---|
| Extract() | Partial_token_set_ratio() | Full_process() | ratio() |
| ExtractBests() | Partial_token_sort_ratio() | Make_type_consistent() | quick_ratio() |
| ExtractWithoutOrder() | Ratio() | Asciidammit() | real_quick_ratio() |
| ExtractOne() | QRATIO() | Asciionly() | get_matching_blocks() |
| | WRATIO() | Validate_string() | get_opcodes() |
| | UWRATIO() | | |
| | UQRATIO() | | |
| | Token_sort_ratio() | | |
| | Partial_ratio() | | |
| | Token_set_ratio() | | |

| GetCloseMatches() |
|---|
The following code chunks / examples are part of the package documentation and give an idea on what can be done with the fuzzywuzzyR package,
Each one of the methods in the FuzzExtract class takes a character string and a character string sequence as input ( except for the Dedupe method which takes a string sequence only ) and given a processor and a scorer it returns one or more string match(es) and the corresponding score ( in the range 0 - 100 ). Information about the additional parameters (limit, score_cutoff and threshold) can be found in the package documentation,
library(fuzzywuzzyR)

# query string and the candidate strings it is matched against
word <- "new york jets"

choices <- c("Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys")

#------------
# processor :
#------------

init_proc <- FuzzUtils$new()      # initialization of FuzzUtils class to choose a processor

PROC <- init_proc$Full_process    # processor-method

PROC1 <- tolower                  # base R function ( as an example for a processor )

#---------
# scorer :
#---------

init_scor <- FuzzMatcher$new()    # initialization of the scorer class

SCOR <- init_scor$WRATIO          # chosen scorer function

init <- FuzzExtract$new()         # initialization of the FuzzExtract class

# returns every choice together with its score
init$Extract(string = word, sequence_strings = choices, processor = PROC, scorer = SCOR)

# example output
#> [[1]]
#> [[1]][[1]]
#> [1] "New York Jets"
#>
#> [[1]][[2]]
#> [1] 100
#>
#>
#> [[2]]
#> [[2]][[1]]
#> [1] "New York Giants"
#>
#> [[2]][[2]]
#> [1] 79
#>
#>
#> [[3]]
#> [[3]][[1]]
#> [1] "Atlanta Falcons"
#>
#> [[3]][[2]]
#> [1] 29
#>
#>
#> [[4]]
#> [[4]][[1]]
#> [1] "Dallas Cowboys"
#>
#> [[4]][[2]]
#> [1] 22
# extracts best matches (limited to 2 matches)
# (uses init, word, choices, PROC1 and SCOR from the first example)

init$ExtractBests(string = word, sequence_strings = choices, processor = PROC1,
                  scorer = SCOR, score_cutoff = 0L, limit = 2L)

#> [[1]]
#> [[1]][[1]]
#> [1] "New York Jets"
#>
#> [[1]][[2]]
#> [1] 100
#>
#>
#> [[2]]
#> [[2]][[1]]
#> [1] "New York Giants"
#>
#> [[2]][[2]]
#> [1] 79
# extracts matches without keeping the output order
# (uses init, word, choices, PROC and SCOR from the first example)

init$ExtractWithoutOrder(string = word, sequence_strings = choices, processor = PROC,
                         scorer = SCOR, score_cutoff = 0L)

#> [[1]]
#> [[1]][[1]]
#> [1] "Atlanta Falcons"
#>
#> [[1]][[2]]
#> [1] 29
#>
#>
#> [[2]]
#> [[2]][[1]]
#> [1] "New York Jets"
#>
#> [[2]][[2]]
#> [1] 100
#>
#>
#> [[3]]
#> [[3]][[1]]
#> [1] "New York Giants"
#>
#> [[3]][[2]]
#> [1] 79
#>
#>
#> [[4]]
#> [[4]][[1]]
#> [1] "Dallas Cowboys"
#>
#> [[4]][[2]]
#> [1] 22
# extracts first result
# (uses init, word, choices, PROC and SCOR from the first example)

init$ExtractOne(string = word, sequence_strings = choices, processor = PROC,
                scorer = SCOR, score_cutoff = 0L)

#> [[1]]
#> [1] "New York Jets"
#>
#> [[2]]
#> [1] 100
The dedupe method removes duplicates from a sequence of character strings using fuzzy string matching,
# character strings containing near-duplicates
duplicat <- c("Frodo Baggins", "Tom Sawyer", "Bilbo Baggin", "Samuel L. Jackson",
              "F. Baggins", "Frody Baggins", "Bilbo Baggins")

# init is the FuzzExtract object and SCOR the scorer from the first example
init$Dedupe(contains_dupes = duplicat, threshold = 70L, scorer = SCOR)

#> [1] "Frodo Baggins"     "Samuel L. Jackson" "Bilbo Baggins"     "Tom Sawyer"
Each one of the methods in the FuzzMatcher class takes two character strings (string1, string2) as input and returns a score ( in range 0 to 100 ). Information about the additional parameters (force_ascii, full_process and threshold) can be found in the package documentation,
# the two strings that every FuzzMatcher method below compares
s1 <- "Atlanta Falcons"

s2 <- "New York Jets"

init <- FuzzMatcher$new()          # initialization of FuzzMatcher class

init$Partial_token_set_ratio(string1 = s1, string2 = s2, force_ascii = TRUE, full_process = TRUE)

# example output
#> [1] 31

init$Partial_token_sort_ratio(string1 = s1, string2 = s2, force_ascii = TRUE, full_process = TRUE)

#> [1] 31

init$Ratio(string1 = s1, string2 = s2)

#> [1] 21

init$QRATIO(string1 = s1, string2 = s2, force_ascii = TRUE)

#> [1] 29

init$WRATIO(string1 = s1, string2 = s2, force_ascii = TRUE)

#> [1] 29

init$UWRATIO(string1 = s1, string2 = s2)

#> [1] 29

init$UQRATIO(string1 = s1, string2 = s2)

#> [1] 29

init$Token_sort_ratio(string1 = s1, string2 = s2, force_ascii = TRUE, full_process = TRUE)

#> [1] 29

init$Partial_ratio(string1 = s1, string2 = s2)

#> [1] 23

init$Token_set_ratio(string1 = s1, string2 = s2, force_ascii = TRUE, full_process = TRUE)

#> [1] 29
The FuzzUtils class includes a number of utility methods, of which the Full_process method is of greater importance, as besides its main functionality it can also be used as a secondary function in some of the other fuzzy matching classes,
# string to pre-process
s1 <- "Frodo Baggins"

init <- FuzzUtils$new()            # initialization of the FuzzUtils class

init$Full_process(string = s1, force_ascii = TRUE)

# example output
#> [1] "frodo baggins"
The GetCloseMatches method returns a list of the best “good enough” matches. The parameter string is a sequence for which close matches are desired (typically a character string), and sequence_strings is a list of sequences against which to match the parameter string (typically a list of strings).
# candidate strings to match against
vec <- c("Frodo Baggins", "Tom Sawyer", "Bilbo Baggin")

# string for which close matches are desired
str1 <- "Fra Bagg"

GetCloseMatches(string = str1, sequence_strings = vec, n = 2L, cutoff = 0.6)

#> [1] "Frodo Baggins"
The SequenceMatcher class is based on difflib, which is installed by default with Python, and includes the following fuzzy string matching methods,
# the two sentences to compare
s1 <- " It was a dark and stormy night. I was all alone sitting on a red chair."

s2 <- " It was a murky and stormy night. I was all alone sitting on a crimson chair."

# the matcher is built once for the pair; the ratio methods take no arguments
init <- SequenceMatcher$new(string1 = s1, string2 = s2)

init$ratio()

#> [1] 0.9127517

init$quick_ratio()

#> [1] 0.9127517

init$real_quick_ratio()

#> [1] 0.966443
The get_matching_blocks and get_opcodes methods return triples and 5-tuples describing matching subsequences. More information can be found in the documentation of Python’s difflib module and in the fuzzywuzzyR package documentation.
A last thing to note here is that the mentioned fuzzy string matching classes can be parallelized using the base R parallel package. For instance, the following MCLAPPLY_RATIOS function can take two vectors of character strings (QUERY1, QUERY2) and return the scores for each method of the FuzzMatcher class,
# Score element-wise pairs of QUERY1 and QUERY2 with one method of a fuzzy-matching class.
#
# QUERY1, QUERY2 : vectors (or lists) of character strings; element i of QUERY1 is
#                  compared with element i of QUERY2
# class_fuzz     : name of an object exposing a `new()` constructor (e.g. 'FuzzMatcher')
# method_fuzz    : name of the method of the constructed object used for scoring
# threads        : 1 runs sequentially via lapply(); any other value is passed to
#                  parallel::mclapply() as mc.cores (forking, so mc.cores > 1 is not
#                  supported on Windows)
# ...            : further arguments forwarded to the scoring method
#
# Returns a list with one score per pair (an empty list for empty input).
MCLAPPLY_RATIOS <- function(QUERY1, QUERY2, class_fuzz = 'FuzzMatcher', method_fuzz = 'QRATIO', threads = 1, ...) {

  # look the class up by name and instantiate it (no eval(parse()) of pasted code)
  init <- get(class_fuzz)$new()

  # resolve the scoring method once instead of once per pair
  METHOD <- init[[method_fuzz]]

  if (!is.function(METHOD)) {
    stop("'", method_fuzz, "' is not a method of class '", class_fuzz, "'", call. = FALSE)
  }

  score_pair <- function(x) METHOD(QUERY1[[x]], QUERY2[[x]], ...)

  # seq_along() yields an empty sequence for empty input, whereas 1:length() gives c(1, 0)
  idx <- seq_along(QUERY1)

  if (threads == 1) {
    res_qrat <- lapply(idx, score_pair)
  } else {
    res_qrat <- parallel::mclapply(idx, score_pair, mc.cores = threads)
  }

  res_qrat
}
# two equally long vectors; element i of query1 is scored against element i of query2
query1 <- c("word1", "word2", "word3")

query2 <- c("similarword1", "similar_word2", "similarwor")

quer_res <- MCLAPPLY_RATIOS(query1, query2, class_fuzz = "FuzzMatcher", method_fuzz = "QRATIO", threads = 1)

# the function returns a list; flatten it into a vector of scores
unlist(quer_res)

# example output
#> [1] 59 56 40