Always use most accurate way of fuzzy matching, greatly improve performance of fuzzy matching

This commit is contained in:
Aaron Wood 2024-01-31 18:59:23 -08:00
parent fb5200e4d2
commit 10d2079f69
2 changed files with 15 additions and 89 deletions

View file

@ -1,5 +1,5 @@
#!/usr/bin/with-contenv bash #!/usr/bin/with-contenv bash
scriptVersion="2.30" scriptVersion="2.31"
scriptName="Audio" scriptName="Audio"
### Import Settings ### Import Settings
@ -1269,7 +1269,7 @@ SearchProcess () {
releaseProcessCount=$(( $releaseProcessCount + 1)) releaseProcessCount=$(( $releaseProcessCount + 1))
lidarrReleaseTitle="${lidarrReleaseTitles[$title]}" lidarrReleaseTitle="${lidarrReleaseTitles[$title]}"
lidarrAlbumReleaseTitleClean=$(echo "$lidarrReleaseTitle" | sed -e "s%[^[:alpha:][:digit:]]%%g" -e "s/ */ /g" | sed 's/^[.]*//' | sed 's/[.]*$//g' | sed 's/^ *//g' | sed 's/ *$//g') lidarrAlbumReleaseTitleClean=$(echo "$lidarrReleaseTitle" | sed -e "s%[^[:alpha:][:digit:]]%%g" -e "s/ */ /g" | sed 's/^[.]*//' | sed 's/[.]*$//g' | sed 's/^ *//g' | sed 's/ *$//g')
lidarrAlbumReleaseTitleClean="${lidarrAlbumReleaseTitleClean:0:130}" lidarrAlbumReleaseTitleClean="${lidarrAlbumReleaseTitleClean:0:130}"
lidarrAlbumReleaseTitleSearchClean="$(echo "$lidarrReleaseTitle" | sed -e "s%[^[:alpha:][:digit:]]% %g" -e "s/ */ /g" | sed 's/^[.]*//' | sed 's/[.]*$//g' | sed 's/^ *//g' | sed 's/ *$//g')" lidarrAlbumReleaseTitleSearchClean="$(echo "$lidarrReleaseTitle" | sed -e "s%[^[:alpha:][:digit:]]% %g" -e "s/ */ /g" | sed 's/^[.]*//' | sed 's/[.]*$//g' | sed 's/^ *//g' | sed 's/ *$//g')"
lidarrAlbumReleaseTitleFirstWord="$(echo "$lidarrReleaseTitle" | awk '{ print $1 }')" lidarrAlbumReleaseTitleFirstWord="$(echo "$lidarrReleaseTitle" | awk '{ print $1 }')"
lidarrAlbumReleaseTitleFirstWord="${lidarrAlbumReleaseTitleFirstWord:0:3}" lidarrAlbumReleaseTitleFirstWord="${lidarrAlbumReleaseTitleFirstWord:0:3}"
@ -1433,15 +1433,6 @@ ArtistDeezerSearch () {
deezerAlbumTitle="$(echo "$deezerAlbumData" | jq -r ".title")" deezerAlbumTitle="$(echo "$deezerAlbumData" | jq -r ".title")"
deezerAlbumTitleClean="$(echo ${deezerAlbumTitle} | sed -e "s%[^[:alpha:][:digit:]]%%g" -e "s/ */ /g" | sed 's/^[.]*//' | sed 's/[.]*$//g' | sed 's/^ *//g' | sed 's/ *$//g')" deezerAlbumTitleClean="$(echo ${deezerAlbumTitle} | sed -e "s%[^[:alpha:][:digit:]]%%g" -e "s/ */ /g" | sed 's/^[.]*//' | sed 's/[.]*$//g' | sed 's/^ *//g' | sed 's/ *$//g')"
deezerAlbumTitleClean="${deezerAlbumTitleClean:0:130}" deezerAlbumTitleClean="${deezerAlbumTitleClean:0:130}"
# String Character Count test, quicker than the levenshtein method to allow faster processing
characterMath=$(( ${#deezerAlbumTitleClean} - ${#lidarrAlbumReleaseTitleClean} ))
if [ "$characterMath" -gt "$matchDistance" ]; then
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Deezer :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $deezerAlbumTitleClean :: Not a match..."
continue
elif [ "$characterMath" -lt "0" ]; then
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Deezer :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $deezerAlbumTitleClean :: Not a match..."
continue
fi
GetDeezerAlbumInfo "$deezerAlbumID" GetDeezerAlbumInfo "$deezerAlbumID"
deezerAlbumData="$(cat "/config/extended/cache/deezer/$deezerAlbumID.json")" deezerAlbumData="$(cat "/config/extended/cache/deezer/$deezerAlbumID.json")"
deezerAlbumTrackCount="$(echo "$deezerAlbumData" | jq -r .nb_tracks)" deezerAlbumTrackCount="$(echo "$deezerAlbumData" | jq -r .nb_tracks)"
@ -1460,8 +1451,8 @@ ArtistDeezerSearch () {
fi fi
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Deezer :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $deezerAlbumTitleClean :: Checking for Match..." log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Deezer :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $deezerAlbumTitleClean :: Checking for Match..."
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Deezer :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $deezerAlbumTitleClean :: Calculating Similarity..." log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Deezer :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $deezerAlbumTitleClean :: Calculating Damerau-Levenshtein distance..."
diff=$(levenshtein "${lidarrAlbumReleaseTitleClean,,}" "${deezerAlbumTitleClean,,}" 2>/dev/null) diff=$(python -c "from pyxdameraulevenshtein import damerau_levenshtein_distance; print(damerau_levenshtein_distance(\"${lidarrAlbumReleaseTitleClean,,}\", \"${deezerAlbumTitleClean,,}\"))" 2>/dev/null)
if [ "$diff" -le "$matchDistance" ]; then if [ "$diff" -le "$matchDistance" ]; then
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Deezer :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $deezerAlbumTitleClean :: Deezer MATCH Found :: Calculated Difference = $diff" log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Deezer :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $deezerAlbumTitleClean :: Deezer MATCH Found :: Calculated Difference = $diff"
@ -1481,7 +1472,7 @@ ArtistDeezerSearch () {
FuzzyDeezerSearch () { FuzzyDeezerSearch () {
# Required Inputs # Required Inputs
# $1 Process ID # $1 Process ID
# $3 Lyric Type (explicit = true, clean = false) # $2 Lyric Type (explicit = true, clean = false)
if [ "$2" == "true" ]; then if [ "$2" == "true" ]; then
type="Explicit" type="Explicit"
@ -1512,13 +1503,6 @@ FuzzyDeezerSearch () {
deezerAlbumTitle="$(echo "$deezerAlbumTitle" | head -n1)" deezerAlbumTitle="$(echo "$deezerAlbumTitle" | head -n1)"
deezerAlbumTitleClean="$(echo "$deezerAlbumTitle" | sed -e "s%[^[:alpha:][:digit:]]%%g" -e "s/ */ /g" | sed 's/^[.]*//' | sed 's/[.]*$//g' | sed 's/^ *//g' | sed 's/ *$//g')" deezerAlbumTitleClean="$(echo "$deezerAlbumTitle" | sed -e "s%[^[:alpha:][:digit:]]%%g" -e "s/ */ /g" | sed 's/^[.]*//' | sed 's/[.]*$//g' | sed 's/^ *//g' | sed 's/ *$//g')"
deezerAlbumTitleClean="${deezerAlbumTitleClean:0:130}" deezerAlbumTitleClean="${deezerAlbumTitleClean:0:130}"
# String Character Count test, quicker than the levenshtein method to allow faster processing
characterMath=$(( ${#deezerAlbumTitleClean} - ${#lidarrAlbumReleaseTitleClean} ))
if [ "$characterMath" -gt "$matchDistance" ]; then
continue
elif [ "$characterMath" -lt "0" ]; then
continue
fi
GetDeezerAlbumInfo "${deezerAlbumID}" GetDeezerAlbumInfo "${deezerAlbumID}"
deezerAlbumData="$(cat "/config/extended/cache/deezer/$deezerAlbumID.json")" deezerAlbumData="$(cat "/config/extended/cache/deezer/$deezerAlbumID.json")"
@ -1542,8 +1526,8 @@ FuzzyDeezerSearch () {
fi fi
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Deezer :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $deezerAlbumTitleClean :: Checking for Match..." log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Deezer :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $deezerAlbumTitleClean :: Checking for Match..."
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Deezer :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $deezerAlbumTitleClean :: Calculating Similarity..." log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Deezer :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $deezerAlbumTitleClean :: Calculating Damerau-Levenshtein distance..."
diff=$(levenshtein "${lidarrAlbumReleaseTitleClean,,}" "${deezerAlbumTitleClean,,}" 2>/dev/null) diff=$(python -c "from pyxdameraulevenshtein import damerau_levenshtein_distance; print(damerau_levenshtein_distance(\"${lidarrAlbumReleaseTitleClean,,}\", \"${deezerAlbumTitleClean,,}\"))" 2>/dev/null)
if [ "$diff" -le "$matchDistance" ]; then if [ "$diff" -le "$matchDistance" ]; then
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Deezer :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $deezerAlbumTitleClean :: Deezer MATCH Found :: Calculated Difference = $diff" log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Deezer :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $deezerAlbumTitleClean :: Deezer MATCH Found :: Calculated Difference = $diff"
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Deezer :: $type :: $lidarrReleaseTitle :: Downloading $deezerAlbumTrackCount Tracks :: $deezerAlbumTitle ($downloadedReleaseYear)" log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Deezer :: $type :: $lidarrReleaseTitle :: Downloading $deezerAlbumTrackCount Tracks :: $deezerAlbumTitle ($downloadedReleaseYear)"
@ -1611,19 +1595,9 @@ ArtistTidalSearch () {
downloadedReleaseYear="${downloadedReleaseDate:0:4}" downloadedReleaseYear="${downloadedReleaseDate:0:4}"
downloadedTrackCount=$(echo "$tidalArtistAlbumData"| jq -r .numberOfTracks) downloadedTrackCount=$(echo "$tidalArtistAlbumData"| jq -r .numberOfTracks)
# String Character Count test, quicker than the levenshtein method to allow faster processing
characterMath=$(( ${#tidalAlbumTitleClean} - ${#lidarrAlbumReleaseTitleClean} ))
if [ "$characterMath" -gt "$matchDistance" ]; then
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Not a match..."
continue
elif [ "$characterMath" -lt "0" ]; then
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Not a match..."
continue
fi
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Checking for Match..." log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Checking for Match..."
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Calculating Similarity..." log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Calculating Damerau-Levenshtein distance..."
diff=$(levenshtein "${lidarrAlbumReleaseTitleClean,,}" "${tidalAlbumTitleClean,,}" 2>/dev/null) diff=$(python -c "from pyxdameraulevenshtein import damerau_levenshtein_distance; print(damerau_levenshtein_distance(\"${lidarrAlbumReleaseTitleClean,,}\", \"${tidalAlbumTitleClean,,}\"))" 2>/dev/null)
if [ "$diff" -le "$matchDistance" ]; then if [ "$diff" -le "$matchDistance" ]; then
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Tidal MATCH Found :: Calculated Difference = $diff" log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Artist Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Tidal MATCH Found :: Calculated Difference = $diff"
@ -1679,19 +1653,9 @@ FuzzyTidalSearch () {
downloadedReleaseYear="${downloadedReleaseDate:0:4}" downloadedReleaseYear="${downloadedReleaseDate:0:4}"
downloadedTrackCount=$(echo "$tidalAlbumData"| jq -r .numberOfTracks) downloadedTrackCount=$(echo "$tidalAlbumData"| jq -r .numberOfTracks)
# String Character Count test, quicker than the levenshtein method to allow faster processing
characterMath=$(( ${#tidalAlbumTitleClean} - ${#lidarrAlbumReleaseTitleClean} ))
if [ "$characterMath" -gt "$matchDistance" ]; then
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Not a match..."
continue
elif [ "$characterMath" -lt "0" ]; then
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Not a match..."
continue
fi
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Checking for Match..." log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Checking for Match..."
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Calculating Similarity..." log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Calculating Damerau-Levenshtein distance..."
diff=$(levenshtein "${lidarrAlbumReleaseTitleClean,,}" "${tidalAlbumTitleClean,,}" 2>/dev/null) diff=$(python -c "from pyxdameraulevenshtein import damerau_levenshtein_distance; print(damerau_levenshtein_distance(\"${lidarrAlbumReleaseTitleClean,,}\", \"${tidalAlbumTitleClean,,}\"))" 2>/dev/null)
if [ "$diff" -le "$matchDistance" ]; then if [ "$diff" -le "$matchDistance" ]; then
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Tidal MATCH Found :: Calculated Difference = $diff" log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Tidal :: $type :: $lidarrReleaseTitle :: $lidarrAlbumReleaseTitleClean vs $tidalAlbumTitleClean :: Tidal MATCH Found :: Calculated Difference = $diff"
log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Tidal :: $type :: $lidarrReleaseTitle :: Downloading $downloadedTrackCount Tracks :: $tidalAlbumTitle ($downloadedReleaseYear)" log "$1 :: $lidarrArtistName :: $lidarrAlbumTitle :: $lidarrAlbumType :: Fuzzy Search :: Tidal :: $type :: $lidarrReleaseTitle :: Downloading $downloadedTrackCount Tracks :: $tidalAlbumTitle ($downloadedReleaseYear)"
@ -1787,45 +1751,6 @@ LidarrMissingAlbumSearch () {
done done
} }
# levenshtein WORD1 WORD2
#
# Print the Levenshtein edit distance (minimum number of single-character
# insertions, deletions and substitutions) between WORD1 and WORD2.
#
# Arguments:
#   $1  first string  (may be empty)
#   $2  second string (may be empty)
# Outputs:
#   The distance as a decimal integer on stdout.
# Returns:
#   0 on success, 2 (with a usage message on stderr) when not called with
#   exactly two arguments.
#
# The table is computed with two rolling rows and pure shell arithmetic:
# every cell is addressed through a single, consistent index, and no external
# process is spawned per cell.
levenshtein () {
	if (( $# != 2 )); then
		echo "Usage: $0 word1 word2" >&2
		return 2
	fi

	local word1="$1"
	local word2="$2"

	# Identical strings need no table at all.
	if [ "$word1" == "$word2" ]; then
		echo 0
		return 0
	fi

	local len1=${#word1}
	local len2=${#word2}
	local i j cost del ins alt best
	local -a prev=()
	local -a curr=()

	# Row 0: turning the first i characters of word1 into "" costs i deletions.
	for (( i = 0; i <= len1; i++ )); do
		prev[i]=$i
	done

	for (( j = 1; j <= len2; j++ )); do
		# Column 0: turning "" into the first j characters of word2 costs j insertions.
		curr=("$j")
		for (( i = 1; i <= len1; i++ )); do
			if [ "${word1:i-1:1}" == "${word2:j-1:1}" ]; then
				cost=0
			else
				cost=1
			fi
			del=$(( curr[i-1] + 1 ))
			ins=$(( prev[i] + 1 ))
			alt=$(( prev[i-1] + cost ))
			# Minimum of the three candidates, without forking sort/head.
			best=$del
			if (( ins < best )); then
				best=$ins
			fi
			if (( alt < best )); then
				best=$alt
			fi
			curr[i]=$best
		done
		prev=("${curr[@]}")
	done

	echo "${prev[len1]}"
}
audioFlacVerification () { audioFlacVerification () {
# Test Flac File for errors # Test Flac File for errors
# $1 File for verification # $1 File for verification

View file

@ -1,6 +1,6 @@
#!/usr/bin/with-contenv bash #!/usr/bin/with-contenv bash
SMA_PATH="/usr/local/sma" SMA_PATH="/usr/local/sma"
version="1.0" version="1.1"
echo "*** install packages ***" && \ echo "*** install packages ***" && \
apk add -U --upgrade --no-cache \ apk add -U --upgrade --no-cache \
@ -27,6 +27,7 @@ echo "*** install python packages ***" && \
pip install --upgrade --no-cache-dir \ pip install --upgrade --no-cache-dir \
beets \ beets \
yq \ yq \
pyxDamerauLevenshtein \
pyacoustid \ pyacoustid \
requests \ requests \
pylast \ pylast \