Use the Jaro-Winkler algorithm for title similarity matching

This commit is contained in:
Georges-Antoine Assi
2025-08-08 16:54:26 -04:00
parent 1a2944806b
commit 659a3eb104
8 changed files with 31 additions and 77 deletions

View File

@@ -3,19 +3,21 @@ import json
import os
import re
import unicodedata
from difflib import SequenceMatcher
from functools import lru_cache
from itertools import batched
from typing import Final, NotRequired, TypedDict
from handler.redis_handler import async_cache, sync_cache
from logger.logger import log
from strsimpy.jaro_winkler import JaroWinkler
from tasks.scheduled.update_switch_titledb import (
SWITCH_PRODUCT_ID_KEY,
SWITCH_TITLEDB_INDEX_KEY,
update_switch_titledb_task,
)
jarowinkler = JaroWinkler()
def conditionally_set_cache(
index_key: str, filename: str, parent_dir: str = os.path.dirname(__file__)
@@ -133,70 +135,6 @@ class MetadataHandler:
) -> str:
return _normalize_search_term(name, remove_articles, remove_punctuation)
def calculate_text_similarity(
    self,
    normalized_search_term: str,
    game_name: str,
    remove_articles: bool = True,
    remove_punctuation: bool = True,
) -> float:
    """Score how closely a game name matches an already-normalized search term.

    The game name is normalized with ``self.normalize_search_term`` and then
    compared against ``normalized_search_term`` using a weighted blend of
    three metrics: word-token overlap (weight 0.5), ``SequenceMatcher`` ratio
    (weight 0.3) and a length-normalized ``wagner_fischer_distance``
    (weight 0.2).

    Args:
        normalized_search_term: The search term, expected to be normalized
            by the caller; it is compared as-is.
        game_name: The raw game name to compare against; normalized here.
        remove_articles: Forwarded to ``normalize_search_term`` for the
            game name.
        remove_punctuation: Forwarded to ``normalize_search_term`` for the
            game name.

    Returns:
        Similarity score between 0 and 1, where 1 is a perfect match.
    """
    candidate = self.normalize_search_term(
        game_name,
        remove_articles=remove_articles,
        remove_punctuation=remove_punctuation,
    )

    # Identical normalized strings short-circuit to the top score.
    if candidate == normalized_search_term:
        return 1.0

    # Word-level comparison: shared tokens over all distinct tokens.
    query_words = set(WORD_TOKEN_PATTERN.findall(normalized_search_term.lower()))
    candidate_words = set(WORD_TOKEN_PATTERN.findall(candidate.lower()))
    token_overlap_ratio = 0.0
    if query_words and candidate_words:
        shared = query_words & candidate_words
        combined = query_words | candidate_words
        token_overlap_ratio = len(shared) / len(combined)

    # Character-sequence comparison over the full normalized strings.
    matcher = SequenceMatcher(None, normalized_search_term, candidate)
    sequence_ratio = matcher.ratio()

    # Distance scaled by the longer string, flipped so higher means closer.
    longest = max(len(normalized_search_term), len(candidate))
    if longest > 0:
        distance = wagner_fischer_distance(normalized_search_term, candidate)
        wagner_fischer_ratio = 1 - (distance / longest)
    else:
        wagner_fischer_ratio = 1.0

    # Token overlap carries the most weight for game titles.
    return (
        token_overlap_ratio * 0.5
        + sequence_ratio * 0.3
        + wagner_fischer_ratio * 0.2
    )
def find_best_match(
self,
normalized_search_term: str,
@@ -223,12 +161,18 @@ class MetadataHandler:
best_score = 0.0
for game_name in game_names:
score = self.calculate_text_similarity(
normalized_search_term,
# Normalize the candidate name with the same options the caller passed so
# it is compared against the normalized search term on equal footing.
game_normalized = self.normalize_search_term(
    game_name,
    remove_articles=remove_articles,
    remove_punctuation=remove_punctuation,
)
# Reuse the module-level `jarowinkler` instance instead of building a new
# JaroWinkler object for every candidate name in the loop.
score = jarowinkler.similarity(normalized_search_term, game_normalized)
if score > best_score:
best_score = score
best_match = game_name

View File

@@ -223,7 +223,6 @@ class IGDBHandler(MetadataHandler):
"Client-ID": IGDB_CLIENT_ID,
"Accept": "application/json",
}
self.min_similarity_score: Final = 0.75
@staticmethod
def check_twitch_token(func):
@@ -343,7 +342,6 @@ class IGDBHandler(MetadataHandler):
best_match, best_score = self.find_best_match(
search_term,
list(games_by_name.keys()),
min_similarity_score=self.min_similarity_score,
remove_punctuation=False,
)
if best_match:
@@ -372,7 +370,6 @@ class IGDBHandler(MetadataHandler):
best_match, best_score = self.find_best_match(
search_term,
list(games_by_name.keys()),
min_similarity_score=self.min_similarity_score,
remove_punctuation=False,
)
if best_match:

View File

@@ -77,7 +77,6 @@ def extract_metadata_from_moby_rom(rom: MobyGame) -> MobyMetadata:
class MobyGamesHandler(MetadataHandler):
def __init__(self) -> None:
self.moby_service = MobyGamesService()
self.min_similarity_score: Final = 0.75
async def _search_rom(
self, search_term: str, platform_moby_id: int
@@ -96,7 +95,6 @@ class MobyGamesHandler(MetadataHandler):
best_match, best_score = self.find_best_match(
search_term,
list(games_by_name.keys()),
min_similarity_score=self.min_similarity_score,
remove_punctuation=False,
)
if best_match:

View File

@@ -31,7 +31,7 @@ class SGDBRom(TypedDict):
class SGDBBaseHandler(MetadataHandler):
def __init__(self) -> None:
self.sgdb_service = SteamGridDBService()
self.min_similarity_score: Final = 0.75
self.min_similarity_score: Final = 0.95
async def get_details(self, search_term: str) -> list[SGDBResult]:
if not STEAMGRIDDB_API_ENABLED:

View File

@@ -278,7 +278,6 @@ def extract_metadata_from_ss_rom(rom: SSGame) -> SSMetadata:
class SSHandler(MetadataHandler):
def __init__(self) -> None:
self.ss_service = ScreenScraperService()
self.min_similarity_score: Final = 0.75
async def _search_rom(self, search_term: str, platform_ss_id: int) -> SSGame | None:
if not platform_ss_id:
@@ -296,7 +295,6 @@ class SSHandler(MetadataHandler):
best_match, best_score = self.find_best_match(
search_term,
list(games_by_name.keys()),
min_similarity_score=self.min_similarity_score,
remove_punctuation=False,
)
if best_match:

View File

@@ -159,7 +159,12 @@ const hashMatches = computed(() => {
<v-img src="/assets/scrappers/moby.png" />
</v-avatar>
<span>{{ rom.moby_id }}</span>
<template v-if="rom.moby_metadata?.moby_score">
<template
v-if="
rom.moby_metadata?.moby_score &&
rom.moby_metadata.moby_score !== 'None'
"
>
<v-divider class="mx-2 border-opacity-25" vertical />
<span>{{
(parseFloat(rom.moby_metadata.moby_score) * 10).toFixed(2)

View File

@@ -41,6 +41,7 @@ dependencies = [
"sentry-sdk ~= 2.32",
"starlette-csrf ~= 3.0",
"streaming-form-data ~= 1.19",
"strsimpy>=0.2.1",
"types-colorama ~= 0.4",
"types-passlib ~= 1.7",
"types-pyyaml ~= 6.0",

13
uv.lock generated
View File

@@ -1,5 +1,5 @@
version = 1
revision = 3
revision = 2
requires-python = ">=3.13"
resolution-markers = [
"platform_python_implementation != 'PyPy'",
@@ -1715,6 +1715,7 @@ dependencies = [
{ name = "sqlalchemy", extra = ["mariadb-connector", "mysql-connector", "postgresql-psycopg"] },
{ name = "starlette-csrf" },
{ name = "streaming-form-data" },
{ name = "strsimpy" },
{ name = "types-colorama" },
{ name = "types-passlib" },
{ name = "types-pyyaml" },
@@ -1787,6 +1788,7 @@ requires-dist = [
{ name = "sqlalchemy", extras = ["mariadb-connector", "mysql-connector", "postgresql-psycopg"], specifier = "~=2.0" },
{ name = "starlette-csrf", specifier = "~=3.0" },
{ name = "streaming-form-data", specifier = "~=1.19" },
{ name = "strsimpy", specifier = ">=0.2.1" },
{ name = "types-colorama", specifier = "~=0.4" },
{ name = "types-passlib", specifier = "~=1.7" },
{ name = "types-pyyaml", specifier = "~=6.0" },
@@ -2001,6 +2003,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/5d/53/a709d8925a0e48bc4904f12e1f619b0295042c06d66aacaa213f7a18a927/streaming_form_data-1.19.1-cp313-cp313-win_amd64.whl", hash = "sha256:e2dee016f1db735cd91e97421340cd3799f9fd46b1e39e4a11d6215c7cbe1edc", size = 201927, upload-time = "2025-01-10T18:33:07.6Z" },
]
[[package]]
name = "strsimpy"
version = "0.2.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0d/7e/5ccf2edfa1e97154dbf3119fd240b1f5fbe32ad1edd1db5f7a94d3f7a037/strsimpy-0.2.1.tar.gz", hash = "sha256:0842eb57f7af86c882a59a1bc8721ec2580a267e563fd0503ced2972040372c9", size = 24403, upload-time = "2021-09-10T09:14:20.405Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/fc/90/bd55a4b18f4b75a76e32f444975d2c869d692eb23897d116d47122f88d1a/strsimpy-0.2.1-py3-none-any.whl", hash = "sha256:d676a440d5d3dbcf5ba92d01814a03a218776ce07bd7a8185da7019e04cf9ba7", size = 45870, upload-time = "2021-09-10T09:14:18.944Z" },
]
[[package]]
name = "texttable"
version = "1.7.0"