From 659a3eb104fdfa892633b0536a758fe0b056f021 Mon Sep 17 00:00:00 2001 From: Georges-Antoine Assi Date: Fri, 8 Aug 2025 16:54:26 -0400 Subject: [PATCH] use jaro-winkler algo --- backend/handler/metadata/base_hander.py | 78 ++++------------------- backend/handler/metadata/igdb_handler.py | 3 - backend/handler/metadata/moby_handler.py | 2 - backend/handler/metadata/sgdb_handler.py | 2 +- backend/handler/metadata/ss_handler.py | 2 - frontend/src/components/Details/Title.vue | 7 +- pyproject.toml | 1 + uv.lock | 13 +++- 8 files changed, 31 insertions(+), 77 deletions(-) diff --git a/backend/handler/metadata/base_hander.py b/backend/handler/metadata/base_hander.py index 3618ab6ee..d805643e6 100644 --- a/backend/handler/metadata/base_hander.py +++ b/backend/handler/metadata/base_hander.py @@ -3,19 +3,21 @@ import json import os import re import unicodedata -from difflib import SequenceMatcher from functools import lru_cache from itertools import batched from typing import Final, NotRequired, TypedDict from handler.redis_handler import async_cache, sync_cache from logger.logger import log +from strsimpy.jaro_winkler import JaroWinkler from tasks.scheduled.update_switch_titledb import ( SWITCH_PRODUCT_ID_KEY, SWITCH_TITLEDB_INDEX_KEY, update_switch_titledb_task, ) +jarowinkler = JaroWinkler() + def conditionally_set_cache( index_key: str, filename: str, parent_dir: str = os.path.dirname(__file__) @@ -133,70 +135,6 @@ class MetadataHandler: ) -> str: return _normalize_search_term(name, remove_articles, remove_punctuation) - def calculate_text_similarity( - self, - normalized_search_term: str, - game_name: str, - remove_articles: bool = True, - remove_punctuation: bool = True, - ) -> float: - """ - Calculate similarity between search term and game name using multiple metrics. - Returns a score between 0 and 1, where 1 is a perfect match. 
- - Args: - search_term: The search term to compare - game_name: The game name to compare against - - Returns: - Similarity score between 0 and 1 - """ - game_normalized = self.normalize_search_term( - game_name, - remove_articles=remove_articles, - remove_punctuation=remove_punctuation, - ) - - # Exact match gets the highest score - if normalized_search_term == game_normalized: - return 1.0 - - # Split into tokens for word-based matching - search_tokens = set(WORD_TOKEN_PATTERN.findall(normalized_search_term.lower())) - game_tokens = set(WORD_TOKEN_PATTERN.findall(game_normalized.lower())) - - # Calculate token overlap ratio - if search_tokens and game_tokens: - intersection = search_tokens & game_tokens - union = search_tokens | game_tokens - token_overlap_ratio = len(intersection) / len(union) - else: - token_overlap_ratio = 0.0 - - # Calculate sequence similarity (better for longer strings) - sequence_ratio = SequenceMatcher( - None, normalized_search_term, game_normalized - ).ratio() - - # Calculate Wagner-Fischer distance (normalized by max length) - max_len = max(len(normalized_search_term), len(game_normalized)) - if max_len > 0: - wagner_fischer_ratio = 1 - ( - wagner_fischer_distance(normalized_search_term, game_normalized) - / max_len - ) - else: - wagner_fischer_ratio = 1.0 - - # Token overlap is most important for game titles - final_score = ( - token_overlap_ratio * 0.5 - + sequence_ratio * 0.3 - + wagner_fischer_ratio * 0.2 - ) - - return final_score - def find_best_match( self, normalized_search_term: str, @@ -223,12 +161,18 @@ class MetadataHandler: best_score = 0.0 for game_name in game_names: - score = self.calculate_text_similarity( - normalized_search_term, + # score = self.calculate_text_similarity( + # normalized_search_term, + # game_name, + # remove_articles=remove_articles, + # remove_punctuation=remove_punctuation, + # ) + game_normalized = self.normalize_search_term( game_name, remove_articles=remove_articles, 
remove_punctuation=remove_punctuation, ) + score = jarowinkler.similarity(normalized_search_term, game_normalized) if score > best_score: best_score = score best_match = game_name diff --git a/backend/handler/metadata/igdb_handler.py b/backend/handler/metadata/igdb_handler.py index 929c4173b..e267754ed 100644 --- a/backend/handler/metadata/igdb_handler.py +++ b/backend/handler/metadata/igdb_handler.py @@ -223,7 +223,6 @@ class IGDBHandler(MetadataHandler): "Client-ID": IGDB_CLIENT_ID, "Accept": "application/json", } - self.min_similarity_score: Final = 0.75 @staticmethod def check_twitch_token(func): @@ -343,7 +342,6 @@ class IGDBHandler(MetadataHandler): best_match, best_score = self.find_best_match( search_term, list(games_by_name.keys()), - min_similarity_score=self.min_similarity_score, remove_punctuation=False, ) if best_match: @@ -372,7 +370,6 @@ class IGDBHandler(MetadataHandler): best_match, best_score = self.find_best_match( search_term, list(games_by_name.keys()), - min_similarity_score=self.min_similarity_score, remove_punctuation=False, ) if best_match: diff --git a/backend/handler/metadata/moby_handler.py b/backend/handler/metadata/moby_handler.py index 799acce3f..1b02f162c 100644 --- a/backend/handler/metadata/moby_handler.py +++ b/backend/handler/metadata/moby_handler.py @@ -77,7 +77,6 @@ def extract_metadata_from_moby_rom(rom: MobyGame) -> MobyMetadata: class MobyGamesHandler(MetadataHandler): def __init__(self) -> None: self.moby_service = MobyGamesService() - self.min_similarity_score: Final = 0.75 async def _search_rom( self, search_term: str, platform_moby_id: int @@ -96,7 +95,6 @@ class MobyGamesHandler(MetadataHandler): best_match, best_score = self.find_best_match( search_term, list(games_by_name.keys()), - min_similarity_score=self.min_similarity_score, remove_punctuation=False, ) if best_match: diff --git a/backend/handler/metadata/sgdb_handler.py b/backend/handler/metadata/sgdb_handler.py index b7b71f21d..67c772a6d 100644 --- 
a/backend/handler/metadata/sgdb_handler.py +++ b/backend/handler/metadata/sgdb_handler.py @@ -31,7 +31,7 @@ class SGDBRom(TypedDict): class SGDBBaseHandler(MetadataHandler): def __init__(self) -> None: self.sgdb_service = SteamGridDBService() - self.min_similarity_score: Final = 0.75 + self.min_similarity_score: Final = 0.95 async def get_details(self, search_term: str) -> list[SGDBResult]: if not STEAMGRIDDB_API_ENABLED: diff --git a/backend/handler/metadata/ss_handler.py b/backend/handler/metadata/ss_handler.py index 256be0a40..4bddd70b3 100644 --- a/backend/handler/metadata/ss_handler.py +++ b/backend/handler/metadata/ss_handler.py @@ -278,7 +278,6 @@ def extract_metadata_from_ss_rom(rom: SSGame) -> SSMetadata: class SSHandler(MetadataHandler): def __init__(self) -> None: self.ss_service = ScreenScraperService() - self.min_similarity_score: Final = 0.75 async def _search_rom(self, search_term: str, platform_ss_id: int) -> SSGame | None: if not platform_ss_id: @@ -296,7 +295,6 @@ class SSHandler(MetadataHandler): best_match, best_score = self.find_best_match( search_term, list(games_by_name.keys()), - min_similarity_score=self.min_similarity_score, remove_punctuation=False, ) if best_match: diff --git a/frontend/src/components/Details/Title.vue b/frontend/src/components/Details/Title.vue index 26203d1d6..1069e547f 100644 --- a/frontend/src/components/Details/Title.vue +++ b/frontend/src/components/Details/Title.vue @@ -159,7 +159,12 @@ const hashMatches = computed(() => { {{ rom.moby_id }} -