mirror of
https://github.com/rommapp/romm.git
synced 2026-02-18 00:27:41 +01:00
use jaro-winkler algo
This commit is contained in:
@@ -3,19 +3,21 @@ import json
|
||||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
from difflib import SequenceMatcher
|
||||
from functools import lru_cache
|
||||
from itertools import batched
|
||||
from typing import Final, NotRequired, TypedDict
|
||||
|
||||
from handler.redis_handler import async_cache, sync_cache
|
||||
from logger.logger import log
|
||||
from strsimpy.jaro_winkler import JaroWinkler
|
||||
from tasks.scheduled.update_switch_titledb import (
|
||||
SWITCH_PRODUCT_ID_KEY,
|
||||
SWITCH_TITLEDB_INDEX_KEY,
|
||||
update_switch_titledb_task,
|
||||
)
|
||||
|
||||
jarowinkler = JaroWinkler()
|
||||
|
||||
|
||||
def conditionally_set_cache(
|
||||
index_key: str, filename: str, parent_dir: str = os.path.dirname(__file__)
|
||||
@@ -133,70 +135,6 @@ class MetadataHandler:
|
||||
) -> str:
|
||||
return _normalize_search_term(name, remove_articles, remove_punctuation)
|
||||
|
||||
def calculate_text_similarity(
    self,
    normalized_search_term: str,
    game_name: str,
    remove_articles: bool = True,
    remove_punctuation: bool = True,
) -> float:
    """
    Score how closely a game name matches an already-normalized search term.

    The game name is normalized with ``self.normalize_search_term`` and then
    compared against the search term using a weighted blend of three metrics:
    token overlap (intersection over union of word tokens, weight 0.5),
    ``difflib.SequenceMatcher`` ratio (weight 0.3) and a Wagner-Fischer
    distance scaled by the longer string's length (weight 0.2).

    Args:
        normalized_search_term: The search term, already normalized by the
            caller; it is compared as-is.
        game_name: The raw game name; it is normalized here before comparison.
        remove_articles: Forwarded to ``normalize_search_term`` for ``game_name``.
        remove_punctuation: Forwarded to ``normalize_search_term`` for
            ``game_name``.

    Returns:
        1.0 when the two normalized strings are identical, otherwise the
        weighted blend described above (intended to lie between 0 and 1).
    """
    game_normalized = self.normalize_search_term(
        game_name,
        remove_articles=remove_articles,
        remove_punctuation=remove_punctuation,
    )

    # Exact match gets the highest score. This also covers the case where
    # both strings are empty, so everything below sees at least one
    # non-empty string.
    if normalized_search_term == game_normalized:
        return 1.0

    # Word-level comparison: intersection over union of the token sets.
    search_tokens = set(WORD_TOKEN_PATTERN.findall(normalized_search_term.lower()))
    game_tokens = set(WORD_TOKEN_PATTERN.findall(game_normalized.lower()))
    if search_tokens and game_tokens:
        token_overlap_ratio = len(search_tokens & game_tokens) / len(
            search_tokens | game_tokens
        )
    else:
        token_overlap_ratio = 0.0

    # Character-level sequence similarity (better for longer strings).
    sequence_ratio = SequenceMatcher(
        None, normalized_search_term, game_normalized
    ).ratio()

    # Wagner-Fischer distance scaled by the longer string. The strings differ
    # at this point, so max_len is at least 1 and the division is safe; the
    # former "max_len == 0" fallback could never run and has been dropped.
    max_len = max(len(normalized_search_term), len(game_normalized))
    wagner_fischer_ratio = 1 - (
        wagner_fischer_distance(normalized_search_term, game_normalized) / max_len
    )

    # Token overlap is weighted highest because it matters most for game titles.
    return (
        token_overlap_ratio * 0.5
        + sequence_ratio * 0.3
        + wagner_fischer_ratio * 0.2
    )
|
||||
|
||||
def find_best_match(
|
||||
self,
|
||||
normalized_search_term: str,
|
||||
@@ -223,12 +161,18 @@ class MetadataHandler:
|
||||
best_score = 0.0
|
||||
|
||||
for game_name in game_names:
|
||||
score = self.calculate_text_similarity(
|
||||
normalized_search_term,
|
||||
# score = self.calculate_text_similarity(
|
||||
# normalized_search_term,
|
||||
# game_name,
|
||||
# remove_articles=remove_articles,
|
||||
# remove_punctuation=remove_punctuation,
|
||||
# )
|
||||
game_normalized = self.normalize_search_term(
|
||||
game_name,
|
||||
remove_articles=remove_articles,
|
||||
remove_punctuation=remove_punctuation,
|
||||
)
|
||||
score = JaroWinkler().similarity(normalized_search_term, game_normalized)
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_match = game_name
|
||||
|
||||
@@ -223,7 +223,6 @@ class IGDBHandler(MetadataHandler):
|
||||
"Client-ID": IGDB_CLIENT_ID,
|
||||
"Accept": "application/json",
|
||||
}
|
||||
self.min_similarity_score: Final = 0.75
|
||||
|
||||
@staticmethod
|
||||
def check_twitch_token(func):
|
||||
@@ -343,7 +342,6 @@ class IGDBHandler(MetadataHandler):
|
||||
best_match, best_score = self.find_best_match(
|
||||
search_term,
|
||||
list(games_by_name.keys()),
|
||||
min_similarity_score=self.min_similarity_score,
|
||||
remove_punctuation=False,
|
||||
)
|
||||
if best_match:
|
||||
@@ -372,7 +370,6 @@ class IGDBHandler(MetadataHandler):
|
||||
best_match, best_score = self.find_best_match(
|
||||
search_term,
|
||||
list(games_by_name.keys()),
|
||||
min_similarity_score=self.min_similarity_score,
|
||||
remove_punctuation=False,
|
||||
)
|
||||
if best_match:
|
||||
|
||||
@@ -77,7 +77,6 @@ def extract_metadata_from_moby_rom(rom: MobyGame) -> MobyMetadata:
|
||||
class MobyGamesHandler(MetadataHandler):
|
||||
def __init__(self) -> None:
|
||||
self.moby_service = MobyGamesService()
|
||||
self.min_similarity_score: Final = 0.75
|
||||
|
||||
async def _search_rom(
|
||||
self, search_term: str, platform_moby_id: int
|
||||
@@ -96,7 +95,6 @@ class MobyGamesHandler(MetadataHandler):
|
||||
best_match, best_score = self.find_best_match(
|
||||
search_term,
|
||||
list(games_by_name.keys()),
|
||||
min_similarity_score=self.min_similarity_score,
|
||||
remove_punctuation=False,
|
||||
)
|
||||
if best_match:
|
||||
|
||||
@@ -31,7 +31,7 @@ class SGDBRom(TypedDict):
|
||||
class SGDBBaseHandler(MetadataHandler):
|
||||
def __init__(self) -> None:
|
||||
self.sgdb_service = SteamGridDBService()
|
||||
self.min_similarity_score: Final = 0.75
|
||||
self.min_similarity_score: Final = 0.95
|
||||
|
||||
async def get_details(self, search_term: str) -> list[SGDBResult]:
|
||||
if not STEAMGRIDDB_API_ENABLED:
|
||||
|
||||
@@ -278,7 +278,6 @@ def extract_metadata_from_ss_rom(rom: SSGame) -> SSMetadata:
|
||||
class SSHandler(MetadataHandler):
|
||||
def __init__(self) -> None:
|
||||
self.ss_service = ScreenScraperService()
|
||||
self.min_similarity_score: Final = 0.75
|
||||
|
||||
async def _search_rom(self, search_term: str, platform_ss_id: int) -> SSGame | None:
|
||||
if not platform_ss_id:
|
||||
@@ -296,7 +295,6 @@ class SSHandler(MetadataHandler):
|
||||
best_match, best_score = self.find_best_match(
|
||||
search_term,
|
||||
list(games_by_name.keys()),
|
||||
min_similarity_score=self.min_similarity_score,
|
||||
remove_punctuation=False,
|
||||
)
|
||||
if best_match:
|
||||
|
||||
@@ -159,7 +159,12 @@ const hashMatches = computed(() => {
|
||||
<v-img src="/assets/scrappers/moby.png" />
|
||||
</v-avatar>
|
||||
<span>{{ rom.moby_id }}</span>
|
||||
<template v-if="rom.moby_metadata?.moby_score">
|
||||
<template
|
||||
v-if="
|
||||
rom.moby_metadata?.moby_score &&
|
||||
rom.moby_metadata.moby_score !== 'None'
|
||||
"
|
||||
>
|
||||
<v-divider class="mx-2 border-opacity-25" vertical />
|
||||
<span>{{
|
||||
(parseFloat(rom.moby_metadata.moby_score) * 10).toFixed(2)
|
||||
|
||||
@@ -41,6 +41,7 @@ dependencies = [
|
||||
"sentry-sdk ~= 2.32",
|
||||
"starlette-csrf ~= 3.0",
|
||||
"streaming-form-data ~= 1.19",
|
||||
"strsimpy>=0.2.1",
|
||||
"types-colorama ~= 0.4",
|
||||
"types-passlib ~= 1.7",
|
||||
"types-pyyaml ~= 6.0",
|
||||
|
||||
13
uv.lock
generated
13
uv.lock
generated
@@ -1,5 +1,5 @@
|
||||
version = 1
|
||||
revision = 3
|
||||
revision = 2
|
||||
requires-python = ">=3.13"
|
||||
resolution-markers = [
|
||||
"platform_python_implementation != 'PyPy'",
|
||||
@@ -1715,6 +1715,7 @@ dependencies = [
|
||||
{ name = "sqlalchemy", extra = ["mariadb-connector", "mysql-connector", "postgresql-psycopg"] },
|
||||
{ name = "starlette-csrf" },
|
||||
{ name = "streaming-form-data" },
|
||||
{ name = "strsimpy" },
|
||||
{ name = "types-colorama" },
|
||||
{ name = "types-passlib" },
|
||||
{ name = "types-pyyaml" },
|
||||
@@ -1787,6 +1788,7 @@ requires-dist = [
|
||||
{ name = "sqlalchemy", extras = ["mariadb-connector", "mysql-connector", "postgresql-psycopg"], specifier = "~=2.0" },
|
||||
{ name = "starlette-csrf", specifier = "~=3.0" },
|
||||
{ name = "streaming-form-data", specifier = "~=1.19" },
|
||||
{ name = "strsimpy", specifier = ">=0.2.1" },
|
||||
{ name = "types-colorama", specifier = "~=0.4" },
|
||||
{ name = "types-passlib", specifier = "~=1.7" },
|
||||
{ name = "types-pyyaml", specifier = "~=6.0" },
|
||||
@@ -2001,6 +2003,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/5d/53/a709d8925a0e48bc4904f12e1f619b0295042c06d66aacaa213f7a18a927/streaming_form_data-1.19.1-cp313-cp313-win_amd64.whl", hash = "sha256:e2dee016f1db735cd91e97421340cd3799f9fd46b1e39e4a11d6215c7cbe1edc", size = 201927, upload-time = "2025-01-10T18:33:07.6Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strsimpy"
|
||||
version = "0.2.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/0d/7e/5ccf2edfa1e97154dbf3119fd240b1f5fbe32ad1edd1db5f7a94d3f7a037/strsimpy-0.2.1.tar.gz", hash = "sha256:0842eb57f7af86c882a59a1bc8721ec2580a267e563fd0503ced2972040372c9", size = 24403, upload-time = "2021-09-10T09:14:20.405Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/fc/90/bd55a4b18f4b75a76e32f444975d2c869d692eb23897d116d47122f88d1a/strsimpy-0.2.1-py3-none-any.whl", hash = "sha256:d676a440d5d3dbcf5ba92d01814a03a218776ce07bd7a8185da7019e04cf9ba7", size = 45870, upload-time = "2021-09-10T09:14:18.944Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "texttable"
|
||||
version = "1.7.0"
|
||||
|
||||
Reference in New Issue
Block a user