diff --git a/SCRATCHPAD.md b/SCRATCHPAD.md index a1ba86a..28da596 100644 --- a/SCRATCHPAD.md +++ b/SCRATCHPAD.md @@ -13,6 +13,7 @@ - Shared CLI defaults for container/output tokens now live outside [`src/ffx/ffx_controller.py`](/home/osgw/.local/src/codex/ffx/src/ffx/ffx_controller.py), and a focused unit test locks in the lazy-import contract. - `FileProperties` now uses one cached `ffprobe -show_format -show_streams -of json` call per source file, and the combined payload was confirmed against the Dragonball asset to satisfy both previous probe call sites fully. - Database startup now bootstraps schema only when required tables are actually missing, while version enforcement still runs on ordinary DB-backed context creation. +- Helper filename and rich-text utilities now use compiled raw regexes plus translate-based filename filtering, with unit coverage for TMDB suffix rewriting and Rich color stripping. - FFX logger setup now reuses named handlers, and fallback logger access no longer mutates handlers in ordinary constructors and helpers. - The process wrapper now uses `subprocess.run(...)` with centralized command formatting plus stable timeout and missing-command error mapping. - Active ORM controllers now use single-query accessors instead of paired `count()` plus `first()` lookups. @@ -95,15 +96,6 @@ - Fewer surprises in production-like runs. - Easier support for user-reported performance behavior. -8. Regex and string utility cleanup -- [`src/ffx/helper.py`](/home/osgw/.local/src/codex/ffx/src/ffx/helper.py) still has repeated string-replacement churn in filename/TMDB normalization helpers, and regex handling in helpers is easy to regress quietly. -- Optimization: - - Keep regex literals raw and centralized where appropriate. - - Review filename and TMDB substitution helpers for repeated string churn. -- Expected value: - - Cleaner runtime output. - - Less warning noise during dry-run maintenance commands. - ## Open - Should optimization work focus first on operator-perceived latency, internal maintainability, or correctness-risk cleanup that also has performance upside? diff --git a/src/ffx/helper.py b/src/ffx/helper.py index cbb8e46..742dbc1 100644 --- a/src/ffx/helper.py +++ b/src/ffx/helper.py @@ -16,7 +16,21 @@ DIFF_REMOVED_KEY = 'removed' DIFF_CHANGED_KEY = 'changed' DIFF_UNCHANGED_KEY = 'unchanged' -RICH_COLOR_PATTERN = '\\[[a-z_]+\\](.+)\\[\\/[a-z_]+\\]' +FILENAME_FILTER_TRANSLATION = str.maketrans( + { + "/": "-", + ":": ";", + "*": "", + "'": "", + "?": "#", + "♥": "", + "’": "", + } +) +TMDB_FILLER_MARKERS = (" (*)", "(*)") +TMDB_EPISODE_RANGE_SUFFIX_REGEX = re.compile(r"\(([0-9]+)[-/]([0-9]+)\)$") +TMDB_EPISODE_PART_SUFFIX_REGEX = re.compile(r"\(([0-9]+)\)$") +RICH_COLOR_REGEX = re.compile(r"\[[a-z_]+\](.+)\[/[a-z_]+\]") def dictDiff(a : dict, b : dict, ignoreKeys: list = [], removeKeys: list = []): @@ -115,39 +129,35 @@ def filterFilename(fileName: str) -> str: """This filter replaces charactes from TMDB responses with characters less problemating when using in filenames or removes them""" - fileName = str(fileName).replace('/', '-') - fileName = str(fileName).replace(':', ';') - fileName = str(fileName).replace('*', '') - fileName = str(fileName).replace("'", '') - fileName = str(fileName).replace("?", '#') - fileName = str(fileName).replace('♥', '') - fileName = str(fileName).replace('’', '') - - return fileName.strip() + return str(fileName).translate(FILENAME_FILTER_TRANSLATION).strip() def substituteTmdbFilename(fileName: str) -> str: """If chaining this method with filterFilename use this one first as the latter will destroy some patterns""" - # This indicates filler episodes in TMDB episode names - fileName = str(fileName).replace(' (*)', '') - fileName = str(fileName).replace('(*)', '') + normalizedFileName = str(fileName) - # This indicates the index of multi-episode files - episodePartMatch = re.search("\\(([0-9]+)\\)$", fileName) + for fillerMarker in TMDB_FILLER_MARKERS: + normalizedFileName = normalizedFileName.replace(fillerMarker, '') + + episodeRangeMatch = TMDB_EPISODE_RANGE_SUFFIX_REGEX.search(normalizedFileName) + if episodeRangeMatch is not None: + partFirstIndex, partLastIndex = episodeRangeMatch.groups() + return TMDB_EPISODE_RANGE_SUFFIX_REGEX.sub( + f"Teil {partFirstIndex}-{partLastIndex}", + normalizedFileName, + count=1, + ) + + episodePartMatch = TMDB_EPISODE_PART_SUFFIX_REGEX.search(normalizedFileName) if episodePartMatch is not None: - partSuffix = str(episodePartMatch.group(0)) - partIndex = episodePartMatch.groups()[0] - fileName = str(fileName).replace(partSuffix, f"Teil {partIndex}") + partIndex = episodePartMatch.group(1) + return TMDB_EPISODE_PART_SUFFIX_REGEX.sub( + f"Teil {partIndex}", + normalizedFileName, + count=1, + ) - # Also multi-episodes with first and last episode index - episodePartMatch = re.search("\\(([0-9]+)[-\\/]([0-9]+)\\)$", fileName) - if episodePartMatch is not None: - partSuffix = str(episodePartMatch.group(0)) - partFirstIndex = episodePartMatch.groups()[0] - partLastIndex = episodePartMatch.groups()[1] - fileName = str(fileName).replace(partSuffix, f"Teil {partFirstIndex}-{partLastIndex}") - - return fileName + return normalizedFileName def getEpisodeFileBasename(showName, @@ -231,7 +241,7 @@ def formatRichColor(text: str, color: str = None): return f"[{color}]{text}[/{color}]" def removeRichColor(text: str): - richColorMatch = re.search(RICH_COLOR_PATTERN, text) + richColorMatch = RICH_COLOR_REGEX.search(str(text)) if richColorMatch is None: return text else: diff --git a/tests/unit/test_helper.py b/tests/unit/test_helper.py new file mode 100644 index 0000000..450877d --- /dev/null +++ b/tests/unit/test_helper.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from pathlib import Path +import sys +import unittest + + +SRC_ROOT = Path(__file__).resolve().parents[2] / "src" + +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + + +from ffx.helper import ( # noqa: E402 + filterFilename, + formatRichColor, + removeRichColor, + substituteTmdbFilename, +) + + +class HelperTests(unittest.TestCase): + def test_filter_filename_replaces_and_removes_problem_characters(self): + self.assertEqual( + "A-B;C#", + filterFilename(" A/B:C*'?♥’ "), + ) + + def test_substitute_tmdb_filename_removes_filler_marker(self): + self.assertEqual( + "Episode Name", + substituteTmdbFilename("Episode Name (*)"), + ) + + def test_substitute_tmdb_filename_rewrites_single_episode_suffix(self): + self.assertEqual( + "Episode Name Teil 2", + substituteTmdbFilename("Episode Name (2)"), + ) + + def test_substitute_tmdb_filename_rewrites_episode_range_suffix(self): + self.assertEqual( + "Episode Name Teil 2-3", + substituteTmdbFilename("Episode Name (2/3)"), + ) + + def test_remove_rich_color_returns_inner_text(self): + self.assertEqual( + "value", + removeRichColor(formatRichColor("value", "green")), + ) + + def test_remove_rich_color_leaves_plain_text_unchanged(self): + self.assertEqual( + "plain text", + removeRichColor("plain text"), + ) + + +if __name__ == "__main__": + unittest.main()