Fix regex issues

2026-04-11 16:10:41 +02:00
parent fc729a2414
commit 358ef18f77
3 changed files with 100 additions and 37 deletions
--- a/SCRATCHPAD.md
+++ b/SCRATCHPAD.md
@@ -13,6 +13,7 @@
 - Shared CLI defaults for container/output tokens now live outside [`src/ffx/ffx_controller.py`](/home/osgw/.local/src/codex/ffx/src/ffx/ffx_controller.py), and a focused unit test locks in the lazy-import contract.
 - `FileProperties` now uses one cached `ffprobe -show_format -show_streams -of json` call per source file, and the combined payload was confirmed against the Dragonball asset to satisfy both previous probe call sites fully.
 - Database startup now bootstraps schema only when required tables are actually missing, while version enforcement still runs on ordinary DB-backed context creation.
+- Helper filename and rich-text utilities now use compiled raw regexes plus translate-based filename filtering, with unit coverage for TMDB suffix rewriting and Rich color stripping.
 - FFX logger setup now reuses named handlers, and fallback logger access no longer mutates handlers in ordinary constructors and helpers.
 - The process wrapper now uses `subprocess.run(...)` with centralized command formatting plus stable timeout and missing-command error mapping.
 - Active ORM controllers now use single-query accessors instead of paired `count()` plus `first()` lookups.
@@ -95,15 +96,6 @@
  - Fewer surprises in production-like runs.
  - Easier support for user-reported performance behavior.

-8. Regex and string utility cleanup
- [`src/ffx/helper.py`](/home/osgw/.local/src/codex/ffx/src/ffx/helper.py) still has repeated string-replacement churn in filename/TMDB normalization helpers, and regex handling in helpers is easy to regress quietly.
- Optimization:
-  - Keep regex literals raw and centralized where appropriate.
-  - Review filename and TMDB substitution helpers for repeated string churn.
- Expected value:
-  - Cleaner runtime output.
-  - Less warning noise during dry-run maintenance commands.
-
 ## Open

 - Should optimization work focus first on operator-perceived latency, internal maintainability, or correctness-risk cleanup that also has performance upside?
--- a/src/ffx/helper.py
+++ b/src/ffx/helper.py
@@ -16,7 +16,21 @@ DIFF_REMOVED_KEY = 'removed'
 DIFF_CHANGED_KEY = 'changed'
 DIFF_UNCHANGED_KEY = 'unchanged'

-RICH_COLOR_PATTERN = '\\[[a-z_]+\\](.+)\\[\\/[a-z_]+\\]'
+FILENAME_FILTER_TRANSLATION = str.maketrans(
+    {
+        "/": "-",
+        ":": ";",
+        "*": "",
+        "'": "",
+        "?": "#",
+        "♥": "",
+        "’": "",
+    }
+)
+TMDB_FILLER_MARKERS = (" (*)", "(*)")
+TMDB_EPISODE_RANGE_SUFFIX_REGEX = re.compile(r"\(([0-9]+)[-/]([0-9]+)\)$")
+TMDB_EPISODE_PART_SUFFIX_REGEX = re.compile(r"\(([0-9]+)\)$")
+RICH_COLOR_REGEX = re.compile(r"\[[a-z_]+\](.+)\[/[a-z_]+\]")


 def dictDiff(a : dict, b : dict, ignoreKeys: list = [], removeKeys: list = []):
@@ -115,39 +129,35 @@ def filterFilename(fileName: str) -> str:
    """This filter replaces characters from TMDB responses with characters
    less problematic when used in filenames, or removes them"""

-    fileName = str(fileName).replace('/', '-')
-    fileName = str(fileName).replace(':', ';')
-    fileName = str(fileName).replace('*', '')
-    fileName = str(fileName).replace("'", '')
-    fileName = str(fileName).replace("?", '#')
-    fileName = str(fileName).replace('♥', '')
-    fileName = str(fileName).replace('’', '')
-
-    return fileName.strip()
+    return str(fileName).translate(FILENAME_FILTER_TRANSLATION).strip()

 def substituteTmdbFilename(fileName: str) -> str:
    """If chaining this method with filterFilename use this one first as the latter will destroy some patterns"""

-    # This indicates filler episodes in TMDB episode names
-    fileName = str(fileName).replace(' (*)', '')
-    fileName = str(fileName).replace('(*)', '')
+    normalizedFileName = str(fileName)

-    # This indicates the index of multi-episode files
-    episodePartMatch = re.search("\\(([0-9]+)\\)$", fileName)
+    for fillerMarker in TMDB_FILLER_MARKERS:
+        normalizedFileName = normalizedFileName.replace(fillerMarker, '')
+
+    episodeRangeMatch = TMDB_EPISODE_RANGE_SUFFIX_REGEX.search(normalizedFileName)
+    if episodeRangeMatch is not None:
+        partFirstIndex, partLastIndex = episodeRangeMatch.groups()
+        return TMDB_EPISODE_RANGE_SUFFIX_REGEX.sub(
+            f"Teil {partFirstIndex}-{partLastIndex}",
+            normalizedFileName,
+            count=1,
+        )
+
+    episodePartMatch = TMDB_EPISODE_PART_SUFFIX_REGEX.search(normalizedFileName)
    if episodePartMatch is not None:
-        partSuffix = str(episodePartMatch.group(0))
-        partIndex = episodePartMatch.groups()[0]
-        fileName = str(fileName).replace(partSuffix, f"Teil {partIndex}")
+        partIndex = episodePartMatch.group(1)
+        return TMDB_EPISODE_PART_SUFFIX_REGEX.sub(
+            f"Teil {partIndex}",
+            normalizedFileName,
+            count=1,
+        )

-    # Also multi-episodes with first and last episode index
-    episodePartMatch = re.search("\\(([0-9]+)[-\\/]([0-9]+)\\)$", fileName)
-    if episodePartMatch is not None:
-        partSuffix = str(episodePartMatch.group(0))
-        partFirstIndex = episodePartMatch.groups()[0]
-        partLastIndex = episodePartMatch.groups()[1]
-        fileName = str(fileName).replace(partSuffix, f"Teil {partFirstIndex}-{partLastIndex}")
-
-    return fileName
+    return normalizedFileName


 def getEpisodeFileBasename(showName,
@@ -231,7 +241,7 @@ def formatRichColor(text: str, color: str = None):
        return f"[{color}]{text}[/{color}]"

 def removeRichColor(text: str):
-    richColorMatch = re.search(RICH_COLOR_PATTERN, text)
+    richColorMatch = RICH_COLOR_REGEX.search(str(text))
    if richColorMatch is None:
        return text
    else:
--- a/tests/unit/test_helper.py
+++ b/tests/unit/test_helper.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+from pathlib import Path
+import sys
+import unittest
+
+
+SRC_ROOT = Path(__file__).resolve().parents[2] / "src"
+
+if str(SRC_ROOT) not in sys.path:
+    sys.path.insert(0, str(SRC_ROOT))
+
+
+from ffx.helper import (  # noqa: E402
+    filterFilename,
+    formatRichColor,
+    removeRichColor,
+    substituteTmdbFilename,
+)
+
+
+class HelperTests(unittest.TestCase):
+    def test_filter_filename_replaces_and_removes_problem_characters(self):
+        self.assertEqual(
+            "A-B;C#",
+            filterFilename(" A/B:C*'?♥’ "),
+        )
+
+    def test_substitute_tmdb_filename_removes_filler_marker(self):
+        self.assertEqual(
+            "Episode Name",
+            substituteTmdbFilename("Episode Name (*)"),
+        )
+
+    def test_substitute_tmdb_filename_rewrites_single_episode_suffix(self):
+        self.assertEqual(
+            "Episode Name Teil 2",
+            substituteTmdbFilename("Episode Name (2)"),
+        )
+
+    def test_substitute_tmdb_filename_rewrites_episode_range_suffix(self):
+        self.assertEqual(
+            "Episode Name Teil 2-3",
+            substituteTmdbFilename("Episode Name (2/3)"),
+        )
+
+    def test_remove_rich_color_returns_inner_text(self):
+        self.assertEqual(
+            "value",
+            removeRichColor(formatRichColor("value", "green")),
+        )
+
+    def test_remove_rich_color_leaves_plain_text_unchanged(self):
+        self.assertEqual(
+            "plain text",
+            removeRichColor("plain text"),
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()