From 0939a0c6c2d9288ad10234ae8f9ad114521e7ffc Mon Sep 17 00:00:00 2001 From: Javanaut Date: Sat, 11 Apr 2026 16:00:01 +0200 Subject: [PATCH] Optimizes ffprobe usage --- .gitignore | 5 + SCRATCHPAD.md | 32 +++---- requirements/tests.md | 15 ++- src/ffx/file_properties.py | 57 +++++------- tests/unit/test_file_properties_probe.py | 111 +++++++++++++++++++++++ 5 files changed, 161 insertions(+), 59 deletions(-) create mode 100644 tests/unit/test_file_properties_probe.py diff --git a/.gitignore b/.gitignore index 913e25a..cbc9287 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,8 @@ dist/ .venv/ venv/ .codex + + +*.mkv +*.webm +ffmpeg2pass-0.log diff --git a/SCRATCHPAD.md b/SCRATCHPAD.md index 4864883..fa4d84b 100644 --- a/SCRATCHPAD.md +++ b/SCRATCHPAD.md @@ -11,6 +11,7 @@ - A first modern integration slice now exists under [`tests/integration/subtrack_mapping`](/home/osgw/.local/src/codex/ffx/tests/integration/subtrack_mapping). Remaining test-suite cleanup is now mostly about migrating and shrinking the legacy harness surface under [`tests/legacy`](/home/osgw/.local/src/codex/ffx/tests/legacy). - The CLI root now lazy-loads heavy runtime dependencies so lightweight commands such as `version`, `help`, `configure_workstation`, and `upgrade` stay import-light. - Shared CLI defaults for container/output tokens now live outside [`src/ffx/ffx_controller.py`](/home/osgw/.local/src/codex/ffx/src/ffx/ffx_controller.py), and a focused unit test locks in the lazy-import contract. +- `FileProperties` now uses one cached `ffprobe -show_format -show_streams -of json` call per source file, and the combined payload was confirmed against the Dragonball asset to satisfy both previous probe call sites fully. - FFX logger setup now reuses named handlers, and fallback logger access no longer mutates handlers in ordinary constructors and helpers. - The process wrapper now uses `subprocess.run(...)` with centralized command formatting plus stable timeout and missing-command error mapping. - Active ORM controllers now use single-query accessors instead of paired `count()` plus `first()` lookups. @@ -19,8 +20,7 @@ ## Focused Snapshot - Highest-leverage application optimizations: - - Collapse repeated `ffprobe` calls into a single probe result per source file. - - Revisit crop detection cost after the probe path is consolidated. + - Revisit crop detection cost now that the probe path is consolidated. - Highest-leverage repo and workflow optimizations: - Consolidate setup and upgrade tooling to reduce overlapping shell-script responsibilities. @@ -28,16 +28,7 @@ ## Optimization Candidates -1. Media probing does two separate `ffprobe` subprocesses per file -- [`src/ffx/file_properties.py`](/home/osgw/.local/src/codex/ffx/src/ffx/file_properties.py) calls `ffprobe` once for format data and once for stream data. -- Optimization: - - Use one probe call that requests both format and streams. - - Cache that result inside `FileProperties`. -- Expected value: - - Less subprocess overhead. - - Faster inspect and convert flows. - -2. Crop detection is always a full extra ffmpeg scan +1. Crop detection is always a full extra ffmpeg scan - [`src/ffx/file_properties.py`](/home/osgw/.local/src/codex/ffx/src/ffx/file_properties.py) runs a dedicated `ffmpeg -vf cropdetect` pass for each file when crop detection is requested. - Optimization: - Cache crop results for repeated runs on the same source. @@ -45,7 +36,7 @@ - Expected value: - Lower latency on repeated experimentation. -3. Tooling overlap and naming drift +2. Tooling overlap and naming drift - There are still overlapping workstation-setup entrypoints across [`tools/configure_workstation.sh`](/home/osgw/.local/src/codex/ffx/tools/configure_workstation.sh), [`tools/setup.sh`](/home/osgw/.local/src/codex/ffx/tools/setup.sh), and newer CLI maintenance commands. - Optimization: - Decide which scripts remain canonical. @@ -55,7 +46,7 @@ - Less operator confusion. - Fewer duplicated procedures to maintain. -4. Placeholder UI surfaces should either ship or disappear +3. Placeholder UI surfaces should either ship or disappear - [`src/ffx/help_screen.py`](/home/osgw/.local/src/codex/ffx/src/ffx/help_screen.py) and [`src/ffx/settings_screen.py`](/home/osgw/.local/src/codex/ffx/src/ffx/settings_screen.py) are placeholders. - Optimization: - Either remove them from the active UI surface or complete them. @@ -64,7 +55,7 @@ - Leaner interface. - Lower UX ambiguity. -5. Large Textual screens repeat configuration and controller loading +4. Large Textual screens repeat configuration and controller loading - Screens such as [`src/ffx/media_details_screen.py`](/home/osgw/.local/src/codex/ffx/src/ffx/media_details_screen.py), [`src/ffx/pattern_details_screen.py`](/home/osgw/.local/src/codex/ffx/src/ffx/pattern_details_screen.py), and [`src/ffx/show_details_screen.py`](/home/osgw/.local/src/codex/ffx/src/ffx/show_details_screen.py) repeat setup patterns and local metadata filtering extraction. - Optimization: - Extract a shared screen base or helper for common config/controller/bootstrap logic. @@ -73,7 +64,7 @@ - Lower maintenance overhead. - Easier UI iteration. -6. Several helper functions are unfinished or dead-weight +5. Several helper functions are unfinished or dead-weight - [`src/ffx/helper.py`](/home/osgw/.local/src/codex/ffx/src/ffx/helper.py) contains `permutateList(...): pass`. - There are many combinator and conversion placeholders across tests and migrations. - Optimization: @@ -83,7 +74,7 @@ - Smaller mental model. - Less time spent re-evaluating inactive paths. -7. Test suite shape is expensive to understand and likely expensive to run +6. Test suite shape is expensive to understand and likely expensive to run - The project still carries a large legacy matrix of combinator files under [`tests/legacy`](/home/osgw/.local/src/codex/ffx/tests/legacy), several placeholder `pass` implementations, and at least one suspicious filename with an embedded space: [`tests/legacy/disposition_combinator_2_3 .py`](/home/osgw/.local/src/codex/ffx/tests/legacy/disposition_combinator_2_3 .py). - A first focused replacement slice now exists in [`tests/integration/subtrack_mapping/test_cli_bundle.py`](/home/osgw/.local/src/codex/ffx/tests/integration/subtrack_mapping/test_cli_bundle.py), so the remaining work is migration and consolidation rather than creating the modern test shape from scratch. - Optimization: @@ -94,7 +85,7 @@ - Faster contributor onboarding. - Easier CI adoption later. -8. Process resource limiting semantics could be clearer +7. Process resource limiting semantics could be clearer - [`src/ffx/process.py`](/home/osgw/.local/src/codex/ffx/src/ffx/process.py) prepends `nice` and `cpulimit` directly when values are set. - Optimization: - Validate and document effective behavior for combined `nice` + `cpulimit`. @@ -103,7 +94,7 @@ - Fewer surprises in production-like runs. - Easier support for user-reported performance behavior. -9. Regex and string utility cleanup +8. Regex and string utility cleanup - [`src/ffx/helper.py`](/home/osgw/.local/src/codex/ffx/src/ffx/helper.py) still has repeated string-replacement churn in filename/TMDB normalization helpers, and regex handling in helpers is easy to regress quietly. - Optimization: - Keep regex literals raw and centralized where appropriate. @@ -112,7 +103,7 @@ - Cleaner runtime output. - Less warning noise during dry-run maintenance commands. -10. Database startup always runs schema creation and version checks +9. Database startup always runs schema creation and version checks - [`src/ffx/database.py`](/home/osgw/.local/src/codex/ffx/src/ffx/database.py) runs `Base.metadata.create_all(...)` and version checks every time a DB-backed context is created. - Optimization: - Measure startup cost and consider separating bootstrapping from ordinary command execution. @@ -136,7 +127,6 @@ 1. Triage the list into quick wins, medium refactors, and long-horizon cleanup. 2. Tackle the cheapest high-impact items first: - - single-call `ffprobe` refactor. - crop detection sampling or caching pass. 3. Decide which setup and upgrade entrypoints stay canonical before adding more maintenance surface. diff --git a/requirements/tests.md b/requirements/tests.md index 933f28e..61c269c 100644 --- a/requirements/tests.md +++ b/requirements/tests.md @@ -7,9 +7,16 @@ Detailed product rules for source-to-target subtrack mapping live in `requirements/subtrack_mapping.md`. This file describes only how tests cover that area. +## Interpreter Requirement + +- Agents shall run Python-side test commands with `~/.local/share/ffx.venv/bin/python`. +- This applies to the legacy harness, `unittest`, `pytest`, helper scripts, and `python -m ffx ...` test invocations. +- Agents shall not silently substitute `python`, `python3`, or another interpreter for Python-side test work. +- If `~/.local/share/ffx.venv/bin/python` is missing or not executable, agents shall stop and report the missing venv instead of continuing with Python-side test execution. + ## Current Harness -- Entrypoint: `python tests/legacy_runner.py run` +- Entrypoint: `~/.local/share/ffx.venv/bin/python tests/legacy_runner.py run` - Runner style: custom Click CLI, not `pytest` or `unittest` - Commands: - `run`: discover scenario files, instantiate each scenario, run yielded jobs @@ -35,7 +42,7 @@ that area. - inputs per job: `1` - jobs: `140` - expected failures: `0` - - execution: build one synthetic source file, run `python -m ffx convert`, assert filename selectors only + - execution: build one synthetic source file, run `~/.local/share/ffx.venv/bin/python -m ffx convert`, assert filename selectors only - selectors executed: `B`, `L`, `I` - selectors defined but not executed: `S`, `R` - `2`: `tests/legacy/scenario_2.py` @@ -43,7 +50,7 @@ that area. - inputs per job: `1` - jobs: `8193` - expected failures: `3267` - - execution: build one synthetic source file, run `python -m ffx convert`, probe result with `FileProperties`, assert track layout and selected audio and subtitle metadata + - execution: build one synthetic source file, run `~/.local/share/ffx.venv/bin/python -m ffx convert`, probe result with `FileProperties`, assert track layout and selected audio and subtitle metadata - selectors executed: `M`, `AD`, `AT`, `SD`, `ST` - selectors defined but not executed: `MT`, `AP`, `SP`, `J` - `4`: `tests/legacy/scenario_4.py` @@ -51,7 +58,7 @@ that area. - inputs per job: `6` - jobs: `768` - expected failures: `336` - - execution: build six synthetic preset files, recreate temp SQLite DB, insert show and pattern, run one batch convert command, query TMDB during assertions + - execution: build six synthetic preset files, recreate temp SQLite DB, insert show and pattern, run one batch convert command via `~/.local/share/ffx.venv/bin/python`, query TMDB during assertions - selectors executed: `M`, `AD`, `AT`, `SD`, `ST` - selectors defined but not executed: `MT`, `AP`, `SP`, `J` - notes: diff --git a/src/ffx/file_properties.py b/src/ffx/file_properties.py index 09f676e..1b45a06 100644 --- a/src/ffx/file_properties.py +++ b/src/ffx/file_properties.py @@ -13,6 +13,7 @@ from ffx.model.pattern import Pattern class FileProperties(): FILE_EXTENSIONS = ['mkv', 'mp4', 'avi', 'flv', 'webm'] + FFPROBE_COMMAND_TOKENS = ["ffprobe", "-hide_banner", "-show_format", "-show_streams", "-of", "json"] SE_INDICATOR_PATTERN = '([sS][0-9]+[eE][0-9]+)' SEASON_EPISODE_INDICATOR_MATCH = '[sS]([0-9]+)[eE]([0-9]+)' @@ -78,6 +79,26 @@ class FileProperties(): self.__season = -1 self.__episode = -1 + self.__ffprobeData = None + + + def _getFfprobeData(self): + if self.__ffprobeData is not None: + return self.__ffprobeData + + ffprobeOutput, ffprobeError, returnCode = executeProcess( + FileProperties.FFPROBE_COMMAND_TOKENS + [self.__sourcePath] + ) + + if 'Invalid data found when processing input' in ffprobeError: + raise Exception(f"File {self.__sourcePath} does not contain valid stream data") + + if returnCode != 0: + raise Exception(f"ffprobe returned with error {returnCode}") + + self.__ffprobeData = json.loads(ffprobeOutput) + return self.__ffprobeData + def getFormatData(self): """ @@ -99,22 +120,7 @@ class FileProperties(): } } """ - - # ffprobe -hide_banner -show_format -of json - ffprobeOutput, ffprobeError, returnCode = executeProcess(["ffprobe", - "-hide_banner", - "-show_format", - "-of", "json", - self.__sourcePath]) #, - #context = self.context) - - if 'Invalid data found when processing input' in ffprobeError: - raise Exception(f"File {self.__sourcePath} does not contain valid stream data") - - if returnCode != 0: - raise Exception(f"ffprobe returned with error {returnCode}") - - return json.loads(ffprobeOutput)['format'] + return self._getFfprobeData()['format'] def getStreamData(self): @@ -159,24 +165,7 @@ class FileProperties(): } } """ - - # ffprobe -hide_banner -show_streams -of json - ffprobeOutput, ffprobeError, returnCode = executeProcess(["ffprobe", - "-hide_banner", - "-show_streams", - "-of", "json", - self.__sourcePath]) #, - #context = self.context) - - if 'Invalid data found when processing input' in ffprobeError: - raise Exception(f"File {self.__sourcePath} does not contain valid stream data") - - - if returnCode != 0: - raise Exception(f"ffprobe returned with error {returnCode}") - - - return json.loads(ffprobeOutput)['streams'] + return self._getFfprobeData()['streams'] diff --git a/tests/unit/test_file_properties_probe.py b/tests/unit/test_file_properties_probe.py new file mode 100644 index 0000000..6961d0a --- /dev/null +++ b/tests/unit/test_file_properties_probe.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +import json +import logging +from pathlib import Path +import sys +import unittest +from unittest.mock import patch + + +SRC_ROOT = Path(__file__).resolve().parents[2] / "src" + +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + + +class StaticConfig: + def getData(self): + return {} + + +class DummyPatternController: + def __init__(self, context): + self.context = context + + def matchFilename(self, filename): + return {} + + +def make_logger(name: str) -> logging.Logger: + logger = logging.getLogger(name) + logger.handlers = [] + logger.setLevel(logging.DEBUG) + logger.propagate = False + logger.addHandler(logging.NullHandler()) + return logger + + +class FilePropertiesProbeTests(unittest.TestCase): + def import_module(self): + try: + import ffx.file_properties as file_properties_module + except ModuleNotFoundError as ex: + if ex.name == "sqlalchemy": + self.skipTest("sqlalchemy is not installed in this environment") + raise + return file_properties_module + + def make_context(self): + return { + "logger": make_logger("ffx-test-file-properties-probe"), + "config": StaticConfig(), + "database": {"session": object()}, + "use_pattern": False, + } + + def sample_probe_data(self): + return { + "format": { + "filename": "/tmp/example_s01e01.mkv", + "nb_streams": 2, + "format_name": "matroska,webm", + }, + "streams": [ + { + "index": 0, + "codec_name": "h264", + "codec_type": "video", + "disposition": {"default": 1}, + "tags": {}, + }, + { + "index": 1, + "codec_name": "aac", + "codec_type": "audio", + "channel_layout": "stereo", + "channels": 2, + "disposition": {"default": 0}, + "tags": {"language": "eng"}, + }, + ], + } + + def test_format_and_stream_accessors_share_one_combined_probe(self): + file_properties_module = self.import_module() + probe_output = self.sample_probe_data() + + with ( + patch.object(file_properties_module, "PatternController", DummyPatternController), + patch.object( + file_properties_module, + "executeProcess", + return_value=(json.dumps(probe_output), "", 0), + ) as mocked_execute, + ): + file_properties = file_properties_module.FileProperties( + self.make_context(), + "/tmp/example_s01e01.mkv", + ) + + self.assertEqual(probe_output["format"], file_properties.getFormatData()) + self.assertEqual(probe_output["streams"], file_properties.getStreamData()) + + mocked_execute.assert_called_once_with( + file_properties_module.FileProperties.FFPROBE_COMMAND_TOKENS + + ["/tmp/example_s01e01.mkv"] + ) + + +if __name__ == "__main__": + unittest.main()