Auto backup: 2026-02-21 07:01

2026-02-21 07:01:51 +00:00
parent 8757148122
commit 17b5b82d99
2012 changed files with 352552 additions and 331 deletions
--- a/.venvs/transcribe/bin/srt-deduplicate
+++ b/.venvs/transcribe/bin/srt-deduplicate
@@ -0,0 +1,96 @@
+#!/home/openclaw/.openclaw/workspace/.venvs/transcribe/bin/python3
+
+"""Deduplicate repeated subtitles."""
+
+import datetime
+import srt_tools.utils
+import logging
+import operator
+
+# Module-level logger named after this module; used for debug output below.
+log = logging.getLogger(__name__)
+
+# Compatibility shim: where the ``xrange`` builtin exists (Python 2), alias
+# ``range`` to it. Where it does not, the lookup raises NameError and the
+# built-in ``range`` is left untouched.
+try:  # Python 2
+    range = xrange  # pytype: disable=name-error
+except NameError:
+    pass
+
+
+def parse_args():
+    """Build the argument parser for this tool and parse the command line.
+
+    The parser itself comes from ``srt_tools.utils.basic_parser`` (presumably
+    it supplies the shared input/output options -- confirm against
+    srt_tools); this function only adds the ``-t``/``--ms`` option to it.
+
+    Returns:
+        The namespace produced by ``parser.parse_args()``. Its ``ms``
+        attribute is a ``datetime.timedelta``: 5000 milliseconds unless
+        ``-t``/``--ms`` is given, in which case the supplied text is
+        converted with ``int`` and interpreted as milliseconds.
+    """
+    default_window = datetime.timedelta(milliseconds=5000)
+
+    # Description -> example command line, handed to the parser factory.
+    usage_examples = {
+        "Remove duplicated subtitles within 5 seconds of each other": (
+            "srt deduplicate -i duplicated.srt"
+        ),
+        "Remove duplicated subtitles within 500 milliseconds of each other": (
+            "srt deduplicate -t 500 -i duplicated.srt"
+        ),
+        "Remove duplicated subtitles regardless of temporal proximity": (
+            "srt deduplicate -t 0 -i duplicated.srt"
+        ),
+    }
+
+    arg_parser = srt_tools.utils.basic_parser(
+        description=__doc__, examples=usage_examples
+    )
+    arg_parser.add_argument(
+        "-t",
+        "--ms",
+        metavar="MILLISECONDS",
+        default=default_window,
+        # Raw command-line text -> int -> timedelta measured in milliseconds.
+        type=lambda text: datetime.timedelta(milliseconds=int(text)),
+        help=(
+            "how many milliseconds distance a subtitle start time must be "
+            "within of another to be considered a duplicate "
+            "(default: 5000ms)"
+        ),
+    )
+
+    parsed = arg_parser.parse_args()
+    return parsed
+
+
+def deduplicate_subs(orig_subs, acceptable_diff):
+    """Remove subtitles with duplicated content, modifying ``orig_subs`` in place.
+
+    The subtitles are ordered by ``(content, start)`` and each neighbouring
+    pair in that ordering is compared. The later subtitle of a pair is
+    removed when both have equal ``content`` and either ``acceptable_diff``
+    is falsy (e.g. a zero timedelta, which disables the time check) or the
+    later one starts no more than ``acceptable_diff`` after the earlier one.
+
+    Args:
+        orig_subs: Mutable list of subtitle objects exposing ``content``,
+            ``start`` and ``index`` attributes. Duplicates are deleted from
+            this list; the relative order of the survivors is unchanged.
+        acceptable_diff: Maximum distance between start times for two
+            equal-content subtitles to count as duplicates. It is added to
+            ``start``; the command-line entry point passes a
+            ``datetime.timedelta``.
+
+    Returns:
+        None. The result is the mutated ``orig_subs``.
+    """
+    indices_to_remove = []
+
+    # If we only store the subtitle itself and compare that, it's possible that
+    # we'll not only remove the duplicate, but also the _original_ subtitle if
+    # they have the same sub index/times/etc.
+    #
+    # As such, we need to also store the index in the original subs list that
+    # this entry belongs to for each subtitle prior to sorting.
+    sorted_subs = sorted(
+        enumerate(orig_subs), key=lambda sub: (sub[1].content, sub[1].start)
+    )
+
+    for subs in srt_tools.utils.sliding_window(sorted_subs, width=2, inclusive=False):
+        cur_idx, cur_sub = subs[0]
+        next_idx, next_sub = subs[1]
+
+        if cur_sub.content == next_sub.content and (
+            not acceptable_diff or cur_sub.start + acceptable_diff >= next_sub.start
+        ):
+            log.debug(
+                "Marking l%d/s%d for removal, duplicate of l%d/s%d",
+                next_idx,
+                next_sub.index,
+                cur_idx,
+                cur_sub.index,
+            )
+            indices_to_remove.append(next_idx)
+
+    # The indices were collected in (content, start) order, not in list order,
+    # so a running "already deleted" offset would point at the wrong entries
+    # (or past the end of the list). Delete from the highest index down
+    # instead: each deletion then leaves every remaining, lower index valid.
+    for idx in sorted(indices_to_remove, reverse=True):
+        del orig_subs[idx]
+
+
+def main():
+    """Command-line entry point: read subtitles, drop duplicates, write them out.
+
+    Parses the arguments, configures logging at the requested level, removes
+    duplicated subtitles from the input and writes the composed result to the
+    selected output.
+    """
+    cli_args = parse_args()
+    logging.basicConfig(level=cli_args.log_level)
+
+    # NOTE(review): presumably this prepares the input/output attributes read
+    # below -- confirm against srt_tools.utils.
+    srt_tools.utils.set_basic_args(cli_args)
+
+    # Materialise the input as a list: deduplication deletes entries by index.
+    subtitles = list(cli_args.input)
+    deduplicate_subs(subtitles, cli_args.ms)
+
+    composed = srt_tools.utils.compose_suggest_on_fail(
+        subtitles, strict=cli_args.strict
+    )
+
+    destination = cli_args.output
+    try:
+        destination.write(composed)
+    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
+        # Retry with the text explicitly encoded using the configured encoding.
+        destination.write(composed.encode(cli_args.encoding))
+
+
+# Run the tool only when this file is executed as a script, not on import.
+if __name__ == "__main__":  # pragma: no cover
+    main()