Auto backup: 2026-02-21 07:01
This commit is contained in:
96
.venvs/transcribe/bin/srt-deduplicate
Executable file
96
.venvs/transcribe/bin/srt-deduplicate
Executable file
@@ -0,0 +1,96 @@
|
||||
#!/home/openclaw/.openclaw/workspace/.venvs/transcribe/bin/python3
|
||||
|
||||
"""Deduplicate repeated subtitles."""
|
||||
|
||||
import datetime
|
||||
import srt_tools.utils
|
||||
import logging
|
||||
import operator
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Python 2 compatibility: when the ``xrange`` builtin exists, alias ``range``
# to it. On Python 3 the lookup raises NameError and the builtin ``range`` is
# left untouched.
try:
    range = xrange  # pytype: disable=name-error
except NameError:
    pass
|
||||
|
||||
|
||||
def parse_args():
    """Parse the command line and return the resulting namespace.

    The parser comes from ``srt_tools.utils.basic_parser``; this function adds
    the ``-t``/``--ms`` option, whose value is converted from an integer
    number of milliseconds to a ``datetime.timedelta`` (default: 5000ms).
    """
    default_window = datetime.timedelta(milliseconds=5000)

    usage_examples = {
        "Remove duplicated subtitles within 5 seconds of each other": "srt deduplicate -i duplicated.srt",
        "Remove duplicated subtitles within 500 milliseconds of each other": "srt deduplicate -t 500 -i duplicated.srt",
        "Remove duplicated subtitles regardless of temporal proximity": "srt deduplicate -t 0 -i duplicated.srt",
    }

    arg_parser = srt_tools.utils.basic_parser(
        description=__doc__, examples=usage_examples
    )

    ms_help = (
        "how many milliseconds distance a subtitle start time must be "
        "within of another to be considered a duplicate "
        "(default: 5000ms)"
    )
    arg_parser.add_argument(
        "-t",
        "--ms",
        metavar="MILLISECONDS",
        default=default_window,
        # Kept as a lambda: argparse reports the callable's name on bad input.
        type=lambda ms: datetime.timedelta(milliseconds=int(ms)),
        help=ms_help,
    )

    return arg_parser.parse_args()
|
||||
|
||||
|
||||
def deduplicate_subs(orig_subs, acceptable_diff):
    """Remove subtitles with duplicated content from ``orig_subs`` in place.

    Subtitles are ordered by ``(content, start)`` and each one is compared
    with its neighbour in that order. The later neighbour is a duplicate when
    both have equal ``content`` and either ``acceptable_diff`` is falsy or the
    later one starts no more than ``acceptable_diff`` after the earlier one.
    Because neighbours are compared pairwise, a run of equal subtitles that
    are each close enough to the previous one is reduced to its first entry.

    :param orig_subs: Mutable list of subtitle objects exposing ``content``,
                      ``start`` and ``index``; duplicates are deleted from it.
    :param acceptable_diff: Largest allowed distance between start times; it
                            must be addable to ``start`` (for example a
                            ``datetime.timedelta``). A falsy value removes
                            duplicates regardless of how far apart they are.
    :returns: ``None``; the result is the mutated ``orig_subs``.
    """
    logger = logging.getLogger(__name__)

    # If we only stored the subtitle itself and compared that, we could end up
    # removing not only the duplicate but also the _original_ subtitle when
    # both have the same index/times/etc. So each subtitle is paired with its
    # position in orig_subs before sorting.
    sorted_subs = sorted(
        enumerate(orig_subs), key=lambda sub: (sub[1].content, sub[1].start)
    )

    indices_to_remove = []
    for (cur_idx, cur_sub), (next_idx, next_sub) in zip(sorted_subs, sorted_subs[1:]):
        if cur_sub.content == next_sub.content and (
            not acceptable_diff or cur_sub.start + acceptable_diff >= next_sub.start
        ):
            logger.debug(
                "Marking l%d/s%d for removal, duplicate of l%d/s%d",
                next_idx,
                next_sub.index,
                cur_idx,
                cur_sub.index,
            )
            indices_to_remove.append(next_idx)

    # The positions were collected in content order, not list order. Deleting
    # from the highest position downwards guarantees that no deletion shifts a
    # position that still has to be removed.
    for idx in sorted(indices_to_remove, reverse=True):
        del orig_subs[idx]
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: read subtitles, drop duplicates, write output.

    Parses the arguments, configures logging from ``args.log_level``, hands
    the namespace to ``srt_tools.utils.set_basic_args``, deduplicates the
    subtitles read from ``args.input`` and writes the composed result to
    ``args.output``.
    """
    opts = parse_args()
    logging.basicConfig(level=opts.log_level)

    srt_tools.utils.set_basic_args(opts)

    # Materialise the input so deduplicate_subs can delete entries in place.
    subtitles = list(opts.input)
    deduplicate_subs(subtitles, opts.ms)

    composed = srt_tools.utils.compose_suggest_on_fail(subtitles, strict=opts.strict)

    try:
        opts.output.write(composed)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        opts.output.write(composed.encode(opts.encoding))
|
||||
|
||||
|
||||
# Run the tool only when this file is executed as a script.
if __name__ == "__main__":  # pragma: no cover
    main()
|
||||
Reference in New Issue
Block a user