Auto backup: 2026-02-21 07:01
This commit is contained in:
511
.venvs/transcribe/lib/python3.11/site-packages/srt.py
Normal file
511
.venvs/transcribe/lib/python3.11/site-packages/srt.py
Normal file
@@ -0,0 +1,511 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf8
|
||||
|
||||
"""A tiny library for parsing, modifying, and composing SRT files."""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
import functools
|
||||
import re
|
||||
from datetime import timedelta
|
||||
import logging
|
||||
import io
|
||||
|
||||
|
||||
LOG = logging.getLogger(__name__)

# "." is not technically a valid delimiter, yet many editors write SRT files
# that use it, and many editors and players read such files without
# complaint. We therefore accept it too (the character class also admits
# full-width punctuation variants).
RGX_TIMESTAMP_MAGNITUDE_DELIM = r"[,.:,.。:]"
RGX_TIMESTAMP_FIELD = r"[0-9]+"
RGX_TIMESTAMP_FIELD_OPTIONAL = r"[0-9]*"

# Three mandatory numeric fields, then an optional delimiter and an optional
# trailing numeric field. This variant has no capture groups so that it can be
# embedded inside SRT_REGEX.
RGX_TIMESTAMP = (
    RGX_TIMESTAMP_MAGNITUDE_DELIM.join([RGX_TIMESTAMP_FIELD] * 3)
    + RGX_TIMESTAMP_MAGNITUDE_DELIM
    + "?"
    + RGX_TIMESTAMP_FIELD_OPTIONAL
)

# Same shape as RGX_TIMESTAMP, but anchored at both ends and with each of the
# four fields captured.
RGX_TIMESTAMP_PARSEABLE = (
    "^"
    + RGX_TIMESTAMP_MAGNITUDE_DELIM.join(["(" + RGX_TIMESTAMP_FIELD + ")"] * 3)
    + RGX_TIMESTAMP_MAGNITUDE_DELIM
    + "?"
    + "("
    + RGX_TIMESTAMP_FIELD_OPTIONAL
    + ")"
    + "$"
)

RGX_INDEX = r"-?[0-9]+\.?[0-9]*"
RGX_PROPRIETARY = r"[^\r\n]*"
RGX_CONTENT = r".*?"
RGX_POSSIBLE_CRLF = r"\r?\n"

TS_REGEX = re.compile(RGX_TIMESTAMP_PARSEABLE)
MULTI_WS_REGEX = re.compile(r"\n\n+")
SRT_REGEX = re.compile(
    (
        # Optional index line, the "start --> end" line with optional
        # proprietary data after it, then the (lazily matched) content.
        r"\s*(?:({idx})\s*{eof})?({ts}) *-[ -] *> *({ts}) ?({proprietary})(?:{eof}|\Z)({content})"
        # Many subtitle editors leave out the blank line at the very end, and
        # many editors and players tolerate that, so it may be missing in
        # input.
        #
        # Blocks that lack the double newline separator are accepted as well.
        # That tends to happen with files that began as mixed language
        # subtitles (chs/eng, for example) and were then stripped with naive
        # tools such as ed/sed that do not understand SRT newline
        # preservation rules: when only chs is kept and a line holds only
        # english, the content vanishes and its newline goes with it.
        r"(?:{eof}|\Z)(?:{eof}|\Z|(?=(?:{idx}\s*{eof}{ts})))"
        # Although technically invalid, some blocks carry blank lines inside
        # their content. As a best-effort workaround, look ahead to confirm
        # that what follows resembles an index and a timestamp (or the end of
        # input) before ending the block.
        r"(?=(?:(?:{idx}\s*{eof})?{ts}|\Z))"
    ).format(
        content=RGX_CONTENT,
        eof=RGX_POSSIBLE_CRLF,
        idx=RGX_INDEX,
        proprietary=RGX_PROPRIETARY,
        ts=RGX_TIMESTAMP,
    ),
    flags=re.DOTALL,
)
|
||||
|
||||
ZERO_TIMEDELTA = timedelta(0)

# Pairs of (info message, predicate). Each predicate takes a Subtitle; a truthy
# result means the subtitle should be skipped, and the message says why.
SUBTITLE_SKIP_CONDITIONS = (
    ("No content", lambda sub: not sub.content.strip()),
    ("Start time < 0 seconds", lambda sub: sub.start < ZERO_TIMEDELTA),
    ("Subtitle start time >= end time", lambda sub: sub.start >= sub.end),
)

# Unit conversion factors used when rendering timestamps.
SECONDS_IN_HOUR = 3600
SECONDS_IN_MINUTE = 60
HOURS_IN_DAY = 24
MICROSECONDS_IN_MILLISECOND = 1000

# Types that parse() treats as readable file objects.
try:
    FILE_TYPES = (file, io.IOBase)  # pytype: disable=name-error
except NameError:
    # The `file` builtin only exists on Python 2.
    FILE_TYPES = (io.IOBase,)
|
||||
|
||||
|
||||
@functools.total_ordering
class Subtitle(object):
    r"""
    The metadata relating to a single subtitle.

    Subtitles order by ``(start, end, index)``, so sorting them arranges them
    by start time. A subtitle without an index sorts ahead of an otherwise
    identically timed subtitle that has one. If no index was provided, index
    0 is used when writing an SRT block.

    Comparing a subtitle with an object that is not a :py:class:`Subtitle`
    yields ``NotImplemented``, so ``==`` evaluates to ``False`` and ordering
    comparisons raise :py:class:`TypeError`.

    :param index: The SRT index for this subtitle
    :type index: int or None
    :param start: The time that the subtitle should start being shown
    :type start: :py:class:`datetime.timedelta`
    :param end: The time that the subtitle should stop being shown
    :type end: :py:class:`datetime.timedelta`
    :param str proprietary: Proprietary metadata for this subtitle
    :param str content: The subtitle content. Should not contain OS-specific
                        line separators, only ``\n``. This is taken care of
                        already if you use :py:func:`srt.parse` to generate
                        Subtitle objects.
    """

    # pylint: disable=R0913
    def __init__(self, index, start, end, content, proprietary=""):
        self.index = index
        self.start = start
        self.end = end
        self.content = content
        self.proprietary = proprietary

    def __hash__(self):
        # Hash over the same attribute mapping that __eq__ compares.
        return hash(frozenset(vars(self).items()))

    def __eq__(self, other):
        # vars() raises TypeError for objects without a __dict__ (None, str,
        # int, ...); defer to Python's default handling for foreign types.
        if not isinstance(other, Subtitle):
            return NotImplemented
        return vars(self) == vars(other)

    def __lt__(self, other):
        if not isinstance(other, Subtitle):
            return NotImplemented
        return self._sort_key() < other._sort_key()

    def _sort_key(self):
        """Return the ordering key, keeping a missing index comparable."""
        # None cannot be ordered against int on Python 3, so the index is
        # split into (has an index, index value); a missing index sorts first.
        return (self.start, self.end, self.index is not None, self.index or 0)

    def __repr__(self):
        item_list = ", ".join("%s=%r" % (k, v) for k, v in vars(self).items())
        return "%s(%s)" % (type(self).__name__, item_list)

    def to_srt(self, strict=True, eol="\n"):
        r"""
        Convert the current :py:class:`Subtitle` to an SRT block.

        :param bool strict: If disabled, will allow blank lines in the content
                            of the SRT block, which is a violation of the SRT
                            standard and may cause your media player to explode
        :param str eol: The end of line string to use (default "\n"); ``None``
                        is treated as "\n"
        :returns: The metadata of the current :py:class:`Subtitle` object as an
                  SRT formatted subtitle block
        :rtype: str
        """
        output_content = self.content
        output_proprietary = self.proprietary

        if output_proprietary:
            # output_proprietary is output directly next to the timestamp, so
            # we need to add the space as a field delimiter.
            output_proprietary = " " + output_proprietary

        if strict:
            output_content = make_legal_content(output_content)

        if eol is None:
            eol = "\n"
        elif eol != "\n":
            # Content is stored with "\n" only; convert to the requested EOL.
            output_content = output_content.replace("\n", eol)

        template = "{idx}{eol}{start} --> {end}{prop}{eol}{content}{eol}{eol}"
        return template.format(
            idx=self.index or 0,
            start=timedelta_to_srt_timestamp(self.start),
            end=timedelta_to_srt_timestamp(self.end),
            prop=output_proprietary,
            content=output_content,
            eol=eol,
        )
|
||||
|
||||
|
||||
def make_legal_content(content):
    r"""
    Remove illegal content from a content block. Illegal content includes:

    * Blank lines
    * Starting or ending with a newline

    .. doctest::

        >>> make_legal_content('\nfoo\n\nbar\n')
        'foo\nbar'
        >>> make_legal_content('foo\n')
        'foo'

    :param str content: The content to make legal
    :returns: The legalised content
    :rtype: str
    """
    # Optimisation: content is usually already legal, so check cheaply before
    # doing any rewriting. The trailing-newline test matters: without it,
    # content such as "foo\n" would be returned untouched even though the
    # rewrite below would have stripped it.
    if (
        content
        and content[0] != "\n"
        and content[-1] != "\n"
        and "\n\n" not in content
    ):
        return content

    legal_content = MULTI_WS_REGEX.sub("\n", content.strip("\n"))
    LOG.info("Legalised content %r to %r", content, legal_content)
    return legal_content
|
||||
|
||||
|
||||
def timedelta_to_srt_timestamp(timedelta_timestamp):
    r"""
    Render a :py:class:`~datetime.timedelta` as an SRT timestamp string.

    Whole days are folded into the hours field, and microseconds are
    truncated to whole milliseconds.

    .. doctest::

        >>> import datetime
        >>> delta = datetime.timedelta(hours=1, minutes=23, seconds=4)
        >>> timedelta_to_srt_timestamp(delta)
        '01:23:04,000'

    :param datetime.timedelta timedelta_timestamp: The duration to convert to
                                                   an SRT timestamp
    :returns: The timestamp in SRT format
    :rtype: str
    """
    whole_hours, leftover_secs = divmod(
        timedelta_timestamp.seconds, SECONDS_IN_HOUR
    )
    whole_mins, whole_secs = divmod(leftover_secs, SECONDS_IN_MINUTE)
    fields = (
        # timedelta keeps days apart from seconds; SRT has no days field.
        whole_hours + timedelta_timestamp.days * HOURS_IN_DAY,
        whole_mins,
        whole_secs,
        timedelta_timestamp.microseconds // MICROSECONDS_IN_MILLISECOND,
    )
    return "%02d:%02d:%02d,%03d" % fields
|
||||
|
||||
|
||||
def srt_timestamp_to_timedelta(timestamp):
    r"""
    Turn an SRT timestamp string into a :py:class:`~datetime.timedelta`.

    .. doctest::

        >>> srt_timestamp_to_timedelta('01:23:04,000')
        datetime.timedelta(seconds=4984)

    :param str timestamp: A timestamp in SRT format
    :returns: The timestamp as a :py:class:`~datetime.timedelta`
    :rtype: datetime.timedelta
    :raises TimestampParseError: If the timestamp is not parseable
    """
    parsed = TS_REGEX.match(timestamp)
    if parsed is None:
        raise TimestampParseError("Unparseable timestamp: {}".format(timestamp))

    # The final field may be captured as an empty string; count that as zero.
    hrs, mins, secs, msecs = (int(field or 0) for field in parsed.groups())
    return timedelta(hours=hrs, minutes=mins, seconds=secs, milliseconds=msecs)
|
||||
|
||||
|
||||
def sort_and_reindex(subtitles, start_index=1, in_place=False, skip=True):
    """
    Sort subtitles and renumber them consecutively in the sorted order.

    This is useful after timings have been edited, when the existing order
    and indexes may no longer match the start times.

    If skip=True, subtitles considered not useful are dropped and do not
    consume an index. Currently a subtitle is "not useful" when:

    - Content is empty, or only whitespace
    - The start time is negative
    - The start time is equal to or later than the end time

    .. doctest::

        >>> from datetime import timedelta
        >>> one = timedelta(seconds=1)
        >>> two = timedelta(seconds=2)
        >>> three = timedelta(seconds=3)
        >>> subs = [
        ...     Subtitle(index=999, start=one, end=two, content='1'),
        ...     Subtitle(index=0, start=two, end=three, content='2'),
        ... ]
        >>> list(sort_and_reindex(subs))  # doctest: +ELLIPSIS
        [Subtitle(...index=1...), Subtitle(...index=2...)]

    :param subtitles: :py:class:`Subtitle` objects in any order
    :param int start_index: The index to start from
    :param bool in_place: Whether to modify subs in-place for performance
                          (version <=1.0.0 behaviour)
    :param bool skip: Whether to skip subtitles considered not useful (see
                      above for rules)
    :returns: The sorted subtitles
    :rtype: :term:`generator` of :py:class:`Subtitle` objects
    """
    next_index = start_index

    for original in sorted(subtitles):
        # Unless asked to work in place, renumber a copy so that the caller's
        # objects keep their original index.
        current = original if in_place else Subtitle(**vars(original))

        if skip:
            try:
                _should_skip_sub(current)
            except _ShouldSkipException as reason:
                if current.index is None:
                    LOG.info("Skipped subtitle with no index: %s", reason)
                else:
                    LOG.info(
                        "Skipped subtitle at index %d: %s", current.index, reason
                    )
                continue

        current.index = next_index
        next_index += 1
        yield current
|
||||
|
||||
|
||||
def _should_skip_sub(subtitle):
    """
    Raise if any rule in SUBTITLE_SKIP_CONDITIONS applies to the subtitle.

    Rules are tried in order; the message of the first one whose check
    returns a truthy value becomes the exception message.

    :param subtitle: A :py:class:`Subtitle` to check whether to skip
    :raises _ShouldSkipException: If the subtitle should be skipped
    """
    first_reason = next(
        (
            info_msg
            for info_msg, sub_skipper in SUBTITLE_SKIP_CONDITIONS
            if sub_skipper(subtitle)
        ),
        None,
    )
    if first_reason is not None:
        raise _ShouldSkipException(first_reason)
|
||||
|
||||
|
||||
def parse(srt, ignore_errors=False):
    r'''
    Lazily turn SRT formatted text (in Python 2, a :class:`unicode` object)
    into :py:class:`Subtitle` objects.

    The parser tolerates defects found in many real-world SRT files, most
    notably blank lines inside a subtitle's content.

    .. doctest::

        >>> subs = parse("""\
        ... 422
        ... 00:31:39,931 --> 00:31:41,931
        ... Using mainly spoons,
        ...
        ... 423
        ... 00:31:41,933 --> 00:31:43,435
        ... we dig a tunnel under the city and release it into the wild.
        ...
        ... """)
        >>> list(subs)  # doctest: +ELLIPSIS
        [Subtitle(...index=422...), Subtitle(...index=423...)]

    :param srt: Subtitles in SRT format
    :type srt: str or a file-like object
    :param ignore_errors: If True, garbled SRT data will be ignored, and we'll
                          continue trying to parse the rest of the file,
                          instead of raising :py:class:`SRTParseError` and
                          stopping execution.
    :returns: The subtitles contained in the SRT file as :py:class:`Subtitle`
              objects
    :rtype: :term:`generator` of :py:class:`Subtitle` objects
    :raises SRTParseError: If the matches are not contiguous and
                           ``ignore_errors`` is False.
    '''
    # finditer needs the complete text, so file objects are read in full here.
    if isinstance(srt, FILE_TYPES):
        srt = srt.read()

    # Offset at which the next block must begin for the input to be gap-free.
    cursor = 0

    for match in SRT_REGEX.finditer(srt):
        _check_contiguity(srt, cursor, match.start(), ignore_errors)
        raw_index, raw_start, raw_end, proprietary, content = match.groups()

        # pytype sees Optional[str] here and complains, but content is a
        # mandatory group in the regex and so is never None for a match.
        content = content.replace("\r\n", "\n")  # pytype: disable=attribute-error

        if raw_index is None:
            # No index line was present; to_srt copes with that on output.
            index = None
        else:
            try:
                index = int(raw_index)
            except ValueError:
                # Index such as "123.4": keep only the integer part. Kept off
                # the main path because it is rare.
                index = int(raw_index.split(".")[0])

        yield Subtitle(
            index=index,
            start=srt_timestamp_to_timedelta(raw_start),
            end=srt_timestamp_to_timedelta(raw_end),
            content=content,
            proprietary=proprietary,
        )

        cursor = match.end()

    # Whatever follows the last block must also be accounted for.
    _check_contiguity(srt, cursor, len(srt), ignore_errors)
|
||||
|
||||
|
||||
def _check_contiguity(srt, expected_start, actual_start, warn_only):
    """
    Verify that a match begins exactly where the previous one ended.

    When it does not, the text in between was not matched. That text is
    tolerated silently if it sits at the very start of the input and is only
    whitespace or a lone byte order mark. Otherwise a warning is logged if
    ``warn_only`` is true, and :py:class:`SRTParseError` is raised if not.

    :param str srt: The data being matched
    :param int expected_start: The expected next start, as from the last
                               iteration's match.end()
    :param int actual_start: The actual start, as from this iteration's
                             match.start()
    :param bool warn_only: Log a warning instead of raising
    :raises SRTParseError: If the matches are not contiguous and ``warn_only``
                           is False
    """
    if expected_start == actual_start:
        return

    unmatched_content = srt[expected_start:actual_start]

    # #50: Leading whitespace has nowhere to be captured like in an
    # intermediate subtitle
    harmless_prefix = expected_start == 0 and (
        unmatched_content.isspace() or unmatched_content == "\ufeff"
    )
    if harmless_prefix:
        return

    if not warn_only:
        raise SRTParseError(expected_start, actual_start, unmatched_content)

    LOG.warning("Skipped unparseable SRT data: %r", unmatched_content)
|
||||
|
||||
|
||||
def compose(
    subtitles, reindex=True, start_index=1, strict=True, eol=None, in_place=False
):
    r"""
    Join the SRT blocks of a series of :py:class:`Subtitle` objects into one
    string.

    .. doctest::

        >>> from datetime import timedelta
        >>> start = timedelta(seconds=1)
        >>> end = timedelta(seconds=2)
        >>> subs = [
        ...     Subtitle(index=1, start=start, end=end, content='x'),
        ...     Subtitle(index=2, start=start, end=end, content='y'),
        ... ]
        >>> compose(subs)  # doctest: +ELLIPSIS
        '1\n00:00:01,000 --> 00:00:02,000\nx\n\n2\n00:00:01,000 --> ...'

    :param subtitles: The subtitles to convert to SRT blocks
    :type subtitles: :term:`iterator` of :py:class:`Subtitle` objects
    :param bool reindex: Whether to reindex subtitles based on start time
    :param int start_index: If reindexing, the index to start reindexing from
    :param bool strict: Whether to enable strict mode, see
                        :py:func:`Subtitle.to_srt` for more information
    :param str eol: The end of line string to use (default "\n")
    :param bool in_place: Whether to reindex subs in-place for performance
                          (version <=1.0.0 behaviour)
    :returns: A single SRT formatted string, with each input
              :py:class:`Subtitle` represented as an SRT block
    :rtype: str
    """
    source = subtitles
    if reindex:
        source = sort_and_reindex(
            subtitles, start_index=start_index, in_place=in_place
        )

    blocks = (sub.to_srt(strict=strict, eol=eol) for sub in source)
    return "".join(blocks)
|
||||
|
||||
|
||||
class SRTParseError(Exception):
    """
    Signals that some stretch of the SRT input could not be parsed.

    The three constructor arguments are kept as attributes of the same names.

    :param int expected_start: The expected contiguous start index
    :param int actual_start: The actual non-contiguous start index
    :param str unmatched_content: The content between the expected start index
                                  and the actual start index
    """

    def __init__(self, expected_start, actual_start, unmatched_content):
        self.expected_start = expected_start
        self.actual_start = actual_start
        self.unmatched_content = unmatched_content

        template = (
            "Expected contiguous start of match or end of input at char %d, "
            "but started at char %d (unmatched content: %r)"
        )
        details = (expected_start, actual_start, unmatched_content)
        super(SRTParseError, self).__init__(template % details)
|
||||
|
||||
|
||||
class TimestampParseError(ValueError):
    """
    Signals that a string could not be parsed as an SRT timestamp.

    Being a :py:class:`ValueError` subclass, it is also caught by handlers
    written for ``ValueError``.
    """
|
||||
|
||||
|
||||
class _ShouldSkipException(Exception):
|
||||
"""
|
||||
Raised when a subtitle should be skipped.
|
||||
"""
|
||||
Reference in New Issue
Block a user