Added support for multiple tags on a single model

2024-05-18 15:52:50 -06:00
parent 8ddce304b2
commit 02acbb2d70
571 changed files with 76910 additions and 127 deletions
@@ -0,0 +1,340 @@
+from encodings.aliases import aliases
+from hashlib import sha256
+from json import dumps
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+from .constant import TOO_BIG_SEQUENCE
+from .utils import iana_name, is_multi_byte_encoding, unicode_range
+
+
+class CharsetMatch:
+    def __init__(
+        self,
+        payload: bytes,
+        guessed_encoding: str,
+        mean_mess_ratio: float,
+        has_sig_or_bom: bool,
+        languages: "CoherenceMatches",
+        decoded_payload: Optional[str] = None,
+    ):
+        self._payload: bytes = payload
+
+        self._encoding: str = guessed_encoding
+        self._mean_mess_ratio: float = mean_mess_ratio
+        self._languages: CoherenceMatches = languages
+        self._has_sig_or_bom: bool = has_sig_or_bom
+        self._unicode_ranges: Optional[List[str]] = None
+
+        self._leaves: List[CharsetMatch] = []
+        self._mean_coherence_ratio: float = 0.0
+
+        self._output_payload: Optional[bytes] = None
+        self._output_encoding: Optional[str] = None
+
+        self._string: Optional[str] = decoded_payload
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, CharsetMatch):
+            raise TypeError(
+                "__eq__ cannot be invoked on {} and {}.".format(
+                    str(other.__class__), str(self.__class__)
+                )
+            )
+        return self.encoding == other.encoding and self.fingerprint == other.fingerprint
+
+    def __lt__(self, other: object) -> bool:
+        """
+        Implemented to make sorted available upon CharsetMatches items.
+        """
+        if not isinstance(other, CharsetMatch):
+            raise ValueError
+
+        chaos_difference: float = abs(self.chaos - other.chaos)
+        coherence_difference: float = abs(self.coherence - other.coherence)
+
+        # Below 1% difference --> Use Coherence
+        if chaos_difference < 0.01 and coherence_difference > 0.02:
+            return self.coherence > other.coherence
+        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
+            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
+            # preserve RAM usage!
+            if len(self._payload) >= TOO_BIG_SEQUENCE:
+                return self.chaos < other.chaos
+            return self.multi_byte_usage > other.multi_byte_usage
+
+        return self.chaos < other.chaos
+
+    @property
+    def multi_byte_usage(self) -> float:
+        return 1.0 - (len(str(self)) / len(self.raw))
+
+    def __str__(self) -> str:
+        # Lazy Str Loading
+        if self._string is None:
+            self._string = str(self._payload, self._encoding, "strict")
+        return self._string
+
+    def __repr__(self) -> str:
+        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)
+
+    def add_submatch(self, other: "CharsetMatch") -> None:
+        if not isinstance(other, CharsetMatch) or other == self:
+            raise ValueError(
+                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
+                    other.__class__
+                )
+            )
+
+        other._string = None  # Unload RAM usage; dirty trick.
+        self._leaves.append(other)
+
+    @property
+    def encoding(self) -> str:
+        return self._encoding
+
+    @property
+    def encoding_aliases(self) -> List[str]:
+        """
+        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
+        """
+        also_known_as: List[str] = []
+        for u, p in aliases.items():
+            if self.encoding == u:
+                also_known_as.append(p)
+            elif self.encoding == p:
+                also_known_as.append(u)
+        return also_known_as
+
+    @property
+    def bom(self) -> bool:
+        return self._has_sig_or_bom
+
+    @property
+    def byte_order_mark(self) -> bool:
+        return self._has_sig_or_bom
+
+    @property
+    def languages(self) -> List[str]:
+        """
+        Return the complete list of possible languages found in decoded sequence.
+        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
+        """
+        return [e[0] for e in self._languages]
+
+    @property
+    def language(self) -> str:
+        """
+        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
+        "Unknown".
+        """
+        if not self._languages:
+            # Trying to infer the language based on the given encoding
+            # Its either English or we should not pronounce ourselves in certain cases.
+            if "ascii" in self.could_be_from_charset:
+                return "English"
+
+            # doing it there to avoid circular import
+            from charset_normalizer.cd import encoding_languages, mb_encoding_languages
+
+            languages = (
+                mb_encoding_languages(self.encoding)
+                if is_multi_byte_encoding(self.encoding)
+                else encoding_languages(self.encoding)
+            )
+
+            if len(languages) == 0 or "Latin Based" in languages:
+                return "Unknown"
+
+            return languages[0]
+
+        return self._languages[0][0]
+
+    @property
+    def chaos(self) -> float:
+        return self._mean_mess_ratio
+
+    @property
+    def coherence(self) -> float:
+        if not self._languages:
+            return 0.0
+        return self._languages[0][1]
+
+    @property
+    def percent_chaos(self) -> float:
+        return round(self.chaos * 100, ndigits=3)
+
+    @property
+    def percent_coherence(self) -> float:
+        return round(self.coherence * 100, ndigits=3)
+
+    @property
+    def raw(self) -> bytes:
+        """
+        Original untouched bytes.
+        """
+        return self._payload
+
+    @property
+    def submatch(self) -> List["CharsetMatch"]:
+        return self._leaves
+
+    @property
+    def has_submatch(self) -> bool:
+        return len(self._leaves) > 0
+
+    @property
+    def alphabets(self) -> List[str]:
+        if self._unicode_ranges is not None:
+            return self._unicode_ranges
+        # list detected ranges
+        detected_ranges: List[Optional[str]] = [
+            unicode_range(char) for char in str(self)
+        ]
+        # filter and sort
+        self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
+        return self._unicode_ranges
+
+    @property
+    def could_be_from_charset(self) -> List[str]:
+        """
+        The complete list of encoding that output the exact SAME str result and therefore could be the originating
+        encoding.
+        This list does include the encoding available in property 'encoding'.
+        """
+        return [self._encoding] + [m.encoding for m in self._leaves]
+
+    def output(self, encoding: str = "utf_8") -> bytes:
+        """
+        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
+        Any errors will be simply ignored by the encoder NOT replaced.
+        """
+        if self._output_encoding is None or self._output_encoding != encoding:
+            self._output_encoding = encoding
+            self._output_payload = str(self).encode(encoding, "replace")
+
+        return self._output_payload  # type: ignore
+
+    @property
+    def fingerprint(self) -> str:
+        """
+        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
+        """
+        return sha256(self.output()).hexdigest()
+
+
+class CharsetMatches:
+    """
+    Container with every CharsetMatch items ordered by default from most probable to the less one.
+    Act like a list(iterable) but does not implements all related methods.
+    """
+
+    def __init__(self, results: Optional[List[CharsetMatch]] = None):
+        self._results: List[CharsetMatch] = sorted(results) if results else []
+
+    def __iter__(self) -> Iterator[CharsetMatch]:
+        yield from self._results
+
+    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
+        """
+        Retrieve a single item either by its position or encoding name (alias may be used here).
+        Raise KeyError upon invalid index or encoding not present in results.
+        """
+        if isinstance(item, int):
+            return self._results[item]
+        if isinstance(item, str):
+            item = iana_name(item, False)
+            for result in self._results:
+                if item in result.could_be_from_charset:
+                    return result
+        raise KeyError
+
+    def __len__(self) -> int:
+        return len(self._results)
+
+    def __bool__(self) -> bool:
+        return len(self._results) > 0
+
+    def append(self, item: CharsetMatch) -> None:
+        """
+        Insert a single match. Will be inserted accordingly to preserve sort.
+        Can be inserted as a submatch.
+        """
+        if not isinstance(item, CharsetMatch):
+            raise ValueError(
+                "Cannot append instance '{}' to CharsetMatches".format(
+                    str(item.__class__)
+                )
+            )
+        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
+        if len(item.raw) <= TOO_BIG_SEQUENCE:
+            for match in self._results:
+                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
+                    match.add_submatch(item)
+                    return
+        self._results.append(item)
+        self._results = sorted(self._results)
+
+    def best(self) -> Optional["CharsetMatch"]:
+        """
+        Simply return the first match. Strict equivalent to matches[0].
+        """
+        if not self._results:
+            return None
+        return self._results[0]
+
+    def first(self) -> Optional["CharsetMatch"]:
+        """
+        Redundant method, call the method best(). Kept for BC reasons.
+        """
+        return self.best()
+
+
+# One (language, ratio) pair: CharsetMatch reads index 0 as the language name
+# and index 1 as the coherence value.
+CoherenceMatch = Tuple[str, float]
+# List of such pairs; CharsetMatch uses the first entry as its main language.
+CoherenceMatches = List[CoherenceMatch]
+
+
+class CliDetectionResult:
+    def __init__(
+        self,
+        path: str,
+        encoding: Optional[str],
+        encoding_aliases: List[str],
+        alternative_encodings: List[str],
+        language: str,
+        alphabets: List[str],
+        has_sig_or_bom: bool,
+        chaos: float,
+        coherence: float,
+        unicode_path: Optional[str],
+        is_preferred: bool,
+    ):
+        self.path: str = path
+        self.unicode_path: Optional[str] = unicode_path
+        self.encoding: Optional[str] = encoding
+        self.encoding_aliases: List[str] = encoding_aliases
+        self.alternative_encodings: List[str] = alternative_encodings
+        self.language: str = language
+        self.alphabets: List[str] = alphabets
+        self.has_sig_or_bom: bool = has_sig_or_bom
+        self.chaos: float = chaos
+        self.coherence: float = coherence
+        self.is_preferred: bool = is_preferred
+
+    @property
+    def __dict__(self) -> Dict[str, Any]:  # type: ignore
+        return {
+            "path": self.path,
+            "encoding": self.encoding,
+            "encoding_aliases": self.encoding_aliases,
+            "alternative_encodings": self.alternative_encodings,
+            "language": self.language,
+            "alphabets": self.alphabets,
+            "has_sig_or_bom": self.has_sig_or_bom,
+            "chaos": self.chaos,
+            "coherence": self.coherence,
+            "unicode_path": self.unicode_path,
+            "is_preferred": self.is_preferred,
+        }
+
+    def to_json(self) -> str:
+        return dumps(self.__dict__, ensure_ascii=True, indent=4)
@@ -0,0 +1,341 @@
+from __future__ import annotations
+
+import email.utils
+import mimetypes
+import typing
+
+# A field or header-parameter value: text or raw bytes.
+_TYPE_FIELD_VALUE = typing.Union[str, bytes]
+# Shapes accepted by RequestField.from_tuples: a bare value, a
+# (filename, data) pair, or a (filename, data, MIME type) triple.
+_TYPE_FIELD_VALUE_TUPLE = typing.Union[
+    _TYPE_FIELD_VALUE,
+    typing.Tuple[str, _TYPE_FIELD_VALUE],
+    typing.Tuple[str, _TYPE_FIELD_VALUE, str],
+]
+
+
+def guess_content_type(
+    filename: str | None, default: str = "application/octet-stream"
+) -> str:
+    """
+    Guess the "Content-Type" of a file.
+
+    :param filename:
+        The filename to guess the "Content-Type" of using :mod:`mimetypes`.
+    :param default:
+        If no "Content-Type" can be guessed, default to `default`.
+    """
+    if filename:
+        return mimetypes.guess_type(filename)[0] or default
+    return default
+
+
+def format_header_param_rfc2231(name: str, value: _TYPE_FIELD_VALUE) -> str:
+    """
+    Helper function to format and quote a single header parameter using the
+    strategy defined in RFC 2231.
+
+    Particularly useful for header parameters which might contain
+    non-ASCII values, like file names. This follows
+    `RFC 2388 Section 4.4 <https://tools.ietf.org/html/rfc2388#section-4.4>`_.
+
+    :param name:
+        The name of the parameter, a string expected to be ASCII only.
+    :param value:
+        The value of the parameter, provided as ``bytes`` or `str``.
+    :returns:
+        An RFC-2231-formatted unicode string.
+
+    .. deprecated:: 2.0.0
+        Will be removed in urllib3 v2.1.0. This is not valid for
+        ``multipart/form-data`` header parameters.
+    """
+    import warnings
+
+    warnings.warn(
+        "'format_header_param_rfc2231' is deprecated and will be "
+        "removed in urllib3 v2.1.0. This is not valid for "
+        "multipart/form-data header parameters.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+
+    if isinstance(value, bytes):
+        value = value.decode("utf-8")
+
+    if not any(ch in value for ch in '"\\\r\n'):
+        result = f'{name}="{value}"'
+        try:
+            result.encode("ascii")
+        except (UnicodeEncodeError, UnicodeDecodeError):
+            pass
+        else:
+            return result
+
+    value = email.utils.encode_rfc2231(value, "utf-8")
+    value = f"{name}*={value}"
+
+    return value
+
+
+def format_multipart_header_param(name: str, value: _TYPE_FIELD_VALUE) -> str:
+    """
+    Format and quote a single multipart header parameter.
+
+    This follows the `WHATWG HTML Standard`_ as of 2021/06/10, matching
+    the behavior of current browser and curl versions. Values are
+    assumed to be UTF-8. The ``\\n``, ``\\r``, and ``"`` characters are
+    percent encoded.
+
+    .. _WHATWG HTML Standard:
+        https://html.spec.whatwg.org/multipage/
+        form-control-infrastructure.html#multipart-form-data
+
+    :param name:
+        The name of the parameter, an ASCII-only ``str``.
+    :param value:
+        The value of the parameter, a ``str`` or UTF-8 encoded
+        ``bytes``.
+    :returns:
+        A string ``name="value"`` with the escaped value.
+
+    .. versionchanged:: 2.0.0
+        Matches the WHATWG HTML Standard as of 2021/06/10. Control
+        characters are no longer percent encoded.
+
+    .. versionchanged:: 2.0.0
+        Renamed from ``format_header_param_html5`` and
+        ``format_header_param``. The old names will be removed in
+        urllib3 v2.1.0.
+    """
+    if isinstance(value, bytes):
+        value = value.decode("utf-8")
+
+    # percent encode \n \r "
+    value = value.translate({10: "%0A", 13: "%0D", 34: "%22"})
+    return f'{name}="{value}"'
+
+
+def format_header_param_html5(name: str, value: _TYPE_FIELD_VALUE) -> str:
+    """
+    .. deprecated:: 2.0.0
+        Renamed to :func:`format_multipart_header_param`. Will be
+        removed in urllib3 v2.1.0.
+    """
+    import warnings
+
+    warnings.warn(
+        "'format_header_param_html5' has been renamed to "
+        "'format_multipart_header_param'. The old name will be "
+        "removed in urllib3 v2.1.0.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    return format_multipart_header_param(name, value)
+
+
+def format_header_param(name: str, value: _TYPE_FIELD_VALUE) -> str:
+    """
+    .. deprecated:: 2.0.0
+        Renamed to :func:`format_multipart_header_param`. Will be
+        removed in urllib3 v2.1.0.
+    """
+    import warnings
+
+    warnings.warn(
+        "'format_header_param' has been renamed to "
+        "'format_multipart_header_param'. The old name will be "
+        "removed in urllib3 v2.1.0.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    return format_multipart_header_param(name, value)
+
+
+class RequestField:
+    """
+    A data container for request body parameters.
+
+    :param name:
+        The name of this request field. Must be unicode.
+    :param data:
+        The data/value body.
+    :param filename:
+        An optional filename of the request field. Must be unicode.
+    :param headers:
+        An optional dict-like object of headers to initially use for the field.
+
+    .. versionchanged:: 2.0.0
+        The ``header_formatter`` parameter is deprecated and will
+        be removed in urllib3 v2.1.0.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        data: _TYPE_FIELD_VALUE,
+        filename: str | None = None,
+        headers: typing.Mapping[str, str] | None = None,
+        header_formatter: typing.Callable[[str, _TYPE_FIELD_VALUE], str] | None = None,
+    ):
+        self._name = name
+        self._filename = filename
+        self.data = data
+        self.headers: dict[str, str | None] = {}
+        if headers:
+            self.headers = dict(headers)
+
+        if header_formatter is not None:
+            import warnings
+
+            warnings.warn(
+                "The 'header_formatter' parameter is deprecated and "
+                "will be removed in urllib3 v2.1.0.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            self.header_formatter = header_formatter
+        else:
+            self.header_formatter = format_multipart_header_param
+
+    @classmethod
+    def from_tuples(
+        cls,
+        fieldname: str,
+        value: _TYPE_FIELD_VALUE_TUPLE,
+        header_formatter: typing.Callable[[str, _TYPE_FIELD_VALUE], str] | None = None,
+    ) -> RequestField:
+        """
+        A :class:`~urllib3.fields.RequestField` factory from old-style tuple parameters.
+
+        Supports constructing :class:`~urllib3.fields.RequestField` from
+        parameter of key/value strings AND key/filetuple. A filetuple is a
+        (filename, data, MIME type) tuple where the MIME type is optional.
+        For example::
+
+            'foo': 'bar',
+            'fakefile': ('foofile.txt', 'contents of foofile'),
+            'realfile': ('barfile.txt', open('realfile').read()),
+            'typedfile': ('bazfile.bin', open('bazfile').read(), 'image/jpeg'),
+            'nonamefile': 'contents of nonamefile field',
+
+        Field names and filenames must be unicode.
+        """
+        filename: str | None
+        content_type: str | None
+        data: _TYPE_FIELD_VALUE
+
+        if isinstance(value, tuple):
+            if len(value) == 3:
+                filename, data, content_type = value
+            else:
+                filename, data = value
+                content_type = guess_content_type(filename)
+        else:
+            filename = None
+            content_type = None
+            data = value
+
+        request_param = cls(
+            fieldname, data, filename=filename, header_formatter=header_formatter
+        )
+        request_param.make_multipart(content_type=content_type)
+
+        return request_param
+
+    def _render_part(self, name: str, value: _TYPE_FIELD_VALUE) -> str:
+        """
+        Override this method to change how each multipart header
+        parameter is formatted. By default, this calls
+        :func:`format_multipart_header_param`.
+
+        :param name:
+            The name of the parameter, an ASCII-only ``str``.
+        :param value:
+            The value of the parameter, a ``str`` or UTF-8 encoded
+            ``bytes``.
+
+        :meta public:
+        """
+        return self.header_formatter(name, value)
+
+    def _render_parts(
+        self,
+        header_parts: (
+            dict[str, _TYPE_FIELD_VALUE | None]
+            | typing.Sequence[tuple[str, _TYPE_FIELD_VALUE | None]]
+        ),
+    ) -> str:
+        """
+        Helper function to format and quote a single header.
+
+        Useful for single headers that are composed of multiple items. E.g.,
+        'Content-Disposition' fields.
+
+        :param header_parts:
+            A sequence of (k, v) tuples or a :class:`dict` of (k, v) to format
+            as `k1="v1"; k2="v2"; ...`.
+        """
+        iterable: typing.Iterable[tuple[str, _TYPE_FIELD_VALUE | None]]
+
+        parts = []
+        if isinstance(header_parts, dict):
+            iterable = header_parts.items()
+        else:
+            iterable = header_parts
+
+        for name, value in iterable:
+            if value is not None:
+                parts.append(self._render_part(name, value))
+
+        return "; ".join(parts)
+
+    def render_headers(self) -> str:
+        """
+        Renders the headers for this request field.
+        """
+        lines = []
+
+        sort_keys = ["Content-Disposition", "Content-Type", "Content-Location"]
+        for sort_key in sort_keys:
+            if self.headers.get(sort_key, False):
+                lines.append(f"{sort_key}: {self.headers[sort_key]}")
+
+        for header_name, header_value in self.headers.items():
+            if header_name not in sort_keys:
+                if header_value:
+                    lines.append(f"{header_name}: {header_value}")
+
+        lines.append("\r\n")
+        return "\r\n".join(lines)
+
+    def make_multipart(
+        self,
+        content_disposition: str | None = None,
+        content_type: str | None = None,
+        content_location: str | None = None,
+    ) -> None:
+        """
+        Makes this request field into a multipart request field.
+
+        This method overrides "Content-Disposition", "Content-Type" and
+        "Content-Location" headers to the request parameter.
+
+        :param content_disposition:
+            The 'Content-Disposition' of the request body. Defaults to 'form-data'
+        :param content_type:
+            The 'Content-Type' of the request body.
+        :param content_location:
+            The 'Content-Location' of the request body.
+
+        """
+        content_disposition = (content_disposition or "form-data") + "; ".join(
+            [
+                "",
+                self._render_parts(
+                    (("name", self._name), ("filename", self._filename))
+                ),
+            ]
+        )
+
+        self.headers["Content-Disposition"] = content_disposition
+        self.headers["Content-Type"] = content_type
+        self.headers["Content-Location"] = content_location
@@ -0,0 +1,2 @@
+# Version string exposed by this module.
+__version__ = '3.7'
+