Added support for multiple tags on a single model

This commit is contained in:
jeffser
2024-05-18 15:52:50 -06:00
parent 8ddce304b2
commit 02acbb2d70
571 changed files with 76910 additions and 127 deletions

View File

@@ -0,0 +1,340 @@
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
from .constant import TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range
class CharsetMatch:
    """
    One candidate decoding of a byte payload.

    Stores the raw bytes, the guessed encoding, the mess ("chaos") ratio and
    the language coherence guesses. Other matches that decode to the same
    text can be attached as submatches. The decoded text and the re-encoded
    output are computed lazily and cached on the instance.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original, untouched bytes.
        :param guessed_encoding: Name of the encoding used to decode ``payload``.
        :param mean_mess_ratio: Mess ratio exposed through :attr:`chaos`.
        :param has_sig_or_bom: Whether a signature/BOM was found.
        :param languages: ``(language, coherence ratio)`` pairs; the first
            entry is the one reported by :attr:`language` / :attr:`coherence`.
        :param decoded_payload: Already decoded text, if available. When
            omitted, the payload is decoded on the first ``str()`` call.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        self._unicode_ranges: Optional[List[str]] = None

        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when both encoding and fingerprint agree.

        :raises TypeError: if ``other`` is not a :class:`CharsetMatch`.
        """
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Ordering used by ``sorted`` on collections of matches: a match that
        compares lower is the more probable one.

        :raises ValueError: if ``other`` is not a :class:`CharsetMatch`.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Chaos within 1% of each other: let coherence decide when it differs enough.
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # Hard call: prefer the result that decoded the most multi-byte
            # sequences. Skipped on big payloads because it forces a full
            # decode (RAM usage).
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        Share of the raw bytes "saved" by decoding: ``1 - len(text) / len(raw)``.

        Returns ``0.0`` for an empty payload instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """Return the decoded payload, decoding it (strictly) on first use."""
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Attach ``other`` as an alternative encoding of this match.

        :raises ValueError: if ``other`` is not a :class:`CharsetMatch` or is
            equal to this match.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        # Drop the submatch's cached text to save RAM; it is re-decoded on demand.
        other._string = None
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """Name of the guessed encoding."""
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Other names of :attr:`encoding` found in ``encodings.aliases``.

        Encodings are known under several names; this helps when searching
        for IBM855 while it is listed as CP855.
        """
        also_known_as: List[str] = []
        for alias, target in aliases.items():
            if self.encoding == alias:
                also_known_as.append(target)
            elif self.encoding == target:
                also_known_as.append(alias)
        return also_known_as

    @property
    def bom(self) -> bool:
        """Whether a signature/BOM was reported for the payload."""
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """Same value as :attr:`bom`."""
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Every language name present in the coherence guesses.

        May be empty even when :attr:`language` returns something other than
        "Unknown", because :attr:`language` can infer a value from the encoding.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language of the decoded text.

        Falls back to an inference from the encoding when no coherence guess
        exists, and to "Unknown" when nothing can be inferred.
        """
        if not self._languages:
            # Without coherence data, infer from the encoding: either it is
            # English (ASCII-compatible) or we look up the encoding's languages.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # Imported here to avoid a circular import.
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        """Mean mess ratio given at construction."""
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Coherence ratio of the first language guess, or ``0.0`` without guesses."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """:attr:`chaos` as a percentage rounded to 3 digits."""
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """:attr:`coherence` as a percentage rounded to 3 digits."""
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """Original untouched bytes."""
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        """Matches attached through :meth:`add_submatch`."""
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        """Whether at least one submatch was attached."""
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """Sorted, de-duplicated unicode range names of the decoded text (cached)."""
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # Range of every decoded character; unknown ranges come back falsy.
        detected_ranges: List[Optional[str]] = [
            unicode_range(char) for char in str(self)
        ]
        # Filter out the unknown ones, de-duplicate and sort.
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        Every encoding that yields the exact same text and could therefore be
        the originating one. Includes :attr:`encoding` itself, listed first.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Return the decoded text re-encoded with ``encoding`` (UTF-8 by default).

        Characters the target encoding cannot represent are replaced (the
        ``"replace"`` error handler), not dropped. The result is cached for
        the most recently requested encoding.

        :raises LookupError: if ``encoding`` is not a known codec.
        """
        if self._output_payload is None or self._output_encoding != encoding:
            # Encode first and only then update the cache, so that a failing
            # call cannot leave the cache claiming an encoding whose payload
            # was never produced.
            self._output_payload = str(self).encode(encoding, "replace")
            self._output_encoding = encoding

        return self._output_payload

    @property
    def fingerprint(self) -> str:
        """
        SHA256 hex digest of the re-encoded (:meth:`output`) payload, not of
        the original bytes.
        """
        return sha256(self.output()).hexdigest()
class CharsetMatches:
    """
    Container of CharsetMatch items, kept sorted from the most probable to
    the least probable one.

    Behaves like a list for iteration, ``len()``, truthiness and indexing,
    but does not implement the whole list interface.
    """

    def __init__(self, results: Optional[List["CharsetMatch"]] = None):
        """
        :param results: Initial matches; they are sorted on construction.
        """
        self._results: List[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator["CharsetMatch"]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> "CharsetMatch":
        """
        Retrieve a single match by position or by encoding name (aliases are
        accepted for names).

        :raises IndexError: if an integer position is out of range.
        :raises KeyError: if no match could originate from the given encoding
            name, or if ``item`` is neither an ``int`` nor a ``str``.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            # Normalise the requested name before comparing it with the
            # encodings each match could originate from.
            wanted = iana_name(item, False)
            for result in self._results:
                if wanted in result.could_be_from_charset:
                    return result
        # Carry the requested key so the failure can be diagnosed.
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return bool(self._results)

    def append(self, item: "CharsetMatch") -> None:
        """
        Insert a single match while preserving the sort order.

        When the payload is not too big and an existing match has the same
        fingerprint and chaos, the item is attached to it as a submatch
        instead of being stored as a new result.

        :raises ValueError: if ``item`` is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # Submatch factoring is disabled on heavy inputs to conserve RAM:
        # computing fingerprints requires decoding and re-encoding the payload.
        if len(item.raw) <= TOO_BIG_SEQUENCE:
            for match in self._results:
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Return the first (most probable) match, or ``None`` when empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Alias of :meth:`best`, kept for backward compatibility.
        """
        return self.best()
# One language guess: (language name, coherence ratio).
CoherenceMatch = Tuple[str, float]
# List of language guesses; CharsetMatch reads its first entry as the most
# probable language and as the coherence value.
CoherenceMatches = List[CoherenceMatch]
class CliDetectionResult:
    """
    Plain record describing the detection outcome for one file, serialisable
    to JSON through :meth:`to_json`.
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        """Store every argument unchanged on the attribute of the same name."""
        # Where the file lives.
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path

        # What it was detected as.
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom

        # How the detection scored.
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """Mapping of the exported fields, in the order used for JSON output."""
        exported = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in exported}

    def to_json(self) -> str:
        """Return the exported fields as ASCII-only JSON indented by 4 spaces."""
        payload = self.__dict__
        return dumps(payload, indent=4, ensure_ascii=True)

View File

@@ -0,0 +1,341 @@
from __future__ import annotations
import email.utils
import mimetypes
import typing
# Value of a single field or header parameter: text or raw bytes.
_TYPE_FIELD_VALUE = typing.Union[str, bytes]
# Value forms accepted by RequestField.from_tuples: a bare value, a
# (filename, data) pair, or a (filename, data, MIME type) triple.
_TYPE_FIELD_VALUE_TUPLE = typing.Union[
    _TYPE_FIELD_VALUE,
    typing.Tuple[str, _TYPE_FIELD_VALUE],
    typing.Tuple[str, _TYPE_FIELD_VALUE, str],
]
def guess_content_type(
    filename: str | None, default: str = "application/octet-stream"
) -> str:
    """
    Return the "Content-Type" that :mod:`mimetypes` associates with a filename.

    :param filename:
        The filename to inspect. May be ``None`` or empty.
    :param default:
        Value returned when there is no filename or no type can be guessed.
    """
    if not filename:
        return default

    guessed_type, _content_encoding = mimetypes.guess_type(filename)
    return guessed_type or default
def format_header_param_rfc2231(name: str, value: _TYPE_FIELD_VALUE) -> str:
    """
    Format and quote one header parameter with the RFC 2231 strategy.

    A value that is pure ASCII and free of double quotes, backslashes,
    carriage returns and line feeds is emitted as ``name="value"``. Any
    other value is emitted as ``name*=`` followed by its RFC 2231 encoding
    in UTF-8. This follows
    `RFC 2388 Section 4.4 <https://tools.ietf.org/html/rfc2388#section-4.4>`_.

    :param name:
        The name of the parameter, a string expected to be ASCII only.

    :param value:
        The value of the parameter, provided as ``bytes`` or ``str``.
        ``bytes`` are decoded as UTF-8.

    :returns:
        An RFC-2231-formatted unicode string.

    .. deprecated:: 2.0.0
        Will be removed in urllib3 v2.1.0. This is not valid for
        ``multipart/form-data`` header parameters.
    """
    import warnings

    warnings.warn(
        "'format_header_param_rfc2231' is deprecated and will be "
        "removed in urllib3 v2.1.0. This is not valid for "
        "multipart/form-data header parameters.",
        DeprecationWarning,
        stacklevel=2,
    )

    text = value.decode("utf-8") if isinstance(value, bytes) else value

    # The simple quoted form is only usable when nothing needs escaping and
    # the whole parameter (name included) is plain ASCII.
    if all(ch not in text for ch in '"\\\r\n'):
        quoted = f'{name}="{text}"'
        if quoted.isascii():
            return quoted

    encoded = email.utils.encode_rfc2231(text, "utf-8")
    return f"{name}*={encoded}"
def format_multipart_header_param(name: str, value: _TYPE_FIELD_VALUE) -> str:
    """
    Format and quote a single multipart header parameter.

    Implements the escaping of the `WHATWG HTML Standard`_ as of 2021/06/10,
    which is what current browsers and curl do. Values are assumed to be
    UTF-8. Only the ``\\n``, ``\\r``, and ``"`` characters are percent
    encoded; everything else is passed through untouched.

    .. _WHATWG HTML Standard:
        https://html.spec.whatwg.org/multipage/
        form-control-infrastructure.html#multipart-form-data

    :param name:
        The name of the parameter, an ASCII-only ``str``.

    :param value:
        The value of the parameter, a ``str`` or UTF-8 encoded
        ``bytes``.

    :returns:
        A string ``name="value"`` with the escaped value.

    .. versionchanged:: 2.0.0
        Matches the WHATWG HTML Standard as of 2021/06/10. Control
        characters are no longer percent encoded.

    .. versionchanged:: 2.0.0
        Renamed from ``format_header_param_html5`` and
        ``format_header_param``. The old names will be removed in
        urllib3 v2.1.0.
    """
    text = value.decode("utf-8") if isinstance(value, bytes) else value

    # Line feed, carriage return and double quote are the only escaped characters.
    escapes = str.maketrans({"\n": "%0A", "\r": "%0D", '"': "%22"})
    escaped = text.translate(escapes)

    return f'{name}="{escaped}"'
def format_header_param_html5(name: str, value: _TYPE_FIELD_VALUE) -> str:
    """
    Deprecated alias of :func:`format_multipart_header_param`.

    Emits a :class:`DeprecationWarning`, then forwards both arguments
    unchanged and returns the result.

    .. deprecated:: 2.0.0
        Renamed to :func:`format_multipart_header_param`. Will be
        removed in urllib3 v2.1.0.
    """
    from warnings import warn

    notice = (
        "'format_header_param_html5' has been renamed to "
        "'format_multipart_header_param'. The old name will be "
        "removed in urllib3 v2.1.0."
    )
    # stacklevel=2 attributes the warning to the caller of this alias.
    warn(notice, category=DeprecationWarning, stacklevel=2)

    return format_multipart_header_param(name, value)
def format_header_param(name: str, value: _TYPE_FIELD_VALUE) -> str:
    """
    Deprecated alias of :func:`format_multipart_header_param`.

    Emits a :class:`DeprecationWarning`, then forwards both arguments
    unchanged and returns the result.

    .. deprecated:: 2.0.0
        Renamed to :func:`format_multipart_header_param`. Will be
        removed in urllib3 v2.1.0.
    """
    from warnings import warn

    notice = (
        "'format_header_param' has been renamed to "
        "'format_multipart_header_param'. The old name will be "
        "removed in urllib3 v2.1.0."
    )
    # stacklevel=2 attributes the warning to the caller of this alias.
    warn(notice, category=DeprecationWarning, stacklevel=2)

    return format_multipart_header_param(name, value)
class RequestField:
    """
    Container for one parameter of a request body, together with the
    headers describing it.

    :param name:
        The name of this request field. Must be unicode.
    :param data:
        The data/value body.
    :param filename:
        An optional filename of the request field. Must be unicode.
    :param headers:
        An optional dict-like object of headers to initially use for the field.

    .. versionchanged:: 2.0.0
        The ``header_formatter`` parameter is deprecated and will
        be removed in urllib3 v2.1.0.
    """

    def __init__(
        self,
        name: str,
        data: _TYPE_FIELD_VALUE,
        filename: str | None = None,
        headers: typing.Mapping[str, str] | None = None,
        header_formatter: typing.Callable[[str, _TYPE_FIELD_VALUE], str] | None = None,
    ):
        self._name = name
        self._filename = filename
        self.data = data
        # Copy the caller's mapping so later header changes stay local to this field.
        self.headers: dict[str, str | None] = dict(headers) if headers else {}

        if header_formatter is None:
            self.header_formatter = format_multipart_header_param
        else:
            import warnings

            warnings.warn(
                "The 'header_formatter' parameter is deprecated and "
                "will be removed in urllib3 v2.1.0.",
                DeprecationWarning,
                stacklevel=2,
            )
            self.header_formatter = header_formatter

    @classmethod
    def from_tuples(
        cls,
        fieldname: str,
        value: _TYPE_FIELD_VALUE_TUPLE,
        header_formatter: typing.Callable[[str, _TYPE_FIELD_VALUE], str] | None = None,
    ) -> RequestField:
        """
        Build a :class:`~urllib3.fields.RequestField` from old-style tuple parameters.

        ``value`` is either the field data itself or a filetuple, i.e. a
        (filename, data, MIME type) tuple whose MIME type is optional. When
        the MIME type is left out it is guessed from the filename.

        For example::

            'foo': 'bar',
            'fakefile': ('foofile.txt', 'contents of foofile'),
            'realfile': ('barfile.txt', open('realfile').read()),
            'typedfile': ('bazfile.bin', open('bazfile').read(), 'image/jpeg'),
            'nonamefile': 'contents of nonamefile field',

        Field names and filenames must be unicode.
        """
        filename: str | None = None
        content_type: str | None = None
        data: _TYPE_FIELD_VALUE

        if not isinstance(value, tuple):
            # Bare value: no filename and no content type.
            data = value
        elif len(value) == 3:
            filename, data, content_type = value
        else:
            filename, data = value
            content_type = guess_content_type(filename)

        field = cls(
            fieldname, data, filename=filename, header_formatter=header_formatter
        )
        field.make_multipart(content_type=content_type)
        return field

    def _render_part(self, name: str, value: _TYPE_FIELD_VALUE) -> str:
        """
        Format one multipart header parameter by delegating to
        ``self.header_formatter``. Override this method to change how each
        parameter is formatted. By default the formatter is
        :func:`format_multipart_header_param`.

        :param name:
            The name of the parameter, an ASCII-only ``str``.
        :param value:
            The value of the parameter, a ``str`` or UTF-8 encoded
            ``bytes``.

        :meta public:
        """
        return self.header_formatter(name, value)

    def _render_parts(
        self,
        header_parts: (
            dict[str, _TYPE_FIELD_VALUE | None]
            | typing.Sequence[tuple[str, _TYPE_FIELD_VALUE | None]]
        ),
    ) -> str:
        """
        Format and quote the parameters of a single header.

        Useful for headers that are composed of multiple items, e.g.
        'Content-Disposition'. Parameters whose value is ``None`` are left out.

        :param header_parts:
            A sequence of (k, v) tuples or a :class:`dict` of (k, v) to format
            as `k1="v1"; k2="v2"; ...`.
        """
        pairs: typing.Iterable[tuple[str, _TYPE_FIELD_VALUE | None]]
        pairs = header_parts.items() if isinstance(header_parts, dict) else header_parts

        return "; ".join(
            self._render_part(key, val) for key, val in pairs if val is not None
        )

    def render_headers(self) -> str:
        """
        Render this field's headers as CRLF-separated ``Name: value`` lines
        followed by a blank line.

        'Content-Disposition', 'Content-Type' and 'Content-Location' come
        first, in that order; the remaining headers follow in mapping order.
        Headers with a falsy value are skipped.
        """
        leading = ("Content-Disposition", "Content-Type", "Content-Location")

        rendered = [
            f"{key}: {self.headers[key]}"
            for key in leading
            if self.headers.get(key, False)
        ]
        rendered.extend(
            f"{key}: {val}"
            for key, val in self.headers.items()
            if key not in leading and val
        )

        # The extra element yields the empty line that closes the header block.
        rendered.append("\r\n")
        return "\r\n".join(rendered)

    def make_multipart(
        self,
        content_disposition: str | None = None,
        content_type: str | None = None,
        content_location: str | None = None,
    ) -> None:
        """
        Turn this request field into a multipart request field.

        Overwrites the "Content-Disposition", "Content-Type" and
        "Content-Location" headers of the field.

        :param content_disposition:
            The 'Content-Disposition' of the request body. Defaults to 'form-data'
        :param content_type:
            The 'Content-Type' of the request body.
        :param content_location:
            The 'Content-Location' of the request body.
        """
        params = self._render_parts(
            (("name", self._name), ("filename", self._filename))
        )
        disposition = content_disposition or "form-data"

        self.headers["Content-Disposition"] = disposition + "; " + params
        self.headers["Content-Type"] = content_type
        self.headers["Content-Location"] = content_location

View File

@@ -0,0 +1,2 @@
# Version identifier declared by this module.
__version__ = '3.7'