Added support for multiple tags on a single model

2024-05-18 15:52:50 -06:00
parent 8ddce304b2
commit 02acbb2d70
571 changed files with 76910 additions and 127 deletions
--- a/.flatpak-builder/cache/objects/43/5ace3b37539d48f8811969ca4ed03e04b7f5331cdf7e4da5e9352f924a00c4.file
+++ b/.flatpak-builder/cache/objects/43/5ace3b37539d48f8811969ca4ed03e04b7f5331cdf7e4da5e9352f924a00c4.file
@@ -0,0 +1,122 @@
+Metadata-Version: 2.1
+Name: requests
+Version: 2.31.0
+Summary: Python HTTP for Humans.
+Home-page: https://requests.readthedocs.io
+Author: Kenneth Reitz
+Author-email: me@kennethreitz.org
+License: Apache 2.0
+Project-URL: Documentation, https://requests.readthedocs.io
+Project-URL: Source, https://github.com/psf/requests
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Web Environment
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Programming Language :: Python :: Implementation :: PyPy
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Software Development :: Libraries
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: charset-normalizer (<4,>=2)
+Requires-Dist: idna (<4,>=2.5)
+Requires-Dist: urllib3 (<3,>=1.21.1)
+Requires-Dist: certifi (>=2017.4.17)
+Provides-Extra: security
+Provides-Extra: socks
+Requires-Dist: PySocks (!=1.5.7,>=1.5.6) ; extra == 'socks'
+Provides-Extra: use_chardet_on_py3
+Requires-Dist: chardet (<6,>=3.0.2) ; extra == 'use_chardet_on_py3'
+
+# Requests
+
+**Requests** is a simple, yet elegant, HTTP library.
+
+```python
+>>> import requests
+>>> r = requests.get('https://httpbin.org/basic-auth/user/pass', auth=('user', 'pass'))
+>>> r.status_code
+200
+>>> r.headers['content-type']
+'application/json; charset=utf8'
+>>> r.encoding
+'utf-8'
+>>> r.text
+'{"authenticated": true, ...'
+>>> r.json()
+{'authenticated': True, ...}
+```
+
+Requests allows you to send HTTP/1.1 requests extremely easily. There’s no need to manually add query strings to your URLs, or to form-encode your `PUT` & `POST` data — but nowadays, just use the `json` method!
+
+Requests is one of the most downloaded Python packages today, pulling in around `30M downloads / week`— according to GitHub, Requests is currently [depended upon](https://github.com/psf/requests/network/dependents?package_id=UGFja2FnZS01NzA4OTExNg%3D%3D) by `1,000,000+` repositories. You may certainly put your trust in this code.
+
+[![Downloads](https://pepy.tech/badge/requests/month)](https://pepy.tech/project/requests)
+[![Supported Versions](https://img.shields.io/pypi/pyversions/requests.svg)](https://pypi.org/project/requests)
+[![Contributors](https://img.shields.io/github/contributors/psf/requests.svg)](https://github.com/psf/requests/graphs/contributors)
+
+## Installing Requests and Supported Versions
+
+Requests is available on PyPI:
+
+```console
+$ python -m pip install requests
+```
+
+Requests officially supports Python 3.7+.
+
+## Supported Features & Best–Practices
+
+Requests is ready for the demands of building robust and reliable HTTP–speaking applications, for the needs of today.
+
+- Keep-Alive & Connection Pooling
+- International Domains and URLs
+- Sessions with Cookie Persistence
+- Browser-style TLS/SSL Verification
+- Basic & Digest Authentication
+- Familiar `dict`–like Cookies
+- Automatic Content Decompression and Decoding
+- Multi-part File Uploads
+- SOCKS Proxy Support
+- Connection Timeouts
+- Streaming Downloads
+- Automatic honoring of `.netrc`
+- Chunked HTTP Requests
+
+## API Reference and User Guide available on [Read the Docs](https://requests.readthedocs.io)
+
+[![Read the Docs](https://raw.githubusercontent.com/psf/requests/main/ext/ss.png)](https://requests.readthedocs.io)
+
+## Cloning the repository
+
+When cloning the Requests repository, you may need to add the `-c
+fetch.fsck.badTimezone=ignore` flag to avoid an error about a bad commit (see
+[this issue](https://github.com/psf/requests/issues/2690) for more background):
+
+```shell
+git clone -c fetch.fsck.badTimezone=ignore https://github.com/psf/requests.git
+```
+
+You can also apply this setting to your global Git config:
+
+```shell
+git config --global fetch.fsck.badTimezone ignore
+```
+
+---
+
+[![Kenneth Reitz](https://raw.githubusercontent.com/psf/requests/main/ext/kr.png)](https://kennethreitz.org) [![Python Software Foundation](https://raw.githubusercontent.com/psf/requests/main/ext/psf.png)](https://www.python.org/psf)
+
+
--- a/.flatpak-builder/cache/objects/43/5d429de64946fa8c7bd78126aa0e59e587caae6c5ff4a75b03ca6d16a0571f.file
+++ b/.flatpak-builder/cache/objects/43/5d429de64946fa8c7bd78126aa0e59e587caae6c5ff4a75b03ca6d16a0571f.file
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+import typing
+
+from .url import Url
+
+if typing.TYPE_CHECKING:
+    from ..connection import ProxyConfig
+
+
+def connection_requires_http_tunnel(
+    proxy_url: Url | None = None,
+    proxy_config: ProxyConfig | None = None,
+    destination_scheme: str | None = None,
+) -> bool:
+    """
+    Returns True if the connection requires an HTTP CONNECT through the proxy.
+
+    :param URL proxy_url:
+        URL of the proxy.
+    :param ProxyConfig proxy_config:
+        Proxy configuration from poolmanager.py
+    :param str destination_scheme:
+        The scheme of the destination. (i.e https, http, etc)
+    """
+    # If we're not using a proxy, no way to use a tunnel.
+    if proxy_url is None:
+        return False
+
+    # HTTP destinations never require tunneling, we always forward.
+    if destination_scheme == "http":
+        return False
+
+    # Support for forwarding with HTTPS proxies and HTTPS destinations.
+    if (
+        proxy_url.scheme == "https"
+        and proxy_config
+        and proxy_config.use_forwarding_for_https
+    ):
+        return False
+
+    # Otherwise always use a tunnel.
+    return True
--- a/.flatpak-builder/cache/objects/43/af829f849a8e721df02231d2309271302f1292f20a9e346cf66899db8bb275.file
+++ b/.flatpak-builder/cache/objects/43/af829f849a8e721df02231d2309271302f1292f20a9e346cf66899db8bb275.file
@@ -0,0 +1,395 @@
+import importlib
+from codecs import IncrementalDecoder
+from collections import Counter
+from functools import lru_cache
+from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
+
+from .constant import (
+    FREQUENCIES,
+    KO_NAMES,
+    LANGUAGE_SUPPORTED_COUNT,
+    TOO_SMALL_SEQUENCE,
+    ZH_NAMES,
+)
+from .md import is_suspiciously_successive_range
+from .models import CoherenceMatches
+from .utils import (
+    is_accentuated,
+    is_latin,
+    is_multi_byte_encoding,
+    is_unicode_range_secondary,
+    unicode_range,
+)
+
+
+def encoding_unicode_range(iana_name: str) -> List[str]:
+    """
+    Return associated unicode ranges in a single byte code page.
+    """
+    if is_multi_byte_encoding(iana_name):
+        raise IOError("Function not supported on multi-byte code page")
+
+    decoder = importlib.import_module(
+        "encodings.{}".format(iana_name)
+    ).IncrementalDecoder
+
+    p: IncrementalDecoder = decoder(errors="ignore")
+    seen_ranges: Dict[str, int] = {}
+    character_count: int = 0
+
+    for i in range(0x40, 0xFF):
+        chunk: str = p.decode(bytes([i]))
+
+        if chunk:
+            character_range: Optional[str] = unicode_range(chunk)
+
+            if character_range is None:
+                continue
+
+            if is_unicode_range_secondary(character_range) is False:
+                if character_range not in seen_ranges:
+                    seen_ranges[character_range] = 0
+                seen_ranges[character_range] += 1
+                character_count += 1
+
+    return sorted(
+        [
+            character_range
+            for character_range in seen_ranges
+            if seen_ranges[character_range] / character_count >= 0.15
+        ]
+    )
+
+
+def unicode_range_languages(primary_range: str) -> List[str]:
+    """
+    Return inferred languages used with a unicode range.
+    """
+    languages: List[str] = []
+
+    for language, characters in FREQUENCIES.items():
+        for character in characters:
+            if unicode_range(character) == primary_range:
+                languages.append(language)
+                break
+
+    return languages
+
+
+@lru_cache()
+def encoding_languages(iana_name: str) -> List[str]:
+    """
+    Single-byte encoding language association. Some code page are heavily linked to particular language(s).
+    This function does the correspondence.
+    """
+    unicode_ranges: List[str] = encoding_unicode_range(iana_name)
+    primary_range: Optional[str] = None
+
+    for specified_range in unicode_ranges:
+        if "Latin" not in specified_range:
+            primary_range = specified_range
+            break
+
+    if primary_range is None:
+        return ["Latin Based"]
+
+    return unicode_range_languages(primary_range)
+
+
+@lru_cache()
+def mb_encoding_languages(iana_name: str) -> List[str]:
+    """
+    Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
+    This function does the correspondence.
+    """
+    if (
+        iana_name.startswith("shift_")
+        or iana_name.startswith("iso2022_jp")
+        or iana_name.startswith("euc_j")
+        or iana_name == "cp932"
+    ):
+        return ["Japanese"]
+    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
+        return ["Chinese"]
+    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
+        return ["Korean"]
+
+    return []
+
+
+@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
+def get_target_features(language: str) -> Tuple[bool, bool]:
+    """
+    Determine main aspects from a supported language if it contains accents and if is pure Latin.
+    """
+    target_have_accents: bool = False
+    target_pure_latin: bool = True
+
+    for character in FREQUENCIES[language]:
+        if not target_have_accents and is_accentuated(character):
+            target_have_accents = True
+        if target_pure_latin and is_latin(character) is False:
+            target_pure_latin = False
+
+    return target_have_accents, target_pure_latin
+
+
+def alphabet_languages(
+    characters: List[str], ignore_non_latin: bool = False
+) -> List[str]:
+    """
+    Return associated languages associated to given characters.
+    """
+    languages: List[Tuple[str, float]] = []
+
+    source_have_accents = any(is_accentuated(character) for character in characters)
+
+    for language, language_characters in FREQUENCIES.items():
+        target_have_accents, target_pure_latin = get_target_features(language)
+
+        if ignore_non_latin and target_pure_latin is False:
+            continue
+
+        if target_have_accents is False and source_have_accents:
+            continue
+
+        character_count: int = len(language_characters)
+
+        character_match_count: int = len(
+            [c for c in language_characters if c in characters]
+        )
+
+        ratio: float = character_match_count / character_count
+
+        if ratio >= 0.2:
+            languages.append((language, ratio))
+
+    languages = sorted(languages, key=lambda x: x[1], reverse=True)
+
+    return [compatible_language[0] for compatible_language in languages]
+
+
+def characters_popularity_compare(
+    language: str, ordered_characters: List[str]
+) -> float:
+    """
+    Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
+    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
+    Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
+    """
+    if language not in FREQUENCIES:
+        raise ValueError("{} not available".format(language))
+
+    character_approved_count: int = 0
+    FREQUENCIES_language_set = set(FREQUENCIES[language])
+
+    ordered_characters_count: int = len(ordered_characters)
+    target_language_characters_count: int = len(FREQUENCIES[language])
+
+    large_alphabet: bool = target_language_characters_count > 26
+
+    for character, character_rank in zip(
+        ordered_characters, range(0, ordered_characters_count)
+    ):
+        if character not in FREQUENCIES_language_set:
+            continue
+
+        character_rank_in_language: int = FREQUENCIES[language].index(character)
+        expected_projection_ratio: float = (
+            target_language_characters_count / ordered_characters_count
+        )
+        character_rank_projection: int = int(character_rank * expected_projection_ratio)
+
+        if (
+            large_alphabet is False
+            and abs(character_rank_projection - character_rank_in_language) > 4
+        ):
+            continue
+
+        if (
+            large_alphabet is True
+            and abs(character_rank_projection - character_rank_in_language)
+            < target_language_characters_count / 3
+        ):
+            character_approved_count += 1
+            continue
+
+        characters_before_source: List[str] = FREQUENCIES[language][
+            0:character_rank_in_language
+        ]
+        characters_after_source: List[str] = FREQUENCIES[language][
+            character_rank_in_language:
+        ]
+        characters_before: List[str] = ordered_characters[0:character_rank]
+        characters_after: List[str] = ordered_characters[character_rank:]
+
+        before_match_count: int = len(
+            set(characters_before) & set(characters_before_source)
+        )
+
+        after_match_count: int = len(
+            set(characters_after) & set(characters_after_source)
+        )
+
+        if len(characters_before_source) == 0 and before_match_count <= 4:
+            character_approved_count += 1
+            continue
+
+        if len(characters_after_source) == 0 and after_match_count <= 4:
+            character_approved_count += 1
+            continue
+
+        if (
+            before_match_count / len(characters_before_source) >= 0.4
+            or after_match_count / len(characters_after_source) >= 0.4
+        ):
+            character_approved_count += 1
+            continue
+
+    return character_approved_count / len(ordered_characters)
+
+
+def alpha_unicode_split(decoded_sequence: str) -> List[str]:
+    """
+    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
+    Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
+    One containing the latin letters and the other hebrew.
+    """
+    layers: Dict[str, str] = {}
+
+    for character in decoded_sequence:
+        if character.isalpha() is False:
+            continue
+
+        character_range: Optional[str] = unicode_range(character)
+
+        if character_range is None:
+            continue
+
+        layer_target_range: Optional[str] = None
+
+        for discovered_range in layers:
+            if (
+                is_suspiciously_successive_range(discovered_range, character_range)
+                is False
+            ):
+                layer_target_range = discovered_range
+                break
+
+        if layer_target_range is None:
+            layer_target_range = character_range
+
+        if layer_target_range not in layers:
+            layers[layer_target_range] = character.lower()
+            continue
+
+        layers[layer_target_range] += character.lower()
+
+    return list(layers.values())
+
+
+def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
+    """
+    This function merge results previously given by the function coherence_ratio.
+    The return type is the same as coherence_ratio.
+    """
+    per_language_ratios: Dict[str, List[float]] = {}
+    for result in results:
+        for sub_result in result:
+            language, ratio = sub_result
+            if language not in per_language_ratios:
+                per_language_ratios[language] = [ratio]
+                continue
+            per_language_ratios[language].append(ratio)
+
+    merge = [
+        (
+            language,
+            round(
+                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
+                4,
+            ),
+        )
+        for language in per_language_ratios
+    ]
+
+    return sorted(merge, key=lambda x: x[1], reverse=True)
+
+
+def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
+    """
+    We shall NOT return "English—" in CoherenceMatches because it is an alternative
+    of "English". This function only keeps the best match and remove the em-dash in it.
+    """
+    index_results: Dict[str, List[float]] = dict()
+
+    for result in results:
+        language, ratio = result
+        no_em_name: str = language.replace("—", "")
+
+        if no_em_name not in index_results:
+            index_results[no_em_name] = []
+
+        index_results[no_em_name].append(ratio)
+
+    if any(len(index_results[e]) > 1 for e in index_results):
+        filtered_results: CoherenceMatches = []
+
+        for language in index_results:
+            filtered_results.append((language, max(index_results[language])))
+
+        return filtered_results
+
+    return results
+
+
+@lru_cache(maxsize=2048)
+def coherence_ratio(
+    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
+) -> CoherenceMatches:
+    """
+    Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
+    A layer = Character extraction by alphabets/ranges.
+    """
+
+    results: List[Tuple[str, float]] = []
+    ignore_non_latin: bool = False
+
+    sufficient_match_count: int = 0
+
+    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
+    if "Latin Based" in lg_inclusion_list:
+        ignore_non_latin = True
+        lg_inclusion_list.remove("Latin Based")
+
+    for layer in alpha_unicode_split(decoded_sequence):
+        sequence_frequencies: TypeCounter[str] = Counter(layer)
+        most_common = sequence_frequencies.most_common()
+
+        character_count: int = sum(o for c, o in most_common)
+
+        if character_count <= TOO_SMALL_SEQUENCE:
+            continue
+
+        popular_character_ordered: List[str] = [c for c, o in most_common]
+
+        for language in lg_inclusion_list or alphabet_languages(
+            popular_character_ordered, ignore_non_latin
+        ):
+            ratio: float = characters_popularity_compare(
+                language, popular_character_ordered
+            )
+
+            if ratio < threshold:
+                continue
+            elif ratio >= 0.8:
+                sufficient_match_count += 1
+
+            results.append((language, round(ratio, 4)))
+
+            if sufficient_match_count >= 3:
+                break
+
+    return sorted(
+        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
+    )