Added support for multiple tags on a single model

This commit is contained in:
jeffser
2024-05-18 15:52:50 -06:00
parent 8ddce304b2
commit 02acbb2d70
571 changed files with 76910 additions and 127 deletions

View File

@@ -0,0 +1,205 @@
#
# The Python Imaging Library.
# $Id$
#
# a simple Qt image interface.
#
# history:
# 2006-06-03 fl: created
# 2006-06-04 fl: inherit from QImage instead of wrapping it
# 2006-06-05 fl: removed toimage helper; move string support to ImageQt
# 2013-11-13 fl: add support for Qt5 (aurelien.ballier@cyclonit.com)
#
# Copyright (c) 2006 by Secret Labs AB
# Copyright (c) 2006 by Fredrik Lundh
#
# See the README file for information on usage and redistribution.
#
from __future__ import annotations
import sys
from io import BytesIO
from typing import Callable
from . import Image
from ._util import is_path
qt_version: str | None
qt_versions = [
["6", "PyQt6"],
["side6", "PySide6"],
]
# If a version has already been imported, attempt it first
qt_versions.sort(key=lambda version: version[1] in sys.modules, reverse=True)
for version, qt_module in qt_versions:
try:
QBuffer: type
QIODevice: type
QImage: type
QPixmap: type
qRgba: Callable[[int, int, int, int], int]
if qt_module == "PyQt6":
from PyQt6.QtCore import QBuffer, QIODevice
from PyQt6.QtGui import QImage, QPixmap, qRgba
elif qt_module == "PySide6":
from PySide6.QtCore import QBuffer, QIODevice
from PySide6.QtGui import QImage, QPixmap, qRgba
except (ImportError, RuntimeError):
continue
qt_is_installed = True
qt_version = version
break
else:
qt_is_installed = False
qt_version = None
def rgb(r, g, b, a=255):
"""(Internal) Turns an RGB color into a Qt compatible color integer."""
# use qRgb to pack the colors, and then turn the resulting long
# into a negative integer with the same bitpattern.
return qRgba(r, g, b, a) & 0xFFFFFFFF
def fromqimage(im):
"""
:param im: QImage or PIL ImageQt object
"""
buffer = QBuffer()
if qt_version == "6":
try:
qt_openmode = QIODevice.OpenModeFlag
except AttributeError:
qt_openmode = QIODevice.OpenMode
else:
qt_openmode = QIODevice
buffer.open(qt_openmode.ReadWrite)
# preserve alpha channel with png
# otherwise ppm is more friendly with Image.open
if im.hasAlphaChannel():
im.save(buffer, "png")
else:
im.save(buffer, "ppm")
b = BytesIO()
b.write(buffer.data())
buffer.close()
b.seek(0)
return Image.open(b)
def fromqpixmap(im):
return fromqimage(im)
def align8to32(bytes, width, mode):
"""
converts each scanline of data from 8 bit to 32 bit aligned
"""
bits_per_pixel = {"1": 1, "L": 8, "P": 8, "I;16": 16}[mode]
# calculate bytes per line and the extra padding if needed
bits_per_line = bits_per_pixel * width
full_bytes_per_line, remaining_bits_per_line = divmod(bits_per_line, 8)
bytes_per_line = full_bytes_per_line + (1 if remaining_bits_per_line else 0)
extra_padding = -bytes_per_line % 4
# already 32 bit aligned by luck
if not extra_padding:
return bytes
new_data = [
bytes[i * bytes_per_line : (i + 1) * bytes_per_line] + b"\x00" * extra_padding
for i in range(len(bytes) // bytes_per_line)
]
return b"".join(new_data)
def _toqclass_helper(im):
data = None
colortable = None
exclusive_fp = False
# handle filename, if given instead of image name
if hasattr(im, "toUtf8"):
# FIXME - is this really the best way to do this?
im = str(im.toUtf8(), "utf-8")
if is_path(im):
im = Image.open(im)
exclusive_fp = True
qt_format = QImage.Format if qt_version == "6" else QImage
if im.mode == "1":
format = qt_format.Format_Mono
elif im.mode == "L":
format = qt_format.Format_Indexed8
colortable = [rgb(i, i, i) for i in range(256)]
elif im.mode == "P":
format = qt_format.Format_Indexed8
palette = im.getpalette()
colortable = [rgb(*palette[i : i + 3]) for i in range(0, len(palette), 3)]
elif im.mode == "RGB":
# Populate the 4th channel with 255
im = im.convert("RGBA")
data = im.tobytes("raw", "BGRA")
format = qt_format.Format_RGB32
elif im.mode == "RGBA":
data = im.tobytes("raw", "BGRA")
format = qt_format.Format_ARGB32
elif im.mode == "I;16" and hasattr(qt_format, "Format_Grayscale16"): # Qt 5.13+
im = im.point(lambda i: i * 256)
format = qt_format.Format_Grayscale16
else:
if exclusive_fp:
im.close()
msg = f"unsupported image mode {repr(im.mode)}"
raise ValueError(msg)
size = im.size
__data = data or align8to32(im.tobytes(), size[0], im.mode)
if exclusive_fp:
im.close()
return {"data": __data, "size": size, "format": format, "colortable": colortable}
if qt_is_installed:
class ImageQt(QImage):
def __init__(self, im):
"""
An PIL image wrapper for Qt. This is a subclass of PyQt's QImage
class.
:param im: A PIL Image object, or a file name (given either as
Python string or a PyQt string object).
"""
im_data = _toqclass_helper(im)
# must keep a reference, or Qt will crash!
# All QImage constructors that take data operate on an existing
# buffer, so this buffer has to hang on for the life of the image.
# Fixes https://github.com/python-pillow/Pillow/issues/1370
self.__data = im_data["data"]
super().__init__(
self.__data,
im_data["size"][0],
im_data["size"][1],
im_data["format"],
)
if im_data["colortable"]:
self.setColorTable(im_data["colortable"])
def toqimage(im):
return ImageQt(im)
def toqpixmap(im):
qimage = toqimage(im)
return QPixmap.fromImage(qimage)

View File

@@ -0,0 +1,296 @@
import argparse
import sys
from json import dumps
from os.path import abspath, basename, dirname, join, realpath
from platform import python_version
from typing import List, Optional
from unicodedata import unidata_version
import charset_normalizer.md as md_module
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__
def query_yes_no(question: str, default: str = "yes") -> bool:
"""Ask a yes/no question via input() and return their answer.
"question" is a string that is presented to the user.
"default" is the presumed answer if the user just hits <Enter>.
It must be "yes" (the default), "no" or None (meaning
an answer is required of the user).
The "answer" return value is True for "yes" or False for "no".
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
"""
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
if default is None:
prompt = " [y/n] "
elif default == "yes":
prompt = " [Y/n] "
elif default == "no":
prompt = " [y/N] "
else:
raise ValueError("invalid default answer: '%s'" % default)
while True:
sys.stdout.write(question + prompt)
choice = input().lower()
if default is not None and choice == "":
return valid[default]
elif choice in valid:
return valid[choice]
else:
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
def cli_detect(argv: Optional[List[str]] = None) -> int:
"""
CLI assistant using ARGV and ArgumentParser
:param argv:
:return: 0 if everything is fine, anything else equal trouble
"""
parser = argparse.ArgumentParser(
description="The Real First Universal Charset Detector. "
"Discover originating encoding used on text file. "
"Normalize text to unicode."
)
parser.add_argument(
"files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
default=False,
dest="verbose",
help="Display complementary information about file if any. "
"Stdout will contain logs about the detection process.",
)
parser.add_argument(
"-a",
"--with-alternative",
action="store_true",
default=False,
dest="alternatives",
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
)
parser.add_argument(
"-n",
"--normalize",
action="store_true",
default=False,
dest="normalize",
help="Permit to normalize input file. If not set, program does not write anything.",
)
parser.add_argument(
"-m",
"--minimal",
action="store_true",
default=False,
dest="minimal",
help="Only output the charset detected to STDOUT. Disabling JSON output.",
)
parser.add_argument(
"-r",
"--replace",
action="store_true",
default=False,
dest="replace",
help="Replace file when trying to normalize it instead of creating a new one.",
)
parser.add_argument(
"-f",
"--force",
action="store_true",
default=False,
dest="force",
help="Replace file without asking if you are sure, use this flag with caution.",
)
parser.add_argument(
"-t",
"--threshold",
action="store",
default=0.2,
type=float,
dest="threshold",
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
)
parser.add_argument(
"--version",
action="version",
version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
__version__,
python_version(),
unidata_version,
"OFF" if md_module.__file__.lower().endswith(".py") else "ON",
),
help="Show version information and exit.",
)
args = parser.parse_args(argv)
if args.replace is True and args.normalize is False:
print("Use --replace in addition of --normalize only.", file=sys.stderr)
return 1
if args.force is True and args.replace is False:
print("Use --force in addition of --replace only.", file=sys.stderr)
return 1
if args.threshold < 0.0 or args.threshold > 1.0:
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
return 1
x_ = []
for my_file in args.files:
matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
best_guess = matches.best()
if best_guess is None:
print(
'Unable to identify originating encoding for "{}". {}'.format(
my_file.name,
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else "",
),
file=sys.stderr,
)
x_.append(
CliDetectionResult(
abspath(my_file.name),
None,
[],
[],
"Unknown",
[],
False,
1.0,
0.0,
None,
True,
)
)
else:
x_.append(
CliDetectionResult(
abspath(my_file.name),
best_guess.encoding,
best_guess.encoding_aliases,
[
cp
for cp in best_guess.could_be_from_charset
if cp != best_guess.encoding
],
best_guess.language,
best_guess.alphabets,
best_guess.bom,
best_guess.percent_chaos,
best_guess.percent_coherence,
None,
True,
)
)
if len(matches) > 1 and args.alternatives:
for el in matches:
if el != best_guess:
x_.append(
CliDetectionResult(
abspath(my_file.name),
el.encoding,
el.encoding_aliases,
[
cp
for cp in el.could_be_from_charset
if cp != el.encoding
],
el.language,
el.alphabets,
el.bom,
el.percent_chaos,
el.percent_coherence,
None,
False,
)
)
if args.normalize is True:
if best_guess.encoding.startswith("utf") is True:
print(
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
my_file.name
),
file=sys.stderr,
)
if my_file.closed is False:
my_file.close()
continue
dir_path = dirname(realpath(my_file.name))
file_name = basename(realpath(my_file.name))
o_: List[str] = file_name.split(".")
if args.replace is False:
o_.insert(-1, best_guess.encoding)
if my_file.closed is False:
my_file.close()
elif (
args.force is False
and query_yes_no(
'Are you sure to normalize "{}" by replacing it ?'.format(
my_file.name
),
"no",
)
is False
):
if my_file.closed is False:
my_file.close()
continue
try:
x_[0].unicode_path = join(dir_path, ".".join(o_))
with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
fp.write(str(best_guess))
except IOError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
return 2
if my_file.closed is False:
my_file.close()
if args.minimal is False:
print(
dumps(
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
ensure_ascii=True,
indent=4,
)
)
else:
for my_file in args.files:
print(
", ".join(
[
el.encoding or "undefined"
for el in x_
if el.path == abspath(my_file.name)
]
)
)
return 0
if __name__ == "__main__":
cli_detect()