Added support for multiple tags on a single model
This commit is contained in:
@@ -0,0 +1,205 @@
|
||||
#
|
||||
# The Python Imaging Library.
|
||||
# $Id$
|
||||
#
|
||||
# a simple Qt image interface.
|
||||
#
|
||||
# history:
|
||||
# 2006-06-03 fl: created
|
||||
# 2006-06-04 fl: inherit from QImage instead of wrapping it
|
||||
# 2006-06-05 fl: removed toimage helper; move string support to ImageQt
|
||||
# 2013-11-13 fl: add support for Qt5 (aurelien.ballier@cyclonit.com)
|
||||
#
|
||||
# Copyright (c) 2006 by Secret Labs AB
|
||||
# Copyright (c) 2006 by Fredrik Lundh
|
||||
#
|
||||
# See the README file for information on usage and redistribution.
|
||||
#
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from io import BytesIO
|
||||
from typing import Callable
|
||||
|
||||
from . import Image
|
||||
from ._util import is_path
|
||||
|
||||
qt_version: str | None
|
||||
qt_versions = [
|
||||
["6", "PyQt6"],
|
||||
["side6", "PySide6"],
|
||||
]
|
||||
|
||||
# If a version has already been imported, attempt it first
|
||||
qt_versions.sort(key=lambda version: version[1] in sys.modules, reverse=True)
|
||||
for version, qt_module in qt_versions:
|
||||
try:
|
||||
QBuffer: type
|
||||
QIODevice: type
|
||||
QImage: type
|
||||
QPixmap: type
|
||||
qRgba: Callable[[int, int, int, int], int]
|
||||
if qt_module == "PyQt6":
|
||||
from PyQt6.QtCore import QBuffer, QIODevice
|
||||
from PyQt6.QtGui import QImage, QPixmap, qRgba
|
||||
elif qt_module == "PySide6":
|
||||
from PySide6.QtCore import QBuffer, QIODevice
|
||||
from PySide6.QtGui import QImage, QPixmap, qRgba
|
||||
except (ImportError, RuntimeError):
|
||||
continue
|
||||
qt_is_installed = True
|
||||
qt_version = version
|
||||
break
|
||||
else:
|
||||
qt_is_installed = False
|
||||
qt_version = None
|
||||
|
||||
|
||||
def rgb(r, g, b, a=255):
|
||||
"""(Internal) Turns an RGB color into a Qt compatible color integer."""
|
||||
# use qRgb to pack the colors, and then turn the resulting long
|
||||
# into a negative integer with the same bitpattern.
|
||||
return qRgba(r, g, b, a) & 0xFFFFFFFF
|
||||
|
||||
|
||||
def fromqimage(im):
|
||||
"""
|
||||
:param im: QImage or PIL ImageQt object
|
||||
"""
|
||||
buffer = QBuffer()
|
||||
if qt_version == "6":
|
||||
try:
|
||||
qt_openmode = QIODevice.OpenModeFlag
|
||||
except AttributeError:
|
||||
qt_openmode = QIODevice.OpenMode
|
||||
else:
|
||||
qt_openmode = QIODevice
|
||||
buffer.open(qt_openmode.ReadWrite)
|
||||
# preserve alpha channel with png
|
||||
# otherwise ppm is more friendly with Image.open
|
||||
if im.hasAlphaChannel():
|
||||
im.save(buffer, "png")
|
||||
else:
|
||||
im.save(buffer, "ppm")
|
||||
|
||||
b = BytesIO()
|
||||
b.write(buffer.data())
|
||||
buffer.close()
|
||||
b.seek(0)
|
||||
|
||||
return Image.open(b)
|
||||
|
||||
|
||||
def fromqpixmap(im):
|
||||
return fromqimage(im)
|
||||
|
||||
|
||||
def align8to32(bytes, width, mode):
|
||||
"""
|
||||
converts each scanline of data from 8 bit to 32 bit aligned
|
||||
"""
|
||||
|
||||
bits_per_pixel = {"1": 1, "L": 8, "P": 8, "I;16": 16}[mode]
|
||||
|
||||
# calculate bytes per line and the extra padding if needed
|
||||
bits_per_line = bits_per_pixel * width
|
||||
full_bytes_per_line, remaining_bits_per_line = divmod(bits_per_line, 8)
|
||||
bytes_per_line = full_bytes_per_line + (1 if remaining_bits_per_line else 0)
|
||||
|
||||
extra_padding = -bytes_per_line % 4
|
||||
|
||||
# already 32 bit aligned by luck
|
||||
if not extra_padding:
|
||||
return bytes
|
||||
|
||||
new_data = [
|
||||
bytes[i * bytes_per_line : (i + 1) * bytes_per_line] + b"\x00" * extra_padding
|
||||
for i in range(len(bytes) // bytes_per_line)
|
||||
]
|
||||
|
||||
return b"".join(new_data)
|
||||
|
||||
|
||||
def _toqclass_helper(im):
|
||||
data = None
|
||||
colortable = None
|
||||
exclusive_fp = False
|
||||
|
||||
# handle filename, if given instead of image name
|
||||
if hasattr(im, "toUtf8"):
|
||||
# FIXME - is this really the best way to do this?
|
||||
im = str(im.toUtf8(), "utf-8")
|
||||
if is_path(im):
|
||||
im = Image.open(im)
|
||||
exclusive_fp = True
|
||||
|
||||
qt_format = QImage.Format if qt_version == "6" else QImage
|
||||
if im.mode == "1":
|
||||
format = qt_format.Format_Mono
|
||||
elif im.mode == "L":
|
||||
format = qt_format.Format_Indexed8
|
||||
colortable = [rgb(i, i, i) for i in range(256)]
|
||||
elif im.mode == "P":
|
||||
format = qt_format.Format_Indexed8
|
||||
palette = im.getpalette()
|
||||
colortable = [rgb(*palette[i : i + 3]) for i in range(0, len(palette), 3)]
|
||||
elif im.mode == "RGB":
|
||||
# Populate the 4th channel with 255
|
||||
im = im.convert("RGBA")
|
||||
|
||||
data = im.tobytes("raw", "BGRA")
|
||||
format = qt_format.Format_RGB32
|
||||
elif im.mode == "RGBA":
|
||||
data = im.tobytes("raw", "BGRA")
|
||||
format = qt_format.Format_ARGB32
|
||||
elif im.mode == "I;16" and hasattr(qt_format, "Format_Grayscale16"): # Qt 5.13+
|
||||
im = im.point(lambda i: i * 256)
|
||||
|
||||
format = qt_format.Format_Grayscale16
|
||||
else:
|
||||
if exclusive_fp:
|
||||
im.close()
|
||||
msg = f"unsupported image mode {repr(im.mode)}"
|
||||
raise ValueError(msg)
|
||||
|
||||
size = im.size
|
||||
__data = data or align8to32(im.tobytes(), size[0], im.mode)
|
||||
if exclusive_fp:
|
||||
im.close()
|
||||
return {"data": __data, "size": size, "format": format, "colortable": colortable}
|
||||
|
||||
|
||||
if qt_is_installed:
|
||||
|
||||
class ImageQt(QImage):
|
||||
def __init__(self, im):
|
||||
"""
|
||||
An PIL image wrapper for Qt. This is a subclass of PyQt's QImage
|
||||
class.
|
||||
|
||||
:param im: A PIL Image object, or a file name (given either as
|
||||
Python string or a PyQt string object).
|
||||
"""
|
||||
im_data = _toqclass_helper(im)
|
||||
# must keep a reference, or Qt will crash!
|
||||
# All QImage constructors that take data operate on an existing
|
||||
# buffer, so this buffer has to hang on for the life of the image.
|
||||
# Fixes https://github.com/python-pillow/Pillow/issues/1370
|
||||
self.__data = im_data["data"]
|
||||
super().__init__(
|
||||
self.__data,
|
||||
im_data["size"][0],
|
||||
im_data["size"][1],
|
||||
im_data["format"],
|
||||
)
|
||||
if im_data["colortable"]:
|
||||
self.setColorTable(im_data["colortable"])
|
||||
|
||||
|
||||
def toqimage(im):
|
||||
return ImageQt(im)
|
||||
|
||||
|
||||
def toqpixmap(im):
|
||||
qimage = toqimage(im)
|
||||
return QPixmap.fromImage(qimage)
|
||||
@@ -0,0 +1,296 @@
|
||||
import argparse
|
||||
import sys
|
||||
from json import dumps
|
||||
from os.path import abspath, basename, dirname, join, realpath
|
||||
from platform import python_version
|
||||
from typing import List, Optional
|
||||
from unicodedata import unidata_version
|
||||
|
||||
import charset_normalizer.md as md_module
|
||||
from charset_normalizer import from_fp
|
||||
from charset_normalizer.models import CliDetectionResult
|
||||
from charset_normalizer.version import __version__
|
||||
|
||||
|
||||
def query_yes_no(question: str, default: str = "yes") -> bool:
|
||||
"""Ask a yes/no question via input() and return their answer.
|
||||
|
||||
"question" is a string that is presented to the user.
|
||||
"default" is the presumed answer if the user just hits <Enter>.
|
||||
It must be "yes" (the default), "no" or None (meaning
|
||||
an answer is required of the user).
|
||||
|
||||
The "answer" return value is True for "yes" or False for "no".
|
||||
|
||||
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
|
||||
"""
|
||||
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
|
||||
if default is None:
|
||||
prompt = " [y/n] "
|
||||
elif default == "yes":
|
||||
prompt = " [Y/n] "
|
||||
elif default == "no":
|
||||
prompt = " [y/N] "
|
||||
else:
|
||||
raise ValueError("invalid default answer: '%s'" % default)
|
||||
|
||||
while True:
|
||||
sys.stdout.write(question + prompt)
|
||||
choice = input().lower()
|
||||
if default is not None and choice == "":
|
||||
return valid[default]
|
||||
elif choice in valid:
|
||||
return valid[choice]
|
||||
else:
|
||||
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
|
||||
|
||||
|
||||
def cli_detect(argv: Optional[List[str]] = None) -> int:
|
||||
"""
|
||||
CLI assistant using ARGV and ArgumentParser
|
||||
:param argv:
|
||||
:return: 0 if everything is fine, anything else equal trouble
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="The Real First Universal Charset Detector. "
|
||||
"Discover originating encoding used on text file. "
|
||||
"Normalize text to unicode."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="verbose",
|
||||
help="Display complementary information about file if any. "
|
||||
"Stdout will contain logs about the detection process.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-a",
|
||||
"--with-alternative",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="alternatives",
|
||||
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n",
|
||||
"--normalize",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="normalize",
|
||||
help="Permit to normalize input file. If not set, program does not write anything.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--minimal",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="minimal",
|
||||
help="Only output the charset detected to STDOUT. Disabling JSON output.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--replace",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="replace",
|
||||
help="Replace file when trying to normalize it instead of creating a new one.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-f",
|
||||
"--force",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="force",
|
||||
help="Replace file without asking if you are sure, use this flag with caution.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--threshold",
|
||||
action="store",
|
||||
default=0.2,
|
||||
type=float,
|
||||
dest="threshold",
|
||||
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--version",
|
||||
action="version",
|
||||
version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
|
||||
__version__,
|
||||
python_version(),
|
||||
unidata_version,
|
||||
"OFF" if md_module.__file__.lower().endswith(".py") else "ON",
|
||||
),
|
||||
help="Show version information and exit.",
|
||||
)
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
if args.replace is True and args.normalize is False:
|
||||
print("Use --replace in addition of --normalize only.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if args.force is True and args.replace is False:
|
||||
print("Use --force in addition of --replace only.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if args.threshold < 0.0 or args.threshold > 1.0:
|
||||
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
x_ = []
|
||||
|
||||
for my_file in args.files:
|
||||
matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
|
||||
|
||||
best_guess = matches.best()
|
||||
|
||||
if best_guess is None:
|
||||
print(
|
||||
'Unable to identify originating encoding for "{}". {}'.format(
|
||||
my_file.name,
|
||||
"Maybe try increasing maximum amount of chaos."
|
||||
if args.threshold < 1.0
|
||||
else "",
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
None,
|
||||
[],
|
||||
[],
|
||||
"Unknown",
|
||||
[],
|
||||
False,
|
||||
1.0,
|
||||
0.0,
|
||||
None,
|
||||
True,
|
||||
)
|
||||
)
|
||||
else:
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
best_guess.encoding,
|
||||
best_guess.encoding_aliases,
|
||||
[
|
||||
cp
|
||||
for cp in best_guess.could_be_from_charset
|
||||
if cp != best_guess.encoding
|
||||
],
|
||||
best_guess.language,
|
||||
best_guess.alphabets,
|
||||
best_guess.bom,
|
||||
best_guess.percent_chaos,
|
||||
best_guess.percent_coherence,
|
||||
None,
|
||||
True,
|
||||
)
|
||||
)
|
||||
|
||||
if len(matches) > 1 and args.alternatives:
|
||||
for el in matches:
|
||||
if el != best_guess:
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
el.encoding,
|
||||
el.encoding_aliases,
|
||||
[
|
||||
cp
|
||||
for cp in el.could_be_from_charset
|
||||
if cp != el.encoding
|
||||
],
|
||||
el.language,
|
||||
el.alphabets,
|
||||
el.bom,
|
||||
el.percent_chaos,
|
||||
el.percent_coherence,
|
||||
None,
|
||||
False,
|
||||
)
|
||||
)
|
||||
|
||||
if args.normalize is True:
|
||||
if best_guess.encoding.startswith("utf") is True:
|
||||
print(
|
||||
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
|
||||
my_file.name
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
continue
|
||||
|
||||
dir_path = dirname(realpath(my_file.name))
|
||||
file_name = basename(realpath(my_file.name))
|
||||
|
||||
o_: List[str] = file_name.split(".")
|
||||
|
||||
if args.replace is False:
|
||||
o_.insert(-1, best_guess.encoding)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
elif (
|
||||
args.force is False
|
||||
and query_yes_no(
|
||||
'Are you sure to normalize "{}" by replacing it ?'.format(
|
||||
my_file.name
|
||||
),
|
||||
"no",
|
||||
)
|
||||
is False
|
||||
):
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
continue
|
||||
|
||||
try:
|
||||
x_[0].unicode_path = join(dir_path, ".".join(o_))
|
||||
|
||||
with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
|
||||
fp.write(str(best_guess))
|
||||
except IOError as e:
|
||||
print(str(e), file=sys.stderr)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
return 2
|
||||
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
|
||||
if args.minimal is False:
|
||||
print(
|
||||
dumps(
|
||||
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
|
||||
ensure_ascii=True,
|
||||
indent=4,
|
||||
)
|
||||
)
|
||||
else:
|
||||
for my_file in args.files:
|
||||
print(
|
||||
", ".join(
|
||||
[
|
||||
el.encoding or "undefined"
|
||||
for el in x_
|
||||
if el.path == abspath(my_file.name)
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli_detect()
|
||||
Reference in New Issue
Block a user