297 lines
9.5 KiB
Python
297 lines
9.5 KiB
Python
import argparse
|
|
import sys
|
|
from json import dumps
|
|
from os.path import abspath, basename, dirname, join, realpath
|
|
from platform import python_version
|
|
from typing import List, Optional
|
|
from unicodedata import unidata_version
|
|
|
|
import charset_normalizer.md as md_module
|
|
from charset_normalizer import from_fp
|
|
from charset_normalizer.models import CliDetectionResult
|
|
from charset_normalizer.version import __version__
|
|
|
|
|
|
def query_yes_no(question: str, default: Optional[str] = "yes") -> bool:
    """Ask a yes/no question on stdout and return the answer read via input().

    The question is re-asked until an acceptable answer is given.

    :param question: Text presented to the user (a choice hint is appended).
    :param default: Presumed answer when the user just hits <Enter>.
        It must be "yes" (the default), "no" or None (meaning an answer
        is required of the user).
    :return: True for "yes" (also "y" / "ye"), False for "no" (also "n").
    :raises ValueError: If ``default`` is not "yes", "no" or None.

    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}

    # The capital letter in the hint shows which answer <Enter> selects.
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(question + prompt)
        # Strip surrounding whitespace so answers such as "y " or "yes\r"
        # are accepted instead of triggering another prompt.
        choice = input().strip().lower()

        if default is not None and choice == "":
            return valid[default]
        if choice in valid:
            return valid[choice]

        sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
|
|
|
|
|
|
def _build_cli_parser() -> argparse.ArgumentParser:
    """Create the argument parser describing every option of the CLI."""
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            # A plain ".py" file for the md module is reported as "OFF".
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )

    return parser


def _close_if_open(handle) -> None:
    """Close a file object unless it has already been closed."""
    if not handle.closed:
        handle.close()


def _build_detection_result(path: str, match, is_preferred: bool) -> "CliDetectionResult":
    """Turn one detection match into a CliDetectionResult for ``path``."""
    return CliDetectionResult(
        path,
        match.encoding,
        match.encoding_aliases,
        [cp for cp in match.could_be_from_charset if cp != match.encoding],
        match.language,
        match.alphabets,
        match.bom,
        match.percent_chaos,
        match.percent_coherence,
        None,  # filled in later through .unicode_path when a normalized file is written
        is_preferred,
    )


def _detect_single_file(my_file, args, results) -> Optional[int]:
    """Analyse one opened file, append its result(s) and normalize it on request.

    :param my_file: Binary file object produced by the argument parser.
    :param args: Parsed command line namespace.
    :param results: List receiving the CliDetectionResult entries (mutated).
    :return: An exit code when processing must stop, None otherwise.
    """
    matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
    best_guess = matches.best()
    source_path = abspath(my_file.name)

    if best_guess is None:
        print(
            'Unable to identify originating encoding for "{}". {}'.format(
                my_file.name,
                "Maybe try increasing maximum amount of chaos."
                if args.threshold < 1.0
                else "",
            ),
            file=sys.stderr,
        )
        # Placeholder entry so the file still shows up in the output.
        results.append(
            CliDetectionResult(
                source_path,
                None,
                [],
                [],
                "Unknown",
                [],
                False,
                1.0,
                0.0,
                None,
                True,
            )
        )
        return None

    best_result = _build_detection_result(source_path, best_guess, True)
    results.append(best_result)

    if len(matches) > 1 and args.alternatives:
        results.extend(
            _build_detection_result(source_path, el, False)
            for el in matches
            if el != best_guess
        )

    if not args.normalize:
        return None

    if best_guess.encoding.startswith("utf"):
        print(
            '"{}" file does not need to be normalized, as it already came from unicode.'.format(
                my_file.name
            ),
            file=sys.stderr,
        )
        return None

    resolved_path = realpath(my_file.name)
    dir_path = dirname(resolved_path)
    name_parts: List[str] = basename(resolved_path).split(".")

    if not args.replace:
        # New file next to the original: the encoding name is inserted
        # before the last dot-separated part of the file name.
        name_parts.insert(-1, best_guess.encoding)
    elif not args.force and not query_yes_no(
        'Are you sure to normalize "{}" by replacing it ?'.format(my_file.name),
        "no",
    ):
        return None

    # The input has been fully consumed by from_fp; release it before writing
    # (the target may be the very same path when --replace is used).
    _close_if_open(my_file)

    try:
        # Record the output path on THIS file's result, not on the first
        # result of the whole run.
        best_result.unicode_path = join(dir_path, ".".join(name_parts))

        with open(best_result.unicode_path, "w", encoding="utf-8") as fp:
            fp.write(str(best_guess))
    except IOError as e:
        print(str(e), file=sys.stderr)
        return 2

    return None


def cli_detect(argv: Optional[List[str]] = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser

    Detect the encoding of every given file, optionally write a UTF-8
    normalized copy, then print either a JSON report or (with --minimal)
    only the detected encoding names.

    :param argv: Arguments to parse; None lets argparse read sys.argv.
    :return: 0 if everything is fine, anything else equal trouble
        (1: invalid option combination or threshold,
        2: the normalized file could not be written)
    """
    args = _build_cli_parser().parse_args(argv)

    try:
        if args.replace and not args.normalize:
            print("Use --replace in addition of --normalize only.", file=sys.stderr)
            return 1

        if args.force and not args.replace:
            print("Use --force in addition of --replace only.", file=sys.stderr)
            return 1

        # Written as a positive range test so that NaN is rejected as well.
        if not (0.0 <= args.threshold <= 1.0):
            print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
            return 1

        results: List[CliDetectionResult] = []

        for my_file in args.files:
            try:
                exit_code = _detect_single_file(my_file, args, results)
            finally:
                _close_if_open(my_file)

            if exit_code is not None:
                return exit_code

        if not args.minimal:
            # A single result is emitted as an object, several as a list.
            print(
                dumps(
                    [el.__dict__ for el in results]
                    if len(results) > 1
                    else results[0].__dict__,
                    ensure_ascii=True,
                    indent=4,
                )
            )
        else:
            for my_file in args.files:
                file_path = abspath(my_file.name)
                print(
                    ", ".join(
                        [
                            el.encoding or "undefined"
                            for el in results
                            if el.path == file_path
                        ]
                    )
                )

        return 0
    finally:
        # argparse opened every file up front; make sure none is left open
        # whatever the exit path (validation error, write failure, exception).
        for handle in args.files:
            _close_if_open(handle)
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate the status returned by cli_detect() as the process exit code,
    # so failures are visible to the calling shell.
    sys.exit(cli_detect())
|