Change venv
This commit is contained in:
@@ -58,13 +58,11 @@ if DEBUNDLED:
|
||||
sys.path[:] = glob.glob(os.path.join(WHEEL_DIR, "*.whl")) + sys.path
|
||||
|
||||
# Actually alias all of our vendored dependencies.
|
||||
vendored("appdirs")
|
||||
vendored("cachecontrol")
|
||||
vendored("certifi")
|
||||
vendored("colorama")
|
||||
vendored("distlib")
|
||||
vendored("distro")
|
||||
vendored("html5lib")
|
||||
vendored("six")
|
||||
vendored("six.moves")
|
||||
vendored("six.moves.urllib")
|
||||
@@ -74,6 +72,7 @@ if DEBUNDLED:
|
||||
vendored("packaging.specifiers")
|
||||
vendored("pep517")
|
||||
vendored("pkg_resources")
|
||||
vendored("platformdirs")
|
||||
vendored("progress")
|
||||
vendored("requests")
|
||||
vendored("requests.exceptions")
|
||||
@@ -106,6 +105,16 @@ if DEBUNDLED:
|
||||
vendored("requests.packages.urllib3.util.timeout")
|
||||
vendored("requests.packages.urllib3.util.url")
|
||||
vendored("resolvelib")
|
||||
vendored("rich")
|
||||
vendored("rich.console")
|
||||
vendored("rich.highlighter")
|
||||
vendored("rich.logging")
|
||||
vendored("rich.markup")
|
||||
vendored("rich.progress")
|
||||
vendored("rich.segment")
|
||||
vendored("rich.style")
|
||||
vendored("rich.text")
|
||||
vendored("rich.traceback")
|
||||
vendored("tenacity")
|
||||
vendored("tomli")
|
||||
vendored("urllib3")
|
||||
|
||||
@@ -1,633 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (c) 2005-2010 ActiveState Software Inc.
|
||||
# Copyright (c) 2013 Eddy Petrișor
|
||||
|
||||
"""Utilities for determining application-specific dirs.
|
||||
|
||||
See <http://github.com/ActiveState/appdirs> for details and usage.
|
||||
"""
|
||||
# Dev Notes:
|
||||
# - MSDN on where to store app data files:
|
||||
# http://support.microsoft.com/default.aspx?scid=kb;en-us;310294#XSLTH3194121123120121120120
|
||||
# - Mac OS X: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/index.html
|
||||
# - XDG spec for Un*x: http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html
|
||||
|
||||
__version__ = "1.4.4"
|
||||
__version_info__ = tuple(int(segment) for segment in __version__.split("."))
|
||||
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
PY3 = sys.version_info[0] == 3
|
||||
|
||||
if PY3:
|
||||
unicode = str
|
||||
|
||||
if sys.platform.startswith('java'):
|
||||
import platform
|
||||
os_name = platform.java_ver()[3][0]
|
||||
if os_name.startswith('Windows'): # "Windows XP", "Windows 7", etc.
|
||||
system = 'win32'
|
||||
elif os_name.startswith('Mac'): # "Mac OS X", etc.
|
||||
system = 'darwin'
|
||||
else: # "Linux", "SunOS", "FreeBSD", etc.
|
||||
# Setting this to "linux2" is not ideal, but only Windows or Mac
|
||||
# are actually checked for and the rest of the module expects
|
||||
# *sys.platform* style strings.
|
||||
system = 'linux2'
|
||||
elif sys.platform == 'cli' and os.name == 'nt':
|
||||
# Detect Windows in IronPython to match pip._internal.utils.compat.WINDOWS
|
||||
# Discussion: <https://github.com/pypa/pip/pull/7501>
|
||||
system = 'win32'
|
||||
else:
|
||||
system = sys.platform
|
||||
|
||||
|
||||
|
||||
def user_data_dir(appname=None, appauthor=None, version=None, roaming=False):
|
||||
r"""Return full path to the user-specific data dir for this application.
|
||||
|
||||
"appname" is the name of application.
|
||||
If None, just the system directory is returned.
|
||||
"appauthor" (only used on Windows) is the name of the
|
||||
appauthor or distributing body for this application. Typically
|
||||
it is the owning company name. This falls back to appname. You may
|
||||
pass False to disable it.
|
||||
"version" is an optional version path element to append to the
|
||||
path. You might want to use this if you want multiple versions
|
||||
of your app to be able to run independently. If used, this
|
||||
would typically be "<major>.<minor>".
|
||||
Only applied when appname is present.
|
||||
"roaming" (boolean, default False) can be set True to use the Windows
|
||||
roaming appdata directory. That means that for users on a Windows
|
||||
network setup for roaming profiles, this user data will be
|
||||
sync'd on login. See
|
||||
<http://technet.microsoft.com/en-us/library/cc766489(WS.10).aspx>
|
||||
for a discussion of issues.
|
||||
|
||||
Typical user data directories are:
|
||||
Mac OS X: ~/Library/Application Support/<AppName> # or ~/.config/<AppName>, if the other does not exist
|
||||
Unix: ~/.local/share/<AppName> # or in $XDG_DATA_HOME, if defined
|
||||
Win XP (not roaming): C:\Documents and Settings\<username>\Application Data\<AppAuthor>\<AppName>
|
||||
Win XP (roaming): C:\Documents and Settings\<username>\Local Settings\Application Data\<AppAuthor>\<AppName>
|
||||
Win 7 (not roaming): C:\Users\<username>\AppData\Local\<AppAuthor>\<AppName>
|
||||
Win 7 (roaming): C:\Users\<username>\AppData\Roaming\<AppAuthor>\<AppName>
|
||||
|
||||
For Unix, we follow the XDG spec and support $XDG_DATA_HOME.
|
||||
That means, by default "~/.local/share/<AppName>".
|
||||
"""
|
||||
if system == "win32":
|
||||
if appauthor is None:
|
||||
appauthor = appname
|
||||
const = roaming and "CSIDL_APPDATA" or "CSIDL_LOCAL_APPDATA"
|
||||
path = os.path.normpath(_get_win_folder(const))
|
||||
if appname:
|
||||
if appauthor is not False:
|
||||
path = os.path.join(path, appauthor, appname)
|
||||
else:
|
||||
path = os.path.join(path, appname)
|
||||
elif system == 'darwin':
|
||||
path = os.path.expanduser('~/Library/Application Support/')
|
||||
if appname:
|
||||
path = os.path.join(path, appname)
|
||||
else:
|
||||
path = os.getenv('XDG_DATA_HOME', os.path.expanduser("~/.local/share"))
|
||||
if appname:
|
||||
path = os.path.join(path, appname)
|
||||
if appname and version:
|
||||
path = os.path.join(path, version)
|
||||
return path
|
||||
|
||||
|
||||
def site_data_dir(appname=None, appauthor=None, version=None, multipath=False):
|
||||
r"""Return full path to the user-shared data dir for this application.
|
||||
|
||||
"appname" is the name of application.
|
||||
If None, just the system directory is returned.
|
||||
"appauthor" (only used on Windows) is the name of the
|
||||
appauthor or distributing body for this application. Typically
|
||||
it is the owning company name. This falls back to appname. You may
|
||||
pass False to disable it.
|
||||
"version" is an optional version path element to append to the
|
||||
path. You might want to use this if you want multiple versions
|
||||
of your app to be able to run independently. If used, this
|
||||
would typically be "<major>.<minor>".
|
||||
Only applied when appname is present.
|
||||
"multipath" is an optional parameter only applicable to *nix
|
||||
which indicates that the entire list of data dirs should be
|
||||
returned. By default, the first item from XDG_DATA_DIRS is
|
||||
returned, or '/usr/local/share/<AppName>',
|
||||
if XDG_DATA_DIRS is not set
|
||||
|
||||
Typical site data directories are:
|
||||
Mac OS X: /Library/Application Support/<AppName>
|
||||
Unix: /usr/local/share/<AppName> or /usr/share/<AppName>
|
||||
Win XP: C:\Documents and Settings\All Users\Application Data\<AppAuthor>\<AppName>
|
||||
Vista: (Fail! "C:\ProgramData" is a hidden *system* directory on Vista.)
|
||||
Win 7: C:\ProgramData\<AppAuthor>\<AppName> # Hidden, but writeable on Win 7.
|
||||
|
||||
For Unix, this is using the $XDG_DATA_DIRS[0] default.
|
||||
|
||||
WARNING: Do not use this on Windows. See the Vista-Fail note above for why.
|
||||
"""
|
||||
if system == "win32":
|
||||
if appauthor is None:
|
||||
appauthor = appname
|
||||
path = os.path.normpath(_get_win_folder("CSIDL_COMMON_APPDATA"))
|
||||
if appname:
|
||||
if appauthor is not False:
|
||||
path = os.path.join(path, appauthor, appname)
|
||||
else:
|
||||
path = os.path.join(path, appname)
|
||||
elif system == 'darwin':
|
||||
path = os.path.expanduser('/Library/Application Support')
|
||||
if appname:
|
||||
path = os.path.join(path, appname)
|
||||
else:
|
||||
# XDG default for $XDG_DATA_DIRS
|
||||
# only first, if multipath is False
|
||||
path = os.getenv('XDG_DATA_DIRS',
|
||||
os.pathsep.join(['/usr/local/share', '/usr/share']))
|
||||
pathlist = [os.path.expanduser(x.rstrip(os.sep)) for x in path.split(os.pathsep)]
|
||||
if appname:
|
||||
if version:
|
||||
appname = os.path.join(appname, version)
|
||||
pathlist = [os.path.join(x, appname) for x in pathlist]
|
||||
|
||||
if multipath:
|
||||
path = os.pathsep.join(pathlist)
|
||||
else:
|
||||
path = pathlist[0]
|
||||
return path
|
||||
|
||||
if appname and version:
|
||||
path = os.path.join(path, version)
|
||||
return path
|
||||
|
||||
|
||||
def user_config_dir(appname=None, appauthor=None, version=None, roaming=False):
|
||||
r"""Return full path to the user-specific config dir for this application.
|
||||
|
||||
"appname" is the name of application.
|
||||
If None, just the system directory is returned.
|
||||
"appauthor" (only used on Windows) is the name of the
|
||||
appauthor or distributing body for this application. Typically
|
||||
it is the owning company name. This falls back to appname. You may
|
||||
pass False to disable it.
|
||||
"version" is an optional version path element to append to the
|
||||
path. You might want to use this if you want multiple versions
|
||||
of your app to be able to run independently. If used, this
|
||||
would typically be "<major>.<minor>".
|
||||
Only applied when appname is present.
|
||||
"roaming" (boolean, default False) can be set True to use the Windows
|
||||
roaming appdata directory. That means that for users on a Windows
|
||||
network setup for roaming profiles, this user data will be
|
||||
sync'd on login. See
|
||||
<http://technet.microsoft.com/en-us/library/cc766489(WS.10).aspx>
|
||||
for a discussion of issues.
|
||||
|
||||
Typical user config directories are:
|
||||
Mac OS X: same as user_data_dir
|
||||
Unix: ~/.config/<AppName> # or in $XDG_CONFIG_HOME, if defined
|
||||
Win *: same as user_data_dir
|
||||
|
||||
For Unix, we follow the XDG spec and support $XDG_CONFIG_HOME.
|
||||
That means, by default "~/.config/<AppName>".
|
||||
"""
|
||||
if system in ["win32", "darwin"]:
|
||||
path = user_data_dir(appname, appauthor, None, roaming)
|
||||
else:
|
||||
path = os.getenv('XDG_CONFIG_HOME', os.path.expanduser("~/.config"))
|
||||
if appname:
|
||||
path = os.path.join(path, appname)
|
||||
if appname and version:
|
||||
path = os.path.join(path, version)
|
||||
return path
|
||||
|
||||
|
||||
# for the discussion regarding site_config_dir locations
|
||||
# see <https://github.com/pypa/pip/issues/1733>
|
||||
def site_config_dir(appname=None, appauthor=None, version=None, multipath=False):
|
||||
r"""Return full path to the user-shared data dir for this application.
|
||||
|
||||
"appname" is the name of application.
|
||||
If None, just the system directory is returned.
|
||||
"appauthor" (only used on Windows) is the name of the
|
||||
appauthor or distributing body for this application. Typically
|
||||
it is the owning company name. This falls back to appname. You may
|
||||
pass False to disable it.
|
||||
"version" is an optional version path element to append to the
|
||||
path. You might want to use this if you want multiple versions
|
||||
of your app to be able to run independently. If used, this
|
||||
would typically be "<major>.<minor>".
|
||||
Only applied when appname is present.
|
||||
"multipath" is an optional parameter only applicable to *nix
|
||||
which indicates that the entire list of config dirs should be
|
||||
returned. By default, the first item from XDG_CONFIG_DIRS is
|
||||
returned, or '/etc/xdg/<AppName>', if XDG_CONFIG_DIRS is not set
|
||||
|
||||
Typical site config directories are:
|
||||
Mac OS X: same as site_data_dir
|
||||
Unix: /etc/xdg/<AppName> or $XDG_CONFIG_DIRS[i]/<AppName> for each value in
|
||||
$XDG_CONFIG_DIRS
|
||||
Win *: same as site_data_dir
|
||||
Vista: (Fail! "C:\ProgramData" is a hidden *system* directory on Vista.)
|
||||
|
||||
For Unix, this is using the $XDG_CONFIG_DIRS[0] default, if multipath=False
|
||||
|
||||
WARNING: Do not use this on Windows. See the Vista-Fail note above for why.
|
||||
"""
|
||||
if system in ["win32", "darwin"]:
|
||||
path = site_data_dir(appname, appauthor)
|
||||
if appname and version:
|
||||
path = os.path.join(path, version)
|
||||
else:
|
||||
# XDG default for $XDG_CONFIG_DIRS (missing or empty)
|
||||
# see <https://github.com/pypa/pip/pull/7501#discussion_r360624829>
|
||||
# only first, if multipath is False
|
||||
path = os.getenv('XDG_CONFIG_DIRS') or '/etc/xdg'
|
||||
pathlist = [os.path.expanduser(x.rstrip(os.sep)) for x in path.split(os.pathsep) if x]
|
||||
if appname:
|
||||
if version:
|
||||
appname = os.path.join(appname, version)
|
||||
pathlist = [os.path.join(x, appname) for x in pathlist]
|
||||
|
||||
if multipath:
|
||||
path = os.pathsep.join(pathlist)
|
||||
else:
|
||||
path = pathlist[0]
|
||||
return path
|
||||
|
||||
|
||||
def user_cache_dir(appname=None, appauthor=None, version=None, opinion=True):
|
||||
r"""Return full path to the user-specific cache dir for this application.
|
||||
|
||||
"appname" is the name of application.
|
||||
If None, just the system directory is returned.
|
||||
"appauthor" (only used on Windows) is the name of the
|
||||
appauthor or distributing body for this application. Typically
|
||||
it is the owning company name. This falls back to appname. You may
|
||||
pass False to disable it.
|
||||
"version" is an optional version path element to append to the
|
||||
path. You might want to use this if you want multiple versions
|
||||
of your app to be able to run independently. If used, this
|
||||
would typically be "<major>.<minor>".
|
||||
Only applied when appname is present.
|
||||
"opinion" (boolean) can be False to disable the appending of
|
||||
"Cache" to the base app data dir for Windows. See
|
||||
discussion below.
|
||||
|
||||
Typical user cache directories are:
|
||||
Mac OS X: ~/Library/Caches/<AppName>
|
||||
Unix: ~/.cache/<AppName> (XDG default)
|
||||
Win XP: C:\Documents and Settings\<username>\Local Settings\Application Data\<AppAuthor>\<AppName>\Cache
|
||||
Vista: C:\Users\<username>\AppData\Local\<AppAuthor>\<AppName>\Cache
|
||||
|
||||
On Windows the only suggestion in the MSDN docs is that local settings go in
|
||||
the `CSIDL_LOCAL_APPDATA` directory. This is identical to the non-roaming
|
||||
app data dir (the default returned by `user_data_dir` above). Apps typically
|
||||
put cache data somewhere *under* the given dir here. Some examples:
|
||||
...\Mozilla\Firefox\Profiles\<ProfileName>\Cache
|
||||
...\Acme\SuperApp\Cache\1.0
|
||||
OPINION: This function appends "Cache" to the `CSIDL_LOCAL_APPDATA` value.
|
||||
This can be disabled with the `opinion=False` option.
|
||||
"""
|
||||
if system == "win32":
|
||||
if appauthor is None:
|
||||
appauthor = appname
|
||||
path = os.path.normpath(_get_win_folder("CSIDL_LOCAL_APPDATA"))
|
||||
# When using Python 2, return paths as bytes on Windows like we do on
|
||||
# other operating systems. See helper function docs for more details.
|
||||
if not PY3 and isinstance(path, unicode):
|
||||
path = _win_path_to_bytes(path)
|
||||
if appname:
|
||||
if appauthor is not False:
|
||||
path = os.path.join(path, appauthor, appname)
|
||||
else:
|
||||
path = os.path.join(path, appname)
|
||||
if opinion:
|
||||
path = os.path.join(path, "Cache")
|
||||
elif system == 'darwin':
|
||||
path = os.path.expanduser('~/Library/Caches')
|
||||
if appname:
|
||||
path = os.path.join(path, appname)
|
||||
else:
|
||||
path = os.getenv('XDG_CACHE_HOME', os.path.expanduser('~/.cache'))
|
||||
if appname:
|
||||
path = os.path.join(path, appname)
|
||||
if appname and version:
|
||||
path = os.path.join(path, version)
|
||||
return path
|
||||
|
||||
|
||||
def user_state_dir(appname=None, appauthor=None, version=None, roaming=False):
|
||||
r"""Return full path to the user-specific state dir for this application.
|
||||
|
||||
"appname" is the name of application.
|
||||
If None, just the system directory is returned.
|
||||
"appauthor" (only used on Windows) is the name of the
|
||||
appauthor or distributing body for this application. Typically
|
||||
it is the owning company name. This falls back to appname. You may
|
||||
pass False to disable it.
|
||||
"version" is an optional version path element to append to the
|
||||
path. You might want to use this if you want multiple versions
|
||||
of your app to be able to run independently. If used, this
|
||||
would typically be "<major>.<minor>".
|
||||
Only applied when appname is present.
|
||||
"roaming" (boolean, default False) can be set True to use the Windows
|
||||
roaming appdata directory. That means that for users on a Windows
|
||||
network setup for roaming profiles, this user data will be
|
||||
sync'd on login. See
|
||||
<http://technet.microsoft.com/en-us/library/cc766489(WS.10).aspx>
|
||||
for a discussion of issues.
|
||||
|
||||
Typical user state directories are:
|
||||
Mac OS X: same as user_data_dir
|
||||
Unix: ~/.local/state/<AppName> # or in $XDG_STATE_HOME, if defined
|
||||
Win *: same as user_data_dir
|
||||
|
||||
For Unix, we follow this Debian proposal <https://wiki.debian.org/XDGBaseDirectorySpecification#state>
|
||||
to extend the XDG spec and support $XDG_STATE_HOME.
|
||||
|
||||
That means, by default "~/.local/state/<AppName>".
|
||||
"""
|
||||
if system in ["win32", "darwin"]:
|
||||
path = user_data_dir(appname, appauthor, None, roaming)
|
||||
else:
|
||||
path = os.getenv('XDG_STATE_HOME', os.path.expanduser("~/.local/state"))
|
||||
if appname:
|
||||
path = os.path.join(path, appname)
|
||||
if appname and version:
|
||||
path = os.path.join(path, version)
|
||||
return path
|
||||
|
||||
|
||||
def user_log_dir(appname=None, appauthor=None, version=None, opinion=True):
|
||||
r"""Return full path to the user-specific log dir for this application.
|
||||
|
||||
"appname" is the name of application.
|
||||
If None, just the system directory is returned.
|
||||
"appauthor" (only used on Windows) is the name of the
|
||||
appauthor or distributing body for this application. Typically
|
||||
it is the owning company name. This falls back to appname. You may
|
||||
pass False to disable it.
|
||||
"version" is an optional version path element to append to the
|
||||
path. You might want to use this if you want multiple versions
|
||||
of your app to be able to run independently. If used, this
|
||||
would typically be "<major>.<minor>".
|
||||
Only applied when appname is present.
|
||||
"opinion" (boolean) can be False to disable the appending of
|
||||
"Logs" to the base app data dir for Windows, and "log" to the
|
||||
base cache dir for Unix. See discussion below.
|
||||
|
||||
Typical user log directories are:
|
||||
Mac OS X: ~/Library/Logs/<AppName>
|
||||
Unix: ~/.cache/<AppName>/log # or under $XDG_CACHE_HOME if defined
|
||||
Win XP: C:\Documents and Settings\<username>\Local Settings\Application Data\<AppAuthor>\<AppName>\Logs
|
||||
Vista: C:\Users\<username>\AppData\Local\<AppAuthor>\<AppName>\Logs
|
||||
|
||||
On Windows the only suggestion in the MSDN docs is that local settings
|
||||
go in the `CSIDL_LOCAL_APPDATA` directory. (Note: I'm interested in
|
||||
examples of what some windows apps use for a logs dir.)
|
||||
|
||||
OPINION: This function appends "Logs" to the `CSIDL_LOCAL_APPDATA`
|
||||
value for Windows and appends "log" to the user cache dir for Unix.
|
||||
This can be disabled with the `opinion=False` option.
|
||||
"""
|
||||
if system == "darwin":
|
||||
path = os.path.join(
|
||||
os.path.expanduser('~/Library/Logs'),
|
||||
appname)
|
||||
elif system == "win32":
|
||||
path = user_data_dir(appname, appauthor, version)
|
||||
version = False
|
||||
if opinion:
|
||||
path = os.path.join(path, "Logs")
|
||||
else:
|
||||
path = user_cache_dir(appname, appauthor, version)
|
||||
version = False
|
||||
if opinion:
|
||||
path = os.path.join(path, "log")
|
||||
if appname and version:
|
||||
path = os.path.join(path, version)
|
||||
return path
|
||||
|
||||
|
||||
class AppDirs(object):
|
||||
"""Convenience wrapper for getting application dirs."""
|
||||
def __init__(self, appname=None, appauthor=None, version=None,
|
||||
roaming=False, multipath=False):
|
||||
self.appname = appname
|
||||
self.appauthor = appauthor
|
||||
self.version = version
|
||||
self.roaming = roaming
|
||||
self.multipath = multipath
|
||||
|
||||
@property
|
||||
def user_data_dir(self):
|
||||
return user_data_dir(self.appname, self.appauthor,
|
||||
version=self.version, roaming=self.roaming)
|
||||
|
||||
@property
|
||||
def site_data_dir(self):
|
||||
return site_data_dir(self.appname, self.appauthor,
|
||||
version=self.version, multipath=self.multipath)
|
||||
|
||||
@property
|
||||
def user_config_dir(self):
|
||||
return user_config_dir(self.appname, self.appauthor,
|
||||
version=self.version, roaming=self.roaming)
|
||||
|
||||
@property
|
||||
def site_config_dir(self):
|
||||
return site_config_dir(self.appname, self.appauthor,
|
||||
version=self.version, multipath=self.multipath)
|
||||
|
||||
@property
|
||||
def user_cache_dir(self):
|
||||
return user_cache_dir(self.appname, self.appauthor,
|
||||
version=self.version)
|
||||
|
||||
@property
|
||||
def user_state_dir(self):
|
||||
return user_state_dir(self.appname, self.appauthor,
|
||||
version=self.version)
|
||||
|
||||
@property
|
||||
def user_log_dir(self):
|
||||
return user_log_dir(self.appname, self.appauthor,
|
||||
version=self.version)
|
||||
|
||||
|
||||
#---- internal support stuff
|
||||
|
||||
def _get_win_folder_from_registry(csidl_name):
|
||||
"""This is a fallback technique at best. I'm not sure if using the
|
||||
registry for this guarantees us the correct answer for all CSIDL_*
|
||||
names.
|
||||
"""
|
||||
if PY3:
|
||||
import winreg as _winreg
|
||||
else:
|
||||
import _winreg
|
||||
|
||||
shell_folder_name = {
|
||||
"CSIDL_APPDATA": "AppData",
|
||||
"CSIDL_COMMON_APPDATA": "Common AppData",
|
||||
"CSIDL_LOCAL_APPDATA": "Local AppData",
|
||||
}[csidl_name]
|
||||
|
||||
key = _winreg.OpenKey(
|
||||
_winreg.HKEY_CURRENT_USER,
|
||||
r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders"
|
||||
)
|
||||
dir, type = _winreg.QueryValueEx(key, shell_folder_name)
|
||||
return dir
|
||||
|
||||
|
||||
def _get_win_folder_with_pywin32(csidl_name):
|
||||
from win32com.shell import shellcon, shell
|
||||
dir = shell.SHGetFolderPath(0, getattr(shellcon, csidl_name), 0, 0)
|
||||
# Try to make this a unicode path because SHGetFolderPath does
|
||||
# not return unicode strings when there is unicode data in the
|
||||
# path.
|
||||
try:
|
||||
dir = unicode(dir)
|
||||
|
||||
# Downgrade to short path name if have highbit chars. See
|
||||
# <http://bugs.activestate.com/show_bug.cgi?id=85099>.
|
||||
has_high_char = False
|
||||
for c in dir:
|
||||
if ord(c) > 255:
|
||||
has_high_char = True
|
||||
break
|
||||
if has_high_char:
|
||||
try:
|
||||
import win32api
|
||||
dir = win32api.GetShortPathName(dir)
|
||||
except ImportError:
|
||||
pass
|
||||
except UnicodeError:
|
||||
pass
|
||||
return dir
|
||||
|
||||
|
||||
def _get_win_folder_with_ctypes(csidl_name):
|
||||
import ctypes
|
||||
|
||||
csidl_const = {
|
||||
"CSIDL_APPDATA": 26,
|
||||
"CSIDL_COMMON_APPDATA": 35,
|
||||
"CSIDL_LOCAL_APPDATA": 28,
|
||||
}[csidl_name]
|
||||
|
||||
buf = ctypes.create_unicode_buffer(1024)
|
||||
ctypes.windll.shell32.SHGetFolderPathW(None, csidl_const, None, 0, buf)
|
||||
|
||||
# Downgrade to short path name if have highbit chars. See
|
||||
# <http://bugs.activestate.com/show_bug.cgi?id=85099>.
|
||||
has_high_char = False
|
||||
for c in buf:
|
||||
if ord(c) > 255:
|
||||
has_high_char = True
|
||||
break
|
||||
if has_high_char:
|
||||
buf2 = ctypes.create_unicode_buffer(1024)
|
||||
if ctypes.windll.kernel32.GetShortPathNameW(buf.value, buf2, 1024):
|
||||
buf = buf2
|
||||
|
||||
return buf.value
|
||||
|
||||
def _get_win_folder_with_jna(csidl_name):
|
||||
import array
|
||||
from com.sun import jna
|
||||
from com.sun.jna.platform import win32
|
||||
|
||||
buf_size = win32.WinDef.MAX_PATH * 2
|
||||
buf = array.zeros('c', buf_size)
|
||||
shell = win32.Shell32.INSTANCE
|
||||
shell.SHGetFolderPath(None, getattr(win32.ShlObj, csidl_name), None, win32.ShlObj.SHGFP_TYPE_CURRENT, buf)
|
||||
dir = jna.Native.toString(buf.tostring()).rstrip("\0")
|
||||
|
||||
# Downgrade to short path name if have highbit chars. See
|
||||
# <http://bugs.activestate.com/show_bug.cgi?id=85099>.
|
||||
has_high_char = False
|
||||
for c in dir:
|
||||
if ord(c) > 255:
|
||||
has_high_char = True
|
||||
break
|
||||
if has_high_char:
|
||||
buf = array.zeros('c', buf_size)
|
||||
kernel = win32.Kernel32.INSTANCE
|
||||
if kernel.GetShortPathName(dir, buf, buf_size):
|
||||
dir = jna.Native.toString(buf.tostring()).rstrip("\0")
|
||||
|
||||
return dir
|
||||
|
||||
if system == "win32":
|
||||
try:
|
||||
from ctypes import windll
|
||||
_get_win_folder = _get_win_folder_with_ctypes
|
||||
except ImportError:
|
||||
try:
|
||||
import com.sun.jna
|
||||
_get_win_folder = _get_win_folder_with_jna
|
||||
except ImportError:
|
||||
_get_win_folder = _get_win_folder_from_registry
|
||||
|
||||
|
||||
def _win_path_to_bytes(path):
|
||||
"""Encode Windows paths to bytes. Only used on Python 2.
|
||||
|
||||
Motivation is to be consistent with other operating systems where paths
|
||||
are also returned as bytes. This avoids problems mixing bytes and Unicode
|
||||
elsewhere in the codebase. For more details and discussion see
|
||||
<https://github.com/pypa/pip/issues/3463>.
|
||||
|
||||
If encoding using ASCII and MBCS fails, return the original Unicode path.
|
||||
"""
|
||||
for encoding in ('ASCII', 'MBCS'):
|
||||
try:
|
||||
return path.encode(encoding)
|
||||
except (UnicodeEncodeError, LookupError):
|
||||
pass
|
||||
return path
|
||||
|
||||
|
||||
#---- self test code
|
||||
|
||||
if __name__ == "__main__":
|
||||
appname = "MyApp"
|
||||
appauthor = "MyCompany"
|
||||
|
||||
props = ("user_data_dir",
|
||||
"user_config_dir",
|
||||
"user_cache_dir",
|
||||
"user_state_dir",
|
||||
"user_log_dir",
|
||||
"site_data_dir",
|
||||
"site_config_dir")
|
||||
|
||||
print("-- app dirs %s --" % __version__)
|
||||
|
||||
print("-- app dirs (with optional 'version')")
|
||||
dirs = AppDirs(appname, appauthor, version="1.0")
|
||||
for prop in props:
|
||||
print("%s: %s" % (prop, getattr(dirs, prop)))
|
||||
|
||||
print("\n-- app dirs (without optional 'version')")
|
||||
dirs = AppDirs(appname, appauthor)
|
||||
for prop in props:
|
||||
print("%s: %s" % (prop, getattr(dirs, prop)))
|
||||
|
||||
print("\n-- app dirs (without optional 'appauthor')")
|
||||
dirs = AppDirs(appname)
|
||||
for prop in props:
|
||||
print("%s: %s" % (prop, getattr(dirs, prop)))
|
||||
|
||||
print("\n-- app dirs (with disabled 'appauthor')")
|
||||
dirs = AppDirs(appname, appauthor=False)
|
||||
for prop in props:
|
||||
print("%s: %s" % (prop, getattr(dirs, prop)))
|
||||
@@ -1,11 +1,18 @@
|
||||
# SPDX-FileCopyrightText: 2015 Eric Larson
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
"""CacheControl import Interface.
|
||||
|
||||
Make it easy to import from cachecontrol without long namespaces.
|
||||
"""
|
||||
__author__ = "Eric Larson"
|
||||
__email__ = "eric@ionrock.org"
|
||||
__version__ = "0.12.6"
|
||||
__version__ = "0.12.11"
|
||||
|
||||
from .wrapper import CacheControl
|
||||
from .adapter import CacheControlAdapter
|
||||
from .controller import CacheController
|
||||
|
||||
import logging
|
||||
logging.getLogger(__name__).addHandler(logging.NullHandler())
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
# SPDX-FileCopyrightText: 2015 Eric Larson
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import logging
|
||||
|
||||
from pip._vendor import requests
|
||||
|
||||
@@ -1,16 +1,20 @@
|
||||
# SPDX-FileCopyrightText: 2015 Eric Larson
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import types
|
||||
import functools
|
||||
import zlib
|
||||
|
||||
from pip._vendor.requests.adapters import HTTPAdapter
|
||||
|
||||
from .controller import CacheController
|
||||
from .controller import CacheController, PERMANENT_REDIRECT_STATUSES
|
||||
from .cache import DictCache
|
||||
from .filewrapper import CallbackFileWrapper
|
||||
|
||||
|
||||
class CacheControlAdapter(HTTPAdapter):
|
||||
invalidating_methods = {"PUT", "DELETE"}
|
||||
invalidating_methods = {"PUT", "PATCH", "DELETE"}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -93,7 +97,7 @@ class CacheControlAdapter(HTTPAdapter):
|
||||
response = cached_response
|
||||
|
||||
# We always cache the 301 responses
|
||||
elif response.status == 301:
|
||||
elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
|
||||
self.controller.cache_response(request, response)
|
||||
else:
|
||||
# Wrap the response file with a wrapper that will cache the
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
# SPDX-FileCopyrightText: 2015 Eric Larson
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
"""
|
||||
The cache object API for implementing caches. The default is a thread
|
||||
safe in-memory dictionary.
|
||||
@@ -10,7 +14,7 @@ class BaseCache(object):
|
||||
def get(self, key):
|
||||
raise NotImplementedError()
|
||||
|
||||
def set(self, key, value):
|
||||
def set(self, key, value, expires=None):
|
||||
raise NotImplementedError()
|
||||
|
||||
def delete(self, key):
|
||||
@@ -29,7 +33,7 @@ class DictCache(BaseCache):
|
||||
def get(self, key):
|
||||
return self.data.get(key, None)
|
||||
|
||||
def set(self, key, value):
|
||||
def set(self, key, value, expires=None):
|
||||
with self.lock:
|
||||
self.data.update({key: value})
|
||||
|
||||
@@ -37,3 +41,25 @@ class DictCache(BaseCache):
|
||||
with self.lock:
|
||||
if key in self.data:
|
||||
self.data.pop(key)
|
||||
|
||||
|
||||
class SeparateBodyBaseCache(BaseCache):
|
||||
"""
|
||||
In this variant, the body is not stored mixed in with the metadata, but is
|
||||
passed in (as a bytes-like object) in a separate call to ``set_body()``.
|
||||
|
||||
That is, the expected interaction pattern is::
|
||||
|
||||
cache.set(key, serialized_metadata)
|
||||
cache.set_body(key)
|
||||
|
||||
Similarly, the body should be loaded separately via ``get_body()``.
|
||||
"""
|
||||
def set_body(self, key, body):
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_body(self, key):
|
||||
"""
|
||||
Return the body as file-like object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
@@ -1,2 +1,9 @@
|
||||
from .file_cache import FileCache # noqa
|
||||
from .redis_cache import RedisCache # noqa
|
||||
# SPDX-FileCopyrightText: 2015 Eric Larson
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from .file_cache import FileCache, SeparateBodyFileCache
|
||||
from .redis_cache import RedisCache
|
||||
|
||||
|
||||
__all__ = ["FileCache", "SeparateBodyFileCache", "RedisCache"]
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
# SPDX-FileCopyrightText: 2015 Eric Larson
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
from textwrap import dedent
|
||||
|
||||
from ..cache import BaseCache
|
||||
from ..cache import BaseCache, SeparateBodyBaseCache
|
||||
from ..controller import CacheController
|
||||
|
||||
try:
|
||||
@@ -53,7 +57,8 @@ def _secure_open_write(filename, fmode):
|
||||
raise
|
||||
|
||||
|
||||
class FileCache(BaseCache):
|
||||
class _FileCacheMixin:
|
||||
"""Shared implementation for both FileCache variants."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -114,22 +119,27 @@ class FileCache(BaseCache):
|
||||
except FileNotFoundError:
|
||||
return None
|
||||
|
||||
def set(self, key, value):
|
||||
def set(self, key, value, expires=None):
|
||||
name = self._fn(key)
|
||||
self._write(name, value)
|
||||
|
||||
def _write(self, path, data: bytes):
|
||||
"""
|
||||
Safely write the data to the given path.
|
||||
"""
|
||||
# Make sure the directory exists
|
||||
try:
|
||||
os.makedirs(os.path.dirname(name), self.dirmode)
|
||||
os.makedirs(os.path.dirname(path), self.dirmode)
|
||||
except (IOError, OSError):
|
||||
pass
|
||||
|
||||
with self.lock_class(name) as lock:
|
||||
with self.lock_class(path) as lock:
|
||||
# Write our actual file
|
||||
with _secure_open_write(lock.path, self.filemode) as fh:
|
||||
fh.write(value)
|
||||
fh.write(data)
|
||||
|
||||
def delete(self, key):
|
||||
name = self._fn(key)
|
||||
def _delete(self, key, suffix):
|
||||
name = self._fn(key) + suffix
|
||||
if not self.forever:
|
||||
try:
|
||||
os.remove(name)
|
||||
@@ -137,6 +147,38 @@ class FileCache(BaseCache):
|
||||
pass
|
||||
|
||||
|
||||
class FileCache(_FileCacheMixin, BaseCache):
|
||||
"""
|
||||
Traditional FileCache: body is stored in memory, so not suitable for large
|
||||
downloads.
|
||||
"""
|
||||
|
||||
def delete(self, key):
|
||||
self._delete(key, "")
|
||||
|
||||
|
||||
class SeparateBodyFileCache(_FileCacheMixin, SeparateBodyBaseCache):
|
||||
"""
|
||||
Memory-efficient FileCache: body is stored in a separate file, reducing
|
||||
peak memory usage.
|
||||
"""
|
||||
|
||||
def get_body(self, key):
|
||||
name = self._fn(key) + ".body"
|
||||
try:
|
||||
return open(name, "rb")
|
||||
except FileNotFoundError:
|
||||
return None
|
||||
|
||||
def set_body(self, key, body):
|
||||
name = self._fn(key) + ".body"
|
||||
self._write(name, body)
|
||||
|
||||
def delete(self, key):
|
||||
self._delete(key, "")
|
||||
self._delete(key, ".body")
|
||||
|
||||
|
||||
def url_to_file_path(url, filecache):
|
||||
"""Return the file cache path based on the URL.
|
||||
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
# SPDX-FileCopyrightText: 2015 Eric Larson
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from __future__ import division
|
||||
|
||||
from datetime import datetime
|
||||
@@ -15,9 +19,11 @@ class RedisCache(BaseCache):
|
||||
def set(self, key, value, expires=None):
|
||||
if not expires:
|
||||
self.conn.set(key, value)
|
||||
else:
|
||||
elif isinstance(expires, datetime):
|
||||
expires = expires - datetime.utcnow()
|
||||
self.conn.setex(key, int(expires.total_seconds()), value)
|
||||
else:
|
||||
self.conn.setex(key, expires, value)
|
||||
|
||||
def delete(self, key):
|
||||
self.conn.delete(key)
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
# SPDX-FileCopyrightText: 2015 Eric Larson
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
try:
|
||||
from urllib.parse import urljoin
|
||||
except ImportError:
|
||||
@@ -9,7 +13,6 @@ try:
|
||||
except ImportError:
|
||||
import pickle
|
||||
|
||||
|
||||
# Handle the case where the requests module has been patched to not have
|
||||
# urllib3 bundled as part of its source.
|
||||
try:
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
# SPDX-FileCopyrightText: 2015 Eric Larson
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
"""
|
||||
The httplib2 algorithms ported for use with requests.
|
||||
"""
|
||||
@@ -9,7 +13,7 @@ from email.utils import parsedate_tz
|
||||
|
||||
from pip._vendor.requests.structures import CaseInsensitiveDict
|
||||
|
||||
from .cache import DictCache
|
||||
from .cache import DictCache, SeparateBodyBaseCache
|
||||
from .serialize import Serializer
|
||||
|
||||
|
||||
@@ -17,19 +21,20 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
|
||||
|
||||
PERMANENT_REDIRECT_STATUSES = (301, 308)
|
||||
|
||||
|
||||
def parse_uri(uri):
|
||||
"""Parses a URI using the regex given in Appendix B of RFC 3986.
|
||||
|
||||
(scheme, authority, path, query, fragment) = parse_uri(uri)
|
||||
(scheme, authority, path, query, fragment) = parse_uri(uri)
|
||||
"""
|
||||
groups = URI.match(uri).groups()
|
||||
return (groups[1], groups[3], groups[4], groups[6], groups[8])
|
||||
|
||||
|
||||
class CacheController(object):
|
||||
"""An interface to see if request should cached or not.
|
||||
"""
|
||||
"""An interface to see if request should cached or not."""
|
||||
|
||||
def __init__(
|
||||
self, cache=None, cache_etags=True, serializer=None, status_codes=None
|
||||
@@ -37,7 +42,7 @@ class CacheController(object):
|
||||
self.cache = DictCache() if cache is None else cache
|
||||
self.cache_etags = cache_etags
|
||||
self.serializer = serializer or Serializer()
|
||||
self.cacheable_status_codes = status_codes or (200, 203, 300, 301)
|
||||
self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308)
|
||||
|
||||
@classmethod
|
||||
def _urlnorm(cls, uri):
|
||||
@@ -141,23 +146,29 @@ class CacheController(object):
|
||||
logger.debug("No cache entry available")
|
||||
return False
|
||||
|
||||
if isinstance(self.cache, SeparateBodyBaseCache):
|
||||
body_file = self.cache.get_body(cache_url)
|
||||
else:
|
||||
body_file = None
|
||||
|
||||
# Check whether it can be deserialized
|
||||
resp = self.serializer.loads(request, cache_data)
|
||||
resp = self.serializer.loads(request, cache_data, body_file)
|
||||
if not resp:
|
||||
logger.warning("Cache entry deserialization failed, entry ignored")
|
||||
return False
|
||||
|
||||
# If we have a cached 301, return it immediately. We don't
|
||||
# need to test our response for other headers b/c it is
|
||||
# If we have a cached permanent redirect, return it immediately. We
|
||||
# don't need to test our response for other headers b/c it is
|
||||
# intrinsically "cacheable" as it is Permanent.
|
||||
#
|
||||
# See:
|
||||
# https://tools.ietf.org/html/rfc7231#section-6.4.2
|
||||
#
|
||||
# Client can try to refresh the value by repeating the request
|
||||
# with cache busting headers as usual (ie no-cache).
|
||||
if resp.status == 301:
|
||||
if int(resp.status) in PERMANENT_REDIRECT_STATUSES:
|
||||
msg = (
|
||||
'Returning cached "301 Moved Permanently" response '
|
||||
"Returning cached permanent redirect response "
|
||||
"(ignoring date and etag information)"
|
||||
)
|
||||
logger.debug(msg)
|
||||
@@ -244,6 +255,26 @@ class CacheController(object):
|
||||
|
||||
return new_headers
|
||||
|
||||
def _cache_set(self, cache_url, request, response, body=None, expires_time=None):
|
||||
"""
|
||||
Store the data in the cache.
|
||||
"""
|
||||
if isinstance(self.cache, SeparateBodyBaseCache):
|
||||
# We pass in the body separately; just put a placeholder empty
|
||||
# string in the metadata.
|
||||
self.cache.set(
|
||||
cache_url,
|
||||
self.serializer.dumps(request, response, b""),
|
||||
expires=expires_time,
|
||||
)
|
||||
self.cache.set_body(cache_url, body)
|
||||
else:
|
||||
self.cache.set(
|
||||
cache_url,
|
||||
self.serializer.dumps(request, response, body),
|
||||
expires=expires_time,
|
||||
)
|
||||
|
||||
def cache_response(self, request, response, body=None, status_codes=None):
|
||||
"""
|
||||
Algorithm for caching requests.
|
||||
@@ -261,6 +292,11 @@ class CacheController(object):
|
||||
|
||||
response_headers = CaseInsensitiveDict(response.headers)
|
||||
|
||||
if "date" in response_headers:
|
||||
date = calendar.timegm(parsedate_tz(response_headers["date"]))
|
||||
else:
|
||||
date = 0
|
||||
|
||||
# If we've been given a body, our response has a Content-Length, that
|
||||
# Content-Length is valid then we can check to see if the body we've
|
||||
# been given matches the expected size, and if it doesn't we'll just
|
||||
@@ -304,35 +340,62 @@ class CacheController(object):
|
||||
|
||||
# If we've been given an etag, then keep the response
|
||||
if self.cache_etags and "etag" in response_headers:
|
||||
logger.debug("Caching due to etag")
|
||||
self.cache.set(
|
||||
cache_url, self.serializer.dumps(request, response, body=body)
|
||||
)
|
||||
expires_time = 0
|
||||
if response_headers.get("expires"):
|
||||
expires = parsedate_tz(response_headers["expires"])
|
||||
if expires is not None:
|
||||
expires_time = calendar.timegm(expires) - date
|
||||
|
||||
# Add to the cache any 301s. We do this before looking that
|
||||
# the Date headers.
|
||||
elif response.status == 301:
|
||||
logger.debug("Caching permanant redirect")
|
||||
self.cache.set(cache_url, self.serializer.dumps(request, response))
|
||||
expires_time = max(expires_time, 14 * 86400)
|
||||
|
||||
logger.debug("etag object cached for {0} seconds".format(expires_time))
|
||||
logger.debug("Caching due to etag")
|
||||
self._cache_set(cache_url, request, response, body, expires_time)
|
||||
|
||||
# Add to the cache any permanent redirects. We do this before looking
|
||||
# that the Date headers.
|
||||
elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
|
||||
logger.debug("Caching permanent redirect")
|
||||
self._cache_set(cache_url, request, response, b"")
|
||||
|
||||
# Add to the cache if the response headers demand it. If there
|
||||
# is no date header then we can't do anything about expiring
|
||||
# the cache.
|
||||
elif "date" in response_headers:
|
||||
date = calendar.timegm(parsedate_tz(response_headers["date"]))
|
||||
# cache when there is a max-age > 0
|
||||
if "max-age" in cc and cc["max-age"] > 0:
|
||||
logger.debug("Caching b/c date exists and max-age > 0")
|
||||
self.cache.set(
|
||||
cache_url, self.serializer.dumps(request, response, body=body)
|
||||
expires_time = cc["max-age"]
|
||||
self._cache_set(
|
||||
cache_url,
|
||||
request,
|
||||
response,
|
||||
body,
|
||||
expires_time,
|
||||
)
|
||||
|
||||
# If the request can expire, it means we should cache it
|
||||
# in the meantime.
|
||||
elif "expires" in response_headers:
|
||||
if response_headers["expires"]:
|
||||
logger.debug("Caching b/c of expires header")
|
||||
self.cache.set(
|
||||
cache_url, self.serializer.dumps(request, response, body=body)
|
||||
expires = parsedate_tz(response_headers["expires"])
|
||||
if expires is not None:
|
||||
expires_time = calendar.timegm(expires) - date
|
||||
else:
|
||||
expires_time = None
|
||||
|
||||
logger.debug(
|
||||
"Caching b/c of expires header. expires in {0} seconds".format(
|
||||
expires_time
|
||||
)
|
||||
)
|
||||
self._cache_set(
|
||||
cache_url,
|
||||
request,
|
||||
response,
|
||||
body,
|
||||
expires_time,
|
||||
)
|
||||
|
||||
def update_cached_response(self, request, response):
|
||||
@@ -371,6 +434,6 @@ class CacheController(object):
|
||||
cached_response.status = 200
|
||||
|
||||
# update our cache
|
||||
self.cache.set(cache_url, self.serializer.dumps(request, cached_response))
|
||||
self._cache_set(cache_url, request, cached_response)
|
||||
|
||||
return cached_response
|
||||
|
||||
@@ -1,4 +1,9 @@
|
||||
from io import BytesIO
|
||||
# SPDX-FileCopyrightText: 2015 Eric Larson
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from tempfile import NamedTemporaryFile
|
||||
import mmap
|
||||
|
||||
|
||||
class CallbackFileWrapper(object):
|
||||
@@ -11,10 +16,17 @@ class CallbackFileWrapper(object):
|
||||
|
||||
This class uses members with a double underscore (__) leading prefix so as
|
||||
not to accidentally shadow an attribute.
|
||||
|
||||
The data is stored in a temporary file until it is all available. As long
|
||||
as the temporary files directory is disk-based (sometimes it's a
|
||||
memory-backed-``tmpfs`` on Linux), data will be unloaded to disk if memory
|
||||
pressure is high. For small files the disk usually won't be used at all,
|
||||
it'll all be in the filesystem memory cache, so there should be no
|
||||
performance impact.
|
||||
"""
|
||||
|
||||
def __init__(self, fp, callback):
|
||||
self.__buf = BytesIO()
|
||||
self.__buf = NamedTemporaryFile("rb+", delete=True)
|
||||
self.__fp = fp
|
||||
self.__callback = callback
|
||||
|
||||
@@ -49,7 +61,19 @@ class CallbackFileWrapper(object):
|
||||
|
||||
def _close(self):
|
||||
if self.__callback:
|
||||
self.__callback(self.__buf.getvalue())
|
||||
if self.__buf.tell() == 0:
|
||||
# Empty file:
|
||||
result = b""
|
||||
else:
|
||||
# Return the data without actually loading it into memory,
|
||||
# relying on Python's buffer API and mmap(). mmap() just gives
|
||||
# a view directly into the filesystem's memory cache, so it
|
||||
# doesn't result in duplicate memory use.
|
||||
self.__buf.seek(0, 0)
|
||||
result = memoryview(
|
||||
mmap.mmap(self.__buf.fileno(), 0, access=mmap.ACCESS_READ)
|
||||
)
|
||||
self.__callback(result)
|
||||
|
||||
# We assign this to None here, because otherwise we can get into
|
||||
# really tricky problems where the CPython interpreter dead locks
|
||||
@@ -58,9 +82,16 @@ class CallbackFileWrapper(object):
|
||||
# and allows the garbage collector to do it's thing normally.
|
||||
self.__callback = None
|
||||
|
||||
# Closing the temporary file releases memory and frees disk space.
|
||||
# Important when caching big files.
|
||||
self.__buf.close()
|
||||
|
||||
def read(self, amt=None):
|
||||
data = self.__fp.read(amt)
|
||||
self.__buf.write(data)
|
||||
if data:
|
||||
# We may be dealing with b'', a sign that things are over:
|
||||
# it's passed e.g. after we've already closed self.__buf.
|
||||
self.__buf.write(data)
|
||||
if self.__is_fp_closed():
|
||||
self._close()
|
||||
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
# SPDX-FileCopyrightText: 2015 Eric Larson
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import calendar
|
||||
import time
|
||||
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
# SPDX-FileCopyrightText: 2015 Eric Larson
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
@@ -17,24 +21,18 @@ def _b64_decode_str(s):
|
||||
return _b64_decode_bytes(s).decode("utf8")
|
||||
|
||||
|
||||
class Serializer(object):
|
||||
_default_body_read = object()
|
||||
|
||||
|
||||
class Serializer(object):
|
||||
def dumps(self, request, response, body=None):
|
||||
response_headers = CaseInsensitiveDict(response.headers)
|
||||
|
||||
if body is None:
|
||||
# When a body isn't passed in, we'll read the response. We
|
||||
# also update the response with a new file handler to be
|
||||
# sure it acts as though it was never read.
|
||||
body = response.read(decode_content=False)
|
||||
|
||||
# NOTE: 99% sure this is dead code. I'm only leaving it
|
||||
# here b/c I don't have a test yet to prove
|
||||
# it. Basically, before using
|
||||
# `cachecontrol.filewrapper.CallbackFileWrapper`,
|
||||
# this made an effort to reset the file handle. The
|
||||
# `CallbackFileWrapper` short circuits this code by
|
||||
# setting the body as the content is consumed, the
|
||||
# result being a `body` argument is *always* passed
|
||||
# into cache_response, and in turn,
|
||||
# `Serializer.dump`.
|
||||
response._fp = io.BytesIO(body)
|
||||
|
||||
# NOTE: This is all a bit weird, but it's really important that on
|
||||
@@ -46,7 +44,7 @@ class Serializer(object):
|
||||
# enough to have msgpack know the difference.
|
||||
data = {
|
||||
u"response": {
|
||||
u"body": body,
|
||||
u"body": body, # Empty bytestring if body is stored separately
|
||||
u"headers": dict(
|
||||
(text_type(k), text_type(v)) for k, v in response.headers.items()
|
||||
),
|
||||
@@ -71,7 +69,7 @@ class Serializer(object):
|
||||
|
||||
return b",".join([b"cc=4", msgpack.dumps(data, use_bin_type=True)])
|
||||
|
||||
def loads(self, request, data):
|
||||
def loads(self, request, data, body_file=None):
|
||||
# Short circuit if we've been given an empty set of data
|
||||
if not data:
|
||||
return
|
||||
@@ -94,14 +92,14 @@ class Serializer(object):
|
||||
|
||||
# Dispatch to the actual load method for the given version
|
||||
try:
|
||||
return getattr(self, "_loads_v{}".format(ver))(request, data)
|
||||
return getattr(self, "_loads_v{}".format(ver))(request, data, body_file)
|
||||
|
||||
except AttributeError:
|
||||
# This is a version we don't have a loads function for, so we'll
|
||||
# just treat it as a miss and return None
|
||||
return
|
||||
|
||||
def prepare_response(self, request, cached):
|
||||
def prepare_response(self, request, cached, body_file=None):
|
||||
"""Verify our vary headers match and construct a real urllib3
|
||||
HTTPResponse object.
|
||||
"""
|
||||
@@ -127,7 +125,10 @@ class Serializer(object):
|
||||
cached["response"]["headers"] = headers
|
||||
|
||||
try:
|
||||
body = io.BytesIO(body_raw)
|
||||
if body_file is None:
|
||||
body = io.BytesIO(body_raw)
|
||||
else:
|
||||
body = body_file
|
||||
except TypeError:
|
||||
# This can happen if cachecontrol serialized to v1 format (pickle)
|
||||
# using Python 2. A Python 2 str(byte string) will be unpickled as
|
||||
@@ -139,21 +140,22 @@ class Serializer(object):
|
||||
|
||||
return HTTPResponse(body=body, preload_content=False, **cached["response"])
|
||||
|
||||
def _loads_v0(self, request, data):
|
||||
def _loads_v0(self, request, data, body_file=None):
|
||||
# The original legacy cache data. This doesn't contain enough
|
||||
# information to construct everything we need, so we'll treat this as
|
||||
# a miss.
|
||||
return
|
||||
|
||||
def _loads_v1(self, request, data):
|
||||
def _loads_v1(self, request, data, body_file=None):
|
||||
try:
|
||||
cached = pickle.loads(data)
|
||||
except ValueError:
|
||||
return
|
||||
|
||||
return self.prepare_response(request, cached)
|
||||
return self.prepare_response(request, cached, body_file)
|
||||
|
||||
def _loads_v2(self, request, data):
|
||||
def _loads_v2(self, request, data, body_file=None):
|
||||
assert body_file is None
|
||||
try:
|
||||
cached = json.loads(zlib.decompress(data).decode("utf8"))
|
||||
except (ValueError, zlib.error):
|
||||
@@ -171,18 +173,18 @@ class Serializer(object):
|
||||
for k, v in cached["vary"].items()
|
||||
)
|
||||
|
||||
return self.prepare_response(request, cached)
|
||||
return self.prepare_response(request, cached, body_file)
|
||||
|
||||
def _loads_v3(self, request, data):
|
||||
def _loads_v3(self, request, data, body_file):
|
||||
# Due to Python 2 encoding issues, it's impossible to know for sure
|
||||
# exactly how to load v3 entries, thus we'll treat these as a miss so
|
||||
# that they get rewritten out as v4 entries.
|
||||
return
|
||||
|
||||
def _loads_v4(self, request, data):
|
||||
def _loads_v4(self, request, data, body_file=None):
|
||||
try:
|
||||
cached = msgpack.loads(data, raw=False)
|
||||
except ValueError:
|
||||
return
|
||||
|
||||
return self.prepare_response(request, cached)
|
||||
return self.prepare_response(request, cached, body_file)
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
# SPDX-FileCopyrightText: 2015 Eric Larson
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from .adapter import CacheControlAdapter
|
||||
from .cache import DictCache
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from .core import contents, where
|
||||
|
||||
__version__ = "2021.05.30"
|
||||
__all__ = ["contents", "where"]
|
||||
__version__ = "2022.12.07"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,34 +1,20 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
certifi.py
|
||||
~~~~~~~~~~
|
||||
|
||||
This module returns the installation location of cacert.pem or its contents.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
class _PipPatchedCertificate(Exception):
|
||||
pass
|
||||
if sys.version_info >= (3, 11):
|
||||
|
||||
|
||||
try:
|
||||
# Return a certificate file on disk for a standalone pip zipapp running in
|
||||
# an isolated build environment to use. Passing --cert to the standalone
|
||||
# pip does not work since requests calls where() unconditionally on import.
|
||||
_PIP_STANDALONE_CERT = os.environ.get("_PIP_STANDALONE_CERT")
|
||||
if _PIP_STANDALONE_CERT:
|
||||
def where():
|
||||
return _PIP_STANDALONE_CERT
|
||||
raise _PipPatchedCertificate()
|
||||
|
||||
from importlib.resources import path as get_path, read_text
|
||||
from importlib.resources import as_file, files
|
||||
|
||||
_CACERT_CTX = None
|
||||
_CACERT_PATH = None
|
||||
|
||||
def where():
|
||||
def where() -> str:
|
||||
# This is slightly terrible, but we want to delay extracting the file
|
||||
# in cases where we're inside of a zipimport situation until someone
|
||||
# actually calls where(), but we don't want to re-extract the file
|
||||
@@ -47,30 +33,76 @@ try:
|
||||
# We also have to hold onto the actual context manager, because
|
||||
# it will do the cleanup whenever it gets garbage collected, so
|
||||
# we will also store that at the global level as well.
|
||||
_CACERT_CTX = as_file(files("pip._vendor.certifi").joinpath("cacert.pem"))
|
||||
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
||||
|
||||
return _CACERT_PATH
|
||||
|
||||
def contents() -> str:
|
||||
return files("pip._vendor.certifi").joinpath("cacert.pem").read_text(encoding="ascii")
|
||||
|
||||
elif sys.version_info >= (3, 7):
|
||||
|
||||
from importlib.resources import path as get_path, read_text
|
||||
|
||||
_CACERT_CTX = None
|
||||
_CACERT_PATH = None
|
||||
|
||||
def where() -> str:
|
||||
# This is slightly terrible, but we want to delay extracting the
|
||||
# file in cases where we're inside of a zipimport situation until
|
||||
# someone actually calls where(), but we don't want to re-extract
|
||||
# the file on every call of where(), so we'll do it once then store
|
||||
# it in a global variable.
|
||||
global _CACERT_CTX
|
||||
global _CACERT_PATH
|
||||
if _CACERT_PATH is None:
|
||||
# This is slightly janky, the importlib.resources API wants you
|
||||
# to manage the cleanup of this file, so it doesn't actually
|
||||
# return a path, it returns a context manager that will give
|
||||
# you the path when you enter it and will do any cleanup when
|
||||
# you leave it. In the common case of not needing a temporary
|
||||
# file, it will just return the file system location and the
|
||||
# __exit__() is a no-op.
|
||||
#
|
||||
# We also have to hold onto the actual context manager, because
|
||||
# it will do the cleanup whenever it gets garbage collected, so
|
||||
# we will also store that at the global level as well.
|
||||
_CACERT_CTX = get_path("pip._vendor.certifi", "cacert.pem")
|
||||
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
||||
|
||||
return _CACERT_PATH
|
||||
|
||||
except _PipPatchedCertificate:
|
||||
pass
|
||||
def contents() -> str:
|
||||
return read_text("pip._vendor.certifi", "cacert.pem", encoding="ascii")
|
||||
|
||||
else:
|
||||
import os
|
||||
import types
|
||||
from typing import Union
|
||||
|
||||
Package = Union[types.ModuleType, str]
|
||||
Resource = Union[str, "os.PathLike"]
|
||||
|
||||
except ImportError:
|
||||
# This fallback will work for Python versions prior to 3.7 that lack the
|
||||
# importlib.resources module but relies on the existing `where` function
|
||||
# so won't address issues with environments like PyOxidizer that don't set
|
||||
# __file__ on modules.
|
||||
def read_text(_module, _path, encoding="ascii"):
|
||||
with open(where(), "r", encoding=encoding) as data:
|
||||
def read_text(
|
||||
package: Package,
|
||||
resource: Resource,
|
||||
encoding: str = 'utf-8',
|
||||
errors: str = 'strict'
|
||||
) -> str:
|
||||
with open(where(), encoding=encoding) as data:
|
||||
return data.read()
|
||||
|
||||
# If we don't have importlib.resources, then we will just do the old logic
|
||||
# of assuming we're on the filesystem and munge the path directly.
|
||||
def where():
|
||||
def where() -> str:
|
||||
f = os.path.dirname(__file__)
|
||||
|
||||
return os.path.join(f, "cacert.pem")
|
||||
|
||||
|
||||
def contents():
|
||||
return read_text("certifi", "cacert.pem", encoding="ascii")
|
||||
def contents() -> str:
|
||||
return read_text("pip._vendor.certifi", "cacert.pem", encoding="ascii")
|
||||
|
||||
@@ -15,69 +15,101 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import List, Union
|
||||
|
||||
from .universaldetector import UniversalDetector
|
||||
from .charsetgroupprober import CharSetGroupProber
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import InputState
|
||||
from .version import __version__, VERSION
|
||||
from .resultdict import ResultDict
|
||||
from .universaldetector import UniversalDetector
|
||||
from .version import VERSION, __version__
|
||||
|
||||
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
|
||||
|
||||
|
||||
__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']
|
||||
|
||||
|
||||
def detect(byte_str):
|
||||
def detect(
|
||||
byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
|
||||
) -> ResultDict:
|
||||
"""
|
||||
Detect the encoding of the given byte string.
|
||||
|
||||
:param byte_str: The byte sequence to examine.
|
||||
:type byte_str: ``bytes`` or ``bytearray``
|
||||
:param should_rename_legacy: Should we rename legacy encodings
|
||||
to their more modern equivalents?
|
||||
:type should_rename_legacy: ``bool``
|
||||
"""
|
||||
if not isinstance(byte_str, bytearray):
|
||||
if not isinstance(byte_str, bytes):
|
||||
raise TypeError('Expected object of type bytes or bytearray, got: '
|
||||
'{}'.format(type(byte_str)))
|
||||
else:
|
||||
byte_str = bytearray(byte_str)
|
||||
detector = UniversalDetector()
|
||||
raise TypeError(
|
||||
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
|
||||
)
|
||||
byte_str = bytearray(byte_str)
|
||||
detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
|
||||
detector.feed(byte_str)
|
||||
return detector.close()
|
||||
|
||||
|
||||
def detect_all(byte_str):
|
||||
def detect_all(
|
||||
byte_str: Union[bytes, bytearray],
|
||||
ignore_threshold: bool = False,
|
||||
should_rename_legacy: bool = False,
|
||||
) -> List[ResultDict]:
|
||||
"""
|
||||
Detect all the possible encodings of the given byte string.
|
||||
|
||||
:param byte_str: The byte sequence to examine.
|
||||
:type byte_str: ``bytes`` or ``bytearray``
|
||||
:param byte_str: The byte sequence to examine.
|
||||
:type byte_str: ``bytes`` or ``bytearray``
|
||||
:param ignore_threshold: Include encodings that are below
|
||||
``UniversalDetector.MINIMUM_THRESHOLD``
|
||||
in results.
|
||||
:type ignore_threshold: ``bool``
|
||||
:param should_rename_legacy: Should we rename legacy encodings
|
||||
to their more modern equivalents?
|
||||
:type should_rename_legacy: ``bool``
|
||||
"""
|
||||
if not isinstance(byte_str, bytearray):
|
||||
if not isinstance(byte_str, bytes):
|
||||
raise TypeError('Expected object of type bytes or bytearray, got: '
|
||||
'{}'.format(type(byte_str)))
|
||||
else:
|
||||
byte_str = bytearray(byte_str)
|
||||
raise TypeError(
|
||||
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
|
||||
)
|
||||
byte_str = bytearray(byte_str)
|
||||
|
||||
detector = UniversalDetector()
|
||||
detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
|
||||
detector.feed(byte_str)
|
||||
detector.close()
|
||||
|
||||
if detector._input_state == InputState.HIGH_BYTE:
|
||||
results = []
|
||||
for prober in detector._charset_probers:
|
||||
if prober.get_confidence() > detector.MINIMUM_THRESHOLD:
|
||||
charset_name = prober.charset_name
|
||||
lower_charset_name = prober.charset_name.lower()
|
||||
if detector.input_state == InputState.HIGH_BYTE:
|
||||
results: List[ResultDict] = []
|
||||
probers: List[CharSetProber] = []
|
||||
for prober in detector.charset_probers:
|
||||
if isinstance(prober, CharSetGroupProber):
|
||||
probers.extend(p for p in prober.probers)
|
||||
else:
|
||||
probers.append(prober)
|
||||
for prober in probers:
|
||||
if ignore_threshold or prober.get_confidence() > detector.MINIMUM_THRESHOLD:
|
||||
charset_name = prober.charset_name or ""
|
||||
lower_charset_name = charset_name.lower()
|
||||
# Use Windows encoding name instead of ISO-8859 if we saw any
|
||||
# extra Windows-specific bytes
|
||||
if lower_charset_name.startswith('iso-8859'):
|
||||
if detector._has_win_bytes:
|
||||
charset_name = detector.ISO_WIN_MAP.get(lower_charset_name,
|
||||
charset_name)
|
||||
results.append({
|
||||
'encoding': charset_name,
|
||||
'confidence': prober.get_confidence(),
|
||||
'language': prober.language,
|
||||
})
|
||||
if lower_charset_name.startswith("iso-8859") and detector.has_win_bytes:
|
||||
charset_name = detector.ISO_WIN_MAP.get(
|
||||
lower_charset_name, charset_name
|
||||
)
|
||||
# Rename legacy encodings with superset encodings if asked
|
||||
if should_rename_legacy:
|
||||
charset_name = detector.LEGACY_MAP.get(
|
||||
charset_name.lower(), charset_name
|
||||
)
|
||||
results.append(
|
||||
{
|
||||
"encoding": charset_name,
|
||||
"confidence": prober.get_confidence(),
|
||||
"language": prober.language,
|
||||
}
|
||||
)
|
||||
if len(results) > 0:
|
||||
return sorted(results, key=lambda result: -result['confidence'])
|
||||
return sorted(results, key=lambda result: -result["confidence"])
|
||||
|
||||
return [detector.result]
|
||||
|
||||
@@ -42,9 +42,9 @@
|
||||
|
||||
BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75
|
||||
|
||||
#Char to FreqOrder table
|
||||
# Char to FreqOrder table
|
||||
BIG5_TABLE_SIZE = 5376
|
||||
|
||||
# fmt: off
|
||||
BIG5_CHAR_TO_FREQ_ORDER = (
|
||||
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
|
||||
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
|
||||
@@ -383,4 +383,4 @@ BIG5_CHAR_TO_FREQ_ORDER = (
|
||||
890,3669,3943,5791,1878,3798,3439,5792,2186,2358,3440,1652,5793,5794,5795, 941, # 5360
|
||||
2299, 208,3546,4161,2020, 330,4438,3944,2906,2499,3799,4439,4811,5796,5797,5798, # 5376
|
||||
)
|
||||
|
||||
# fmt: on
|
||||
|
||||
@@ -25,23 +25,23 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import Big5DistributionAnalysis
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .mbcssm import BIG5_SM_MODEL
|
||||
|
||||
|
||||
class Big5Prober(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
super(Big5Prober, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
|
||||
self.distribution_analyzer = Big5DistributionAnalysis()
|
||||
self.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "Big5"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return "Chinese"
|
||||
|
||||
@@ -25,40 +25,58 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .euctwfreq import (EUCTW_CHAR_TO_FREQ_ORDER, EUCTW_TABLE_SIZE,
|
||||
EUCTW_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from .euckrfreq import (EUCKR_CHAR_TO_FREQ_ORDER, EUCKR_TABLE_SIZE,
|
||||
EUCKR_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from .gb2312freq import (GB2312_CHAR_TO_FREQ_ORDER, GB2312_TABLE_SIZE,
|
||||
GB2312_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE,
|
||||
BIG5_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
|
||||
JIS_TYPICAL_DISTRIBUTION_RATIO)
|
||||
from typing import Tuple, Union
|
||||
|
||||
from .big5freq import (
|
||||
BIG5_CHAR_TO_FREQ_ORDER,
|
||||
BIG5_TABLE_SIZE,
|
||||
BIG5_TYPICAL_DISTRIBUTION_RATIO,
|
||||
)
|
||||
from .euckrfreq import (
|
||||
EUCKR_CHAR_TO_FREQ_ORDER,
|
||||
EUCKR_TABLE_SIZE,
|
||||
EUCKR_TYPICAL_DISTRIBUTION_RATIO,
|
||||
)
|
||||
from .euctwfreq import (
|
||||
EUCTW_CHAR_TO_FREQ_ORDER,
|
||||
EUCTW_TABLE_SIZE,
|
||||
EUCTW_TYPICAL_DISTRIBUTION_RATIO,
|
||||
)
|
||||
from .gb2312freq import (
|
||||
GB2312_CHAR_TO_FREQ_ORDER,
|
||||
GB2312_TABLE_SIZE,
|
||||
GB2312_TYPICAL_DISTRIBUTION_RATIO,
|
||||
)
|
||||
from .jisfreq import (
|
||||
JIS_CHAR_TO_FREQ_ORDER,
|
||||
JIS_TABLE_SIZE,
|
||||
JIS_TYPICAL_DISTRIBUTION_RATIO,
|
||||
)
|
||||
from .johabfreq import JOHAB_TO_EUCKR_ORDER_TABLE
|
||||
|
||||
|
||||
class CharDistributionAnalysis(object):
|
||||
class CharDistributionAnalysis:
|
||||
ENOUGH_DATA_THRESHOLD = 1024
|
||||
SURE_YES = 0.99
|
||||
SURE_NO = 0.01
|
||||
MINIMUM_DATA_THRESHOLD = 3
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
# Mapping table to get frequency order from char order (get from
|
||||
# GetOrder())
|
||||
self._char_to_freq_order = None
|
||||
self._table_size = None # Size of above table
|
||||
self._char_to_freq_order: Tuple[int, ...] = tuple()
|
||||
self._table_size = 0 # Size of above table
|
||||
# This is a constant value which varies from language to language,
|
||||
# used in calculating confidence. See
|
||||
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
|
||||
# for further detail.
|
||||
self.typical_distribution_ratio = None
|
||||
self._done = None
|
||||
self._total_chars = None
|
||||
self._freq_chars = None
|
||||
self.typical_distribution_ratio = 0.0
|
||||
self._done = False
|
||||
self._total_chars = 0
|
||||
self._freq_chars = 0
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
"""reset analyser, clear any state"""
|
||||
# If this flag is set to True, detection is done and conclusion has
|
||||
# been made
|
||||
@@ -67,7 +85,7 @@ class CharDistributionAnalysis(object):
|
||||
# The number of characters whose frequency order is less than 512
|
||||
self._freq_chars = 0
|
||||
|
||||
def feed(self, char, char_len):
|
||||
def feed(self, char: Union[bytes, bytearray], char_len: int) -> None:
|
||||
"""feed a character with known length"""
|
||||
if char_len == 2:
|
||||
# we only care about 2-bytes character in our distribution analysis
|
||||
@@ -81,7 +99,7 @@ class CharDistributionAnalysis(object):
|
||||
if 512 > self._char_to_freq_order[order]:
|
||||
self._freq_chars += 1
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
"""return confidence based on existing data"""
|
||||
# if we didn't receive any character in our consideration range,
|
||||
# return negative answer
|
||||
@@ -89,20 +107,21 @@ class CharDistributionAnalysis(object):
|
||||
return self.SURE_NO
|
||||
|
||||
if self._total_chars != self._freq_chars:
|
||||
r = (self._freq_chars / ((self._total_chars - self._freq_chars)
|
||||
* self.typical_distribution_ratio))
|
||||
r = self._freq_chars / (
|
||||
(self._total_chars - self._freq_chars) * self.typical_distribution_ratio
|
||||
)
|
||||
if r < self.SURE_YES:
|
||||
return r
|
||||
|
||||
# normalize confidence (we don't want to be 100% sure)
|
||||
return self.SURE_YES
|
||||
|
||||
def got_enough_data(self):
|
||||
def got_enough_data(self) -> bool:
|
||||
# It is not necessary to receive all data to draw conclusion.
|
||||
# For charset detection, certain amount of data is enough
|
||||
return self._total_chars > self.ENOUGH_DATA_THRESHOLD
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, _: Union[bytes, bytearray]) -> int:
|
||||
# We do not handle characters based on the original encoding string,
|
||||
# but convert this encoding string to a number, here called order.
|
||||
# This allows multiple encodings of a language to share one frequency
|
||||
@@ -111,13 +130,13 @@ class CharDistributionAnalysis(object):
|
||||
|
||||
|
||||
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
super(EUCTWDistributionAnalysis, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
|
||||
self._table_size = EUCTW_TABLE_SIZE
|
||||
self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||
# for euc-TW encoding, we are interested
|
||||
# first byte range: 0xc4 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
@@ -125,18 +144,17 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
||||
first_char = byte_str[0]
|
||||
if first_char >= 0xC4:
|
||||
return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
|
||||
else:
|
||||
return -1
|
||||
return -1
|
||||
|
||||
|
||||
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
super(EUCKRDistributionAnalysis, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
|
||||
self._table_size = EUCKR_TABLE_SIZE
|
||||
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||
# for euc-KR encoding, we are interested
|
||||
# first byte range: 0xb0 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
@@ -144,18 +162,32 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
||||
first_char = byte_str[0]
|
||||
if first_char >= 0xB0:
|
||||
return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
|
||||
else:
|
||||
return -1
|
||||
return -1
|
||||
|
||||
|
||||
class JOHABDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
|
||||
self._table_size = EUCKR_TABLE_SIZE
|
||||
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||
first_char = byte_str[0]
|
||||
if 0x88 <= first_char < 0xD4:
|
||||
code = first_char * 256 + byte_str[1]
|
||||
return JOHAB_TO_EUCKR_ORDER_TABLE.get(code, -1)
|
||||
return -1
|
||||
|
||||
|
||||
class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
super(GB2312DistributionAnalysis, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
|
||||
self._table_size = GB2312_TABLE_SIZE
|
||||
self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||
# for GB2312 encoding, we are interested
|
||||
# first byte range: 0xb0 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
@@ -163,18 +195,17 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
||||
first_char, second_char = byte_str[0], byte_str[1]
|
||||
if (first_char >= 0xB0) and (second_char >= 0xA1):
|
||||
return 94 * (first_char - 0xB0) + second_char - 0xA1
|
||||
else:
|
||||
return -1
|
||||
return -1
|
||||
|
||||
|
||||
class Big5DistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
super(Big5DistributionAnalysis, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
|
||||
self._table_size = BIG5_TABLE_SIZE
|
||||
self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||
# for big5 encoding, we are interested
|
||||
# first byte range: 0xa4 -- 0xfe
|
||||
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
||||
@@ -183,28 +214,26 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
|
||||
if first_char >= 0xA4:
|
||||
if second_char >= 0xA1:
|
||||
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
|
||||
else:
|
||||
return 157 * (first_char - 0xA4) + second_char - 0x40
|
||||
else:
|
||||
return -1
|
||||
return 157 * (first_char - 0xA4) + second_char - 0x40
|
||||
return -1
|
||||
|
||||
|
||||
class SJISDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
super(SJISDistributionAnalysis, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
|
||||
self._table_size = JIS_TABLE_SIZE
|
||||
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||
# for sjis encoding, we are interested
|
||||
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
||||
# second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
|
||||
# no validation needed here. State machine has done that
|
||||
first_char, second_char = byte_str[0], byte_str[1]
|
||||
if (first_char >= 0x81) and (first_char <= 0x9F):
|
||||
if 0x81 <= first_char <= 0x9F:
|
||||
order = 188 * (first_char - 0x81)
|
||||
elif (first_char >= 0xE0) and (first_char <= 0xEF):
|
||||
elif 0xE0 <= first_char <= 0xEF:
|
||||
order = 188 * (first_char - 0xE0 + 31)
|
||||
else:
|
||||
return -1
|
||||
@@ -215,19 +244,18 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
|
||||
|
||||
|
||||
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
super(EUCJPDistributionAnalysis, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
|
||||
self._table_size = JIS_TABLE_SIZE
|
||||
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||
# for euc-JP encoding, we are interested
|
||||
# first byte range: 0xa0 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
# no validation needed here. State machine has done that
|
||||
char = byte_str[0]
|
||||
if char >= 0xA0:
|
||||
return 94 * (char - 0xA1) + byte_str[1] - 0xa1
|
||||
else:
|
||||
return -1
|
||||
return 94 * (char - 0xA1) + byte_str[1] - 0xA1
|
||||
return -1
|
||||
|
||||
@@ -25,29 +25,30 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .enums import ProbingState
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import LanguageFilter, ProbingState
|
||||
|
||||
|
||||
class CharSetGroupProber(CharSetProber):
|
||||
def __init__(self, lang_filter=None):
|
||||
super(CharSetGroupProber, self).__init__(lang_filter=lang_filter)
|
||||
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||
super().__init__(lang_filter=lang_filter)
|
||||
self._active_num = 0
|
||||
self.probers = []
|
||||
self._best_guess_prober = None
|
||||
self.probers: List[CharSetProber] = []
|
||||
self._best_guess_prober: Optional[CharSetProber] = None
|
||||
|
||||
def reset(self):
|
||||
super(CharSetGroupProber, self).reset()
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
self._active_num = 0
|
||||
for prober in self.probers:
|
||||
if prober:
|
||||
prober.reset()
|
||||
prober.active = True
|
||||
self._active_num += 1
|
||||
prober.reset()
|
||||
prober.active = True
|
||||
self._active_num += 1
|
||||
self._best_guess_prober = None
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> Optional[str]:
|
||||
if not self._best_guess_prober:
|
||||
self.get_confidence()
|
||||
if not self._best_guess_prober:
|
||||
@@ -55,17 +56,15 @@ class CharSetGroupProber(CharSetProber):
|
||||
return self._best_guess_prober.charset_name
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> Optional[str]:
|
||||
if not self._best_guess_prober:
|
||||
self.get_confidence()
|
||||
if not self._best_guess_prober:
|
||||
return None
|
||||
return self._best_guess_prober.language
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
for prober in self.probers:
|
||||
if not prober:
|
||||
continue
|
||||
if not prober.active:
|
||||
continue
|
||||
state = prober.feed(byte_str)
|
||||
@@ -75,7 +74,7 @@ class CharSetGroupProber(CharSetProber):
|
||||
self._best_guess_prober = prober
|
||||
self._state = ProbingState.FOUND_IT
|
||||
return self.state
|
||||
elif state == ProbingState.NOT_ME:
|
||||
if state == ProbingState.NOT_ME:
|
||||
prober.active = False
|
||||
self._active_num -= 1
|
||||
if self._active_num <= 0:
|
||||
@@ -83,22 +82,22 @@ class CharSetGroupProber(CharSetProber):
|
||||
return self.state
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
state = self.state
|
||||
if state == ProbingState.FOUND_IT:
|
||||
return 0.99
|
||||
elif state == ProbingState.NOT_ME:
|
||||
if state == ProbingState.NOT_ME:
|
||||
return 0.01
|
||||
best_conf = 0.0
|
||||
self._best_guess_prober = None
|
||||
for prober in self.probers:
|
||||
if not prober:
|
||||
continue
|
||||
if not prober.active:
|
||||
self.logger.debug('%s not active', prober.charset_name)
|
||||
self.logger.debug("%s not active", prober.charset_name)
|
||||
continue
|
||||
conf = prober.get_confidence()
|
||||
self.logger.debug('%s %s confidence = %s', prober.charset_name, prober.language, conf)
|
||||
self.logger.debug(
|
||||
"%s %s confidence = %s", prober.charset_name, prober.language, conf
|
||||
)
|
||||
if best_conf < conf:
|
||||
best_conf = conf
|
||||
self._best_guess_prober = prober
|
||||
|
||||
@@ -28,54 +28,62 @@
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional, Union
|
||||
|
||||
from .enums import ProbingState
|
||||
from .enums import LanguageFilter, ProbingState
|
||||
|
||||
INTERNATIONAL_WORDS_PATTERN = re.compile(
|
||||
b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
|
||||
)
|
||||
|
||||
|
||||
class CharSetProber(object):
|
||||
class CharSetProber:
|
||||
|
||||
SHORTCUT_THRESHOLD = 0.95
|
||||
|
||||
def __init__(self, lang_filter=None):
|
||||
self._state = None
|
||||
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||
self._state = ProbingState.DETECTING
|
||||
self.active = True
|
||||
self.lang_filter = lang_filter
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
self._state = ProbingState.DETECTING
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> Optional[str]:
|
||||
return None
|
||||
|
||||
def feed(self, buf):
|
||||
pass
|
||||
@property
|
||||
def language(self) -> Optional[str]:
|
||||
raise NotImplementedError
|
||||
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def state(self):
|
||||
def state(self) -> ProbingState:
|
||||
return self._state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
return 0.0
|
||||
|
||||
@staticmethod
|
||||
def filter_high_byte_only(buf):
|
||||
buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
|
||||
def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
|
||||
buf = re.sub(b"([\x00-\x7F])+", b" ", buf)
|
||||
return buf
|
||||
|
||||
@staticmethod
|
||||
def filter_international_words(buf):
|
||||
def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
|
||||
"""
|
||||
We define three types of bytes:
|
||||
alphabet: english alphabets [a-zA-Z]
|
||||
international: international characters [\x80-\xFF]
|
||||
marker: everything else [^a-zA-Z\x80-\xFF]
|
||||
|
||||
The input buffer can be thought to contain a series of words delimited
|
||||
by markers. This function works to filter all words that contain at
|
||||
least one international character. All contiguous sequences of markers
|
||||
are replaced by a single space ascii character.
|
||||
|
||||
This filter applies to all scripts which do not use English characters.
|
||||
"""
|
||||
filtered = bytearray()
|
||||
@@ -83,8 +91,7 @@ class CharSetProber(object):
|
||||
# This regex expression filters out only words that have at-least one
|
||||
# international character. The word may include one marker character at
|
||||
# the end.
|
||||
words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
|
||||
buf)
|
||||
words = INTERNATIONAL_WORDS_PATTERN.findall(buf)
|
||||
|
||||
for word in words:
|
||||
filtered.extend(word[:-1])
|
||||
@@ -94,20 +101,17 @@ class CharSetProber(object):
|
||||
# similarly across all languages and may thus have similar
|
||||
# frequencies).
|
||||
last_char = word[-1:]
|
||||
if not last_char.isalpha() and last_char < b'\x80':
|
||||
last_char = b' '
|
||||
if not last_char.isalpha() and last_char < b"\x80":
|
||||
last_char = b" "
|
||||
filtered.extend(last_char)
|
||||
|
||||
return filtered
|
||||
|
||||
@staticmethod
|
||||
def filter_with_english_letters(buf):
|
||||
def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes:
|
||||
"""
|
||||
Returns a copy of ``buf`` that retains only the sequences of English
|
||||
alphabet and high byte characters that are not between <> characters.
|
||||
Also retains English alphabet and high byte characters immediately
|
||||
before occurrences of >.
|
||||
|
||||
This filter can be applied to all scripts which contain both English
|
||||
characters and extended ASCII characters, but is currently only used by
|
||||
``Latin1Prober``.
|
||||
@@ -115,26 +119,24 @@ class CharSetProber(object):
|
||||
filtered = bytearray()
|
||||
in_tag = False
|
||||
prev = 0
|
||||
buf = memoryview(buf).cast("c")
|
||||
|
||||
for curr in range(len(buf)):
|
||||
# Slice here to get bytes instead of an int with Python 3
|
||||
buf_char = buf[curr:curr + 1]
|
||||
# Check if we're coming out of or entering an HTML tag
|
||||
if buf_char == b'>':
|
||||
for curr, buf_char in enumerate(buf):
|
||||
# Check if we're coming out of or entering an XML tag
|
||||
|
||||
# https://github.com/python/typeshed/issues/8182
|
||||
if buf_char == b">": # type: ignore[comparison-overlap]
|
||||
prev = curr + 1
|
||||
in_tag = False
|
||||
elif buf_char == b'<':
|
||||
in_tag = True
|
||||
|
||||
# If current character is not extended-ASCII and not alphabetic...
|
||||
if buf_char < b'\x80' and not buf_char.isalpha():
|
||||
# ...and we're not in a tag
|
||||
# https://github.com/python/typeshed/issues/8182
|
||||
elif buf_char == b"<": # type: ignore[comparison-overlap]
|
||||
if curr > prev and not in_tag:
|
||||
# Keep everything after last non-extended-ASCII,
|
||||
# non-alphabetic character
|
||||
filtered.extend(buf[prev:curr])
|
||||
# Output a space to delimit stretch we kept
|
||||
filtered.extend(b' ')
|
||||
prev = curr + 1
|
||||
filtered.extend(b" ")
|
||||
in_tag = True
|
||||
|
||||
# If we're not in a tag...
|
||||
if not in_tag:
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
|
||||
|
||||
@@ -12,17 +12,21 @@ If no paths are provided, it takes its input from stdin.
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, print_function, unicode_literals
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from typing import Iterable, List, Optional
|
||||
|
||||
from pip._vendor.chardet import __version__
|
||||
from pip._vendor.chardet.compat import PY2
|
||||
from pip._vendor.chardet.universaldetector import UniversalDetector
|
||||
from .. import __version__
|
||||
from ..universaldetector import UniversalDetector
|
||||
|
||||
|
||||
def description_of(lines, name='stdin'):
|
||||
def description_of(
|
||||
lines: Iterable[bytes],
|
||||
name: str = "stdin",
|
||||
minimal: bool = False,
|
||||
should_rename_legacy: bool = False,
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
Return a string describing the probable encoding of a file or
|
||||
list of strings.
|
||||
@@ -31,8 +35,11 @@ def description_of(lines, name='stdin'):
|
||||
:type lines: Iterable of bytes
|
||||
:param name: Name of file or collection of lines
|
||||
:type name: str
|
||||
:param should_rename_legacy: Should we rename legacy encodings to
|
||||
their more modern equivalents?
|
||||
:type should_rename_legacy: ``bool``
|
||||
"""
|
||||
u = UniversalDetector()
|
||||
u = UniversalDetector(should_rename_legacy=should_rename_legacy)
|
||||
for line in lines:
|
||||
line = bytearray(line)
|
||||
u.feed(line)
|
||||
@@ -41,16 +48,14 @@ def description_of(lines, name='stdin'):
|
||||
break
|
||||
u.close()
|
||||
result = u.result
|
||||
if PY2:
|
||||
name = name.decode(sys.getfilesystemencoding(), 'ignore')
|
||||
if result['encoding']:
|
||||
return '{}: {} with confidence {}'.format(name, result['encoding'],
|
||||
result['confidence'])
|
||||
else:
|
||||
return '{}: no result'.format(name)
|
||||
if minimal:
|
||||
return result["encoding"]
|
||||
if result["encoding"]:
|
||||
return f'{name}: {result["encoding"]} with confidence {result["confidence"]}'
|
||||
return f"{name}: no result"
|
||||
|
||||
|
||||
def main(argv=None):
|
||||
def main(argv: Optional[List[str]] = None) -> None:
|
||||
"""
|
||||
Handles command line arguments and gets things started.
|
||||
|
||||
@@ -60,25 +65,48 @@ def main(argv=None):
|
||||
"""
|
||||
# Get command line arguments
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Takes one or more file paths and reports their detected \
|
||||
encodings")
|
||||
parser.add_argument('input',
|
||||
help='File whose encoding we would like to determine. \
|
||||
(default: stdin)',
|
||||
type=argparse.FileType('rb'), nargs='*',
|
||||
default=[sys.stdin if PY2 else sys.stdin.buffer])
|
||||
parser.add_argument('--version', action='version',
|
||||
version='%(prog)s {}'.format(__version__))
|
||||
description=(
|
||||
"Takes one or more file paths and reports their detected encodings"
|
||||
)
|
||||
)
|
||||
parser.add_argument(
|
||||
"input",
|
||||
help="File whose encoding we would like to determine. (default: stdin)",
|
||||
type=argparse.FileType("rb"),
|
||||
nargs="*",
|
||||
default=[sys.stdin.buffer],
|
||||
)
|
||||
parser.add_argument(
|
||||
"--minimal",
|
||||
help="Print only the encoding to standard output",
|
||||
action="store_true",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-l",
|
||||
"--legacy",
|
||||
help="Rename legacy encodings to more modern ones.",
|
||||
action="store_true",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--version", action="version", version=f"%(prog)s {__version__}"
|
||||
)
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
for f in args.input:
|
||||
if f.isatty():
|
||||
print("You are running chardetect interactively. Press " +
|
||||
"CTRL-D twice at the start of a blank line to signal the " +
|
||||
"end of your input. If you want help, run chardetect " +
|
||||
"--help\n", file=sys.stderr)
|
||||
print(description_of(f, f.name))
|
||||
print(
|
||||
"You are running chardetect interactively. Press "
|
||||
"CTRL-D twice at the start of a blank line to signal the "
|
||||
"end of your input. If you want help, run chardetect "
|
||||
"--help\n",
|
||||
file=sys.stderr,
|
||||
)
|
||||
print(
|
||||
description_of(
|
||||
f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -27,10 +27,11 @@
|
||||
|
||||
import logging
|
||||
|
||||
from .codingstatemachinedict import CodingStateMachineDict
|
||||
from .enums import MachineState
|
||||
|
||||
|
||||
class CodingStateMachine(object):
|
||||
class CodingStateMachine:
|
||||
"""
|
||||
A state machine to verify a byte sequence for a particular encoding. For
|
||||
each byte the detector receives, it will feed that byte to every active
|
||||
@@ -52,37 +53,38 @@ class CodingStateMachine(object):
|
||||
negative answer for this encoding. Detector will exclude this
|
||||
encoding from consideration from here on.
|
||||
"""
|
||||
def __init__(self, sm):
|
||||
|
||||
def __init__(self, sm: CodingStateMachineDict) -> None:
|
||||
self._model = sm
|
||||
self._curr_byte_pos = 0
|
||||
self._curr_char_len = 0
|
||||
self._curr_state = None
|
||||
self._curr_state = MachineState.START
|
||||
self.active = True
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
self._curr_state = MachineState.START
|
||||
|
||||
def next_state(self, c):
|
||||
def next_state(self, c: int) -> int:
|
||||
# for each byte we get its class
|
||||
# if it is first byte, we also get byte length
|
||||
byte_class = self._model['class_table'][c]
|
||||
byte_class = self._model["class_table"][c]
|
||||
if self._curr_state == MachineState.START:
|
||||
self._curr_byte_pos = 0
|
||||
self._curr_char_len = self._model['char_len_table'][byte_class]
|
||||
self._curr_char_len = self._model["char_len_table"][byte_class]
|
||||
# from byte's class and state_table, we get its next state
|
||||
curr_state = (self._curr_state * self._model['class_factor']
|
||||
+ byte_class)
|
||||
self._curr_state = self._model['state_table'][curr_state]
|
||||
curr_state = self._curr_state * self._model["class_factor"] + byte_class
|
||||
self._curr_state = self._model["state_table"][curr_state]
|
||||
self._curr_byte_pos += 1
|
||||
return self._curr_state
|
||||
|
||||
def get_current_charlen(self):
|
||||
def get_current_charlen(self) -> int:
|
||||
return self._curr_char_len
|
||||
|
||||
def get_coding_state_machine(self):
|
||||
return self._model['name']
|
||||
def get_coding_state_machine(self) -> str:
|
||||
return self._model["name"]
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
return self._model['language']
|
||||
def language(self) -> str:
|
||||
return self._model["language"]
|
||||
|
||||
@@ -1,36 +0,0 @@
|
||||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# Contributor(s):
|
||||
# Dan Blanchard
|
||||
# Ian Cordasco
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
if sys.version_info < (3, 0):
|
||||
PY2 = True
|
||||
PY3 = False
|
||||
string_types = (str, unicode)
|
||||
text_type = unicode
|
||||
iteritems = dict.iteritems
|
||||
else:
|
||||
PY2 = False
|
||||
PY3 = True
|
||||
string_types = (bytes, str)
|
||||
text_type = str
|
||||
iteritems = dict.items
|
||||
@@ -32,8 +32,8 @@ from .mbcssm import CP949_SM_MODEL
|
||||
|
||||
|
||||
class CP949Prober(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
super(CP949Prober, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(CP949_SM_MODEL)
|
||||
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
|
||||
# not different.
|
||||
@@ -41,9 +41,9 @@ class CP949Prober(MultiByteCharSetProber):
|
||||
self.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "CP949"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return "Korean"
|
||||
|
||||
@@ -4,21 +4,26 @@ All of the Enums that are used throughout the chardet package.
|
||||
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
||||
"""
|
||||
|
||||
from enum import Enum, Flag
|
||||
|
||||
class InputState(object):
|
||||
|
||||
class InputState:
|
||||
"""
|
||||
This enum represents the different states a universal detector can be in.
|
||||
"""
|
||||
|
||||
PURE_ASCII = 0
|
||||
ESC_ASCII = 1
|
||||
HIGH_BYTE = 2
|
||||
|
||||
|
||||
class LanguageFilter(object):
|
||||
class LanguageFilter(Flag):
|
||||
"""
|
||||
This enum represents the different language filters we can apply to a
|
||||
``UniversalDetector``.
|
||||
"""
|
||||
|
||||
NONE = 0x00
|
||||
CHINESE_SIMPLIFIED = 0x01
|
||||
CHINESE_TRADITIONAL = 0x02
|
||||
JAPANESE = 0x04
|
||||
@@ -29,46 +34,50 @@ class LanguageFilter(object):
|
||||
CJK = CHINESE | JAPANESE | KOREAN
|
||||
|
||||
|
||||
class ProbingState(object):
|
||||
class ProbingState(Enum):
|
||||
"""
|
||||
This enum represents the different states a prober can be in.
|
||||
"""
|
||||
|
||||
DETECTING = 0
|
||||
FOUND_IT = 1
|
||||
NOT_ME = 2
|
||||
|
||||
|
||||
class MachineState(object):
|
||||
class MachineState:
|
||||
"""
|
||||
This enum represents the different states a state machine can be in.
|
||||
"""
|
||||
|
||||
START = 0
|
||||
ERROR = 1
|
||||
ITS_ME = 2
|
||||
|
||||
|
||||
class SequenceLikelihood(object):
|
||||
class SequenceLikelihood:
|
||||
"""
|
||||
This enum represents the likelihood of a character following the previous one.
|
||||
"""
|
||||
|
||||
NEGATIVE = 0
|
||||
UNLIKELY = 1
|
||||
LIKELY = 2
|
||||
POSITIVE = 3
|
||||
|
||||
@classmethod
|
||||
def get_num_categories(cls):
|
||||
def get_num_categories(cls) -> int:
|
||||
""":returns: The number of likelihood categories in the enum."""
|
||||
return 4
|
||||
|
||||
|
||||
class CharacterCategory(object):
|
||||
class CharacterCategory:
|
||||
"""
|
||||
This enum represents the different categories language models for
|
||||
``SingleByteCharsetProber`` put characters into.
|
||||
|
||||
Anything less than CONTROL is considered a letter.
|
||||
"""
|
||||
|
||||
UNDEFINED = 255
|
||||
LINE_BREAK = 254
|
||||
SYMBOL = 253
|
||||
|
||||
@@ -25,11 +25,17 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .enums import LanguageFilter, ProbingState, MachineState
|
||||
from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
|
||||
ISO2022KR_SM_MODEL)
|
||||
from .enums import LanguageFilter, MachineState, ProbingState
|
||||
from .escsm import (
|
||||
HZ_SM_MODEL,
|
||||
ISO2022CN_SM_MODEL,
|
||||
ISO2022JP_SM_MODEL,
|
||||
ISO2022KR_SM_MODEL,
|
||||
)
|
||||
|
||||
|
||||
class EscCharSetProber(CharSetProber):
|
||||
@@ -39,8 +45,8 @@ class EscCharSetProber(CharSetProber):
|
||||
identify these encodings.
|
||||
"""
|
||||
|
||||
def __init__(self, lang_filter=None):
|
||||
super(EscCharSetProber, self).__init__(lang_filter=lang_filter)
|
||||
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||
super().__init__(lang_filter=lang_filter)
|
||||
self.coding_sm = []
|
||||
if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
|
||||
self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
|
||||
@@ -49,17 +55,15 @@ class EscCharSetProber(CharSetProber):
|
||||
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
|
||||
if self.lang_filter & LanguageFilter.KOREAN:
|
||||
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
|
||||
self.active_sm_count = None
|
||||
self._detected_charset = None
|
||||
self._detected_language = None
|
||||
self._state = None
|
||||
self.active_sm_count = 0
|
||||
self._detected_charset: Optional[str] = None
|
||||
self._detected_language: Optional[str] = None
|
||||
self._state = ProbingState.DETECTING
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
super(EscCharSetProber, self).reset()
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
for coding_sm in self.coding_sm:
|
||||
if not coding_sm:
|
||||
continue
|
||||
coding_sm.active = True
|
||||
coding_sm.reset()
|
||||
self.active_sm_count = len(self.coding_sm)
|
||||
@@ -67,23 +71,20 @@ class EscCharSetProber(CharSetProber):
|
||||
self._detected_language = None
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> Optional[str]:
|
||||
return self._detected_charset
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> Optional[str]:
|
||||
return self._detected_language
|
||||
|
||||
def get_confidence(self):
|
||||
if self._detected_charset:
|
||||
return 0.99
|
||||
else:
|
||||
return 0.00
|
||||
def get_confidence(self) -> float:
|
||||
return 0.99 if self._detected_charset else 0.00
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
for c in byte_str:
|
||||
for coding_sm in self.coding_sm:
|
||||
if not coding_sm or not coding_sm.active:
|
||||
if not coding_sm.active:
|
||||
continue
|
||||
coding_state = coding_sm.next_state(c)
|
||||
if coding_state == MachineState.ERROR:
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
@@ -20,227 +20,242 @@
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .codingstatemachinedict import CodingStateMachineDict
|
||||
from .enums import MachineState
|
||||
|
||||
# fmt: off
|
||||
HZ_CLS = (
|
||||
1,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,0,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,0,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,4,0,5,2,0, # 78 - 7f
|
||||
1,1,1,1,1,1,1,1, # 80 - 87
|
||||
1,1,1,1,1,1,1,1, # 88 - 8f
|
||||
1,1,1,1,1,1,1,1, # 90 - 97
|
||||
1,1,1,1,1,1,1,1, # 98 - 9f
|
||||
1,1,1,1,1,1,1,1, # a0 - a7
|
||||
1,1,1,1,1,1,1,1, # a8 - af
|
||||
1,1,1,1,1,1,1,1, # b0 - b7
|
||||
1,1,1,1,1,1,1,1, # b8 - bf
|
||||
1,1,1,1,1,1,1,1, # c0 - c7
|
||||
1,1,1,1,1,1,1,1, # c8 - cf
|
||||
1,1,1,1,1,1,1,1, # d0 - d7
|
||||
1,1,1,1,1,1,1,1, # d8 - df
|
||||
1,1,1,1,1,1,1,1, # e0 - e7
|
||||
1,1,1,1,1,1,1,1, # e8 - ef
|
||||
1,1,1,1,1,1,1,1, # f0 - f7
|
||||
1,1,1,1,1,1,1,1, # f8 - ff
|
||||
1, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 08 - 0f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
|
||||
0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 28 - 2f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 40 - 47
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
|
||||
0, 0, 0, 4, 0, 5, 2, 0, # 78 - 7f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 80 - 87
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 88 - 8f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 90 - 97
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 98 - 9f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # a0 - a7
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # a8 - af
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # b0 - b7
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # b8 - bf
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # c0 - c7
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # c8 - cf
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # d0 - d7
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # d8 - df
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # e0 - e7
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # e8 - ef
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # f0 - f7
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # f8 - ff
|
||||
)
|
||||
|
||||
HZ_ST = (
|
||||
MachineState.START,MachineState.ERROR, 3,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START, 4,MachineState.ERROR,# 10-17
|
||||
5,MachineState.ERROR, 6,MachineState.ERROR, 5, 5, 4,MachineState.ERROR,# 18-1f
|
||||
4,MachineState.ERROR, 4, 4, 4,MachineState.ERROR, 4,MachineState.ERROR,# 20-27
|
||||
4,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 28-2f
|
||||
MachineState.START, MachineState.ERROR, 3, MachineState.START, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, # 00-07
|
||||
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 08-0f
|
||||
MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, 4, MachineState.ERROR, # 10-17
|
||||
5, MachineState.ERROR, 6, MachineState.ERROR, 5, 5, 4, MachineState.ERROR, # 18-1f
|
||||
4, MachineState.ERROR, 4, 4, 4, MachineState.ERROR, 4, MachineState.ERROR, # 20-27
|
||||
4, MachineState.ITS_ME, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 28-2f
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
||||
|
||||
HZ_SM_MODEL = {'class_table': HZ_CLS,
|
||||
'class_factor': 6,
|
||||
'state_table': HZ_ST,
|
||||
'char_len_table': HZ_CHAR_LEN_TABLE,
|
||||
'name': "HZ-GB-2312",
|
||||
'language': 'Chinese'}
|
||||
HZ_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": HZ_CLS,
|
||||
"class_factor": 6,
|
||||
"state_table": HZ_ST,
|
||||
"char_len_table": HZ_CHAR_LEN_TABLE,
|
||||
"name": "HZ-GB-2312",
|
||||
"language": "Chinese",
|
||||
}
|
||||
|
||||
# fmt: off
|
||||
ISO2022CN_CLS = (
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,3,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,4,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
2,2,2,2,2,2,2,2, # 80 - 87
|
||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
||||
2,2,2,2,2,2,2,2, # 90 - 97
|
||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
2, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 08 - 0f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
|
||||
0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27
|
||||
0, 3, 0, 0, 0, 0, 0, 0, # 28 - 2f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
|
||||
0, 0, 0, 4, 0, 0, 0, 0, # 40 - 47
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 80 - 87
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 88 - 8f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 90 - 97
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 98 - 9f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # f8 - ff
|
||||
)
|
||||
|
||||
ISO2022CN_ST = (
|
||||
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
|
||||
MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,# 18-1f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 20-27
|
||||
5, 6,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 28-2f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 30-37
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,# 38-3f
|
||||
MachineState.START, 3, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 00-07
|
||||
MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 08-0f
|
||||
MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 10-17
|
||||
MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 4, MachineState.ERROR, # 18-1f
|
||||
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 20-27
|
||||
5, 6, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 28-2f
|
||||
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 30-37
|
||||
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.START, # 38-3f
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||
|
||||
ISO2022CN_SM_MODEL = {'class_table': ISO2022CN_CLS,
|
||||
'class_factor': 9,
|
||||
'state_table': ISO2022CN_ST,
|
||||
'char_len_table': ISO2022CN_CHAR_LEN_TABLE,
|
||||
'name': "ISO-2022-CN",
|
||||
'language': 'Chinese'}
|
||||
ISO2022CN_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": ISO2022CN_CLS,
|
||||
"class_factor": 9,
|
||||
"state_table": ISO2022CN_ST,
|
||||
"char_len_table": ISO2022CN_CHAR_LEN_TABLE,
|
||||
"name": "ISO-2022-CN",
|
||||
"language": "Chinese",
|
||||
}
|
||||
|
||||
# fmt: off
|
||||
ISO2022JP_CLS = (
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,2,2, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,7,0,0,0, # 20 - 27
|
||||
3,0,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
6,0,4,0,8,0,0,0, # 40 - 47
|
||||
0,9,5,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
2,2,2,2,2,2,2,2, # 80 - 87
|
||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
||||
2,2,2,2,2,2,2,2, # 90 - 97
|
||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
2, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
|
||||
0, 0, 0, 0, 0, 0, 2, 2, # 08 - 0f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
|
||||
0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f
|
||||
0, 0, 0, 0, 7, 0, 0, 0, # 20 - 27
|
||||
3, 0, 0, 0, 0, 0, 0, 0, # 28 - 2f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
|
||||
6, 0, 4, 0, 8, 0, 0, 0, # 40 - 47
|
||||
0, 9, 5, 0, 0, 0, 0, 0, # 48 - 4f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 80 - 87
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 88 - 8f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 90 - 97
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 98 - 9f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # f8 - ff
|
||||
)
|
||||
|
||||
ISO2022JP_ST = (
|
||||
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07
|
||||
MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,# 18-1f
|
||||
MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 20-27
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 6,MachineState.ITS_ME,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,# 28-2f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,# 30-37
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 38-3f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.START,# 40-47
|
||||
MachineState.START, 3, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 00-07
|
||||
MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 08-0f
|
||||
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 10-17
|
||||
MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, # 18-1f
|
||||
MachineState.ERROR, 5, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 4, MachineState.ERROR, MachineState.ERROR, # 20-27
|
||||
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 6, MachineState.ITS_ME, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, # 28-2f
|
||||
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, # 30-37
|
||||
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 38-3f
|
||||
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR, MachineState.START, MachineState.START, # 40-47
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||
|
||||
ISO2022JP_SM_MODEL = {'class_table': ISO2022JP_CLS,
|
||||
'class_factor': 10,
|
||||
'state_table': ISO2022JP_ST,
|
||||
'char_len_table': ISO2022JP_CHAR_LEN_TABLE,
|
||||
'name': "ISO-2022-JP",
|
||||
'language': 'Japanese'}
|
||||
ISO2022JP_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": ISO2022JP_CLS,
|
||||
"class_factor": 10,
|
||||
"state_table": ISO2022JP_ST,
|
||||
"char_len_table": ISO2022JP_CHAR_LEN_TABLE,
|
||||
"name": "ISO-2022-JP",
|
||||
"language": "Japanese",
|
||||
}
|
||||
|
||||
# fmt: off
|
||||
ISO2022KR_CLS = (
|
||||
2,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,0,0,0,0,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,1,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,3,0,0,0, # 20 - 27
|
||||
0,4,0,0,0,0,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,5,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
2,2,2,2,2,2,2,2, # 80 - 87
|
||||
2,2,2,2,2,2,2,2, # 88 - 8f
|
||||
2,2,2,2,2,2,2,2, # 90 - 97
|
||||
2,2,2,2,2,2,2,2, # 98 - 9f
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,2, # f8 - ff
|
||||
2, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 08 - 0f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
|
||||
0, 0, 0, 1, 0, 0, 0, 0, # 18 - 1f
|
||||
0, 0, 0, 0, 3, 0, 0, 0, # 20 - 27
|
||||
0, 4, 0, 0, 0, 0, 0, 0, # 28 - 2f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
|
||||
0, 0, 0, 5, 0, 0, 0, 0, # 40 - 47
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 80 - 87
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 88 - 8f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 90 - 97
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 98 - 9f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # f8 - ff
|
||||
)
|
||||
|
||||
ISO2022KR_ST = (
|
||||
MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 10-17
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 18-1f
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 20-27
|
||||
MachineState.START, 3, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, # 00-07
|
||||
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, # 08-0f
|
||||
MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 4, MachineState.ERROR, MachineState.ERROR, # 10-17
|
||||
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 5, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 18-1f
|
||||
MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 20-27
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
||||
|
||||
ISO2022KR_SM_MODEL = {'class_table': ISO2022KR_CLS,
|
||||
'class_factor': 6,
|
||||
'state_table': ISO2022KR_ST,
|
||||
'char_len_table': ISO2022KR_CHAR_LEN_TABLE,
|
||||
'name': "ISO-2022-KR",
|
||||
'language': 'Korean'}
|
||||
|
||||
|
||||
ISO2022KR_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": ISO2022KR_CLS,
|
||||
"class_factor": 6,
|
||||
"state_table": ISO2022KR_ST,
|
||||
"char_len_table": ISO2022KR_CHAR_LEN_TABLE,
|
||||
"name": "ISO-2022-KR",
|
||||
"language": "Korean",
|
||||
}
|
||||
|
||||
@@ -25,68 +25,78 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .enums import ProbingState, MachineState
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from typing import Union
|
||||
|
||||
from .chardistribution import EUCJPDistributionAnalysis
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .enums import MachineState, ProbingState
|
||||
from .jpcntx import EUCJPContextAnalysis
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .mbcssm import EUCJP_SM_MODEL
|
||||
|
||||
|
||||
class EUCJPProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
super(EUCJPProber, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
|
||||
self.distribution_analyzer = EUCJPDistributionAnalysis()
|
||||
self.context_analyzer = EUCJPContextAnalysis()
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
super(EUCJPProber, self).reset()
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
self.context_analyzer.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "EUC-JP"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return "Japanese"
|
||||
|
||||
def feed(self, byte_str):
|
||||
for i in range(len(byte_str)):
|
||||
# PY3K: byte_str is a byte array, so byte_str[i] is an int, not a byte
|
||||
coding_state = self.coding_sm.next_state(byte_str[i])
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
assert self.coding_sm is not None
|
||||
assert self.distribution_analyzer is not None
|
||||
|
||||
for i, byte in enumerate(byte_str):
|
||||
# PY3K: byte_str is a byte array, so byte is an int, not a byte
|
||||
coding_state = self.coding_sm.next_state(byte)
|
||||
if coding_state == MachineState.ERROR:
|
||||
self.logger.debug('%s %s prober hit error at byte %s',
|
||||
self.charset_name, self.language, i)
|
||||
self.logger.debug(
|
||||
"%s %s prober hit error at byte %s",
|
||||
self.charset_name,
|
||||
self.language,
|
||||
i,
|
||||
)
|
||||
self._state = ProbingState.NOT_ME
|
||||
break
|
||||
elif coding_state == MachineState.ITS_ME:
|
||||
if coding_state == MachineState.ITS_ME:
|
||||
self._state = ProbingState.FOUND_IT
|
||||
break
|
||||
elif coding_state == MachineState.START:
|
||||
if coding_state == MachineState.START:
|
||||
char_len = self.coding_sm.get_current_charlen()
|
||||
if i == 0:
|
||||
self._last_char[1] = byte_str[0]
|
||||
self._last_char[1] = byte
|
||||
self.context_analyzer.feed(self._last_char, char_len)
|
||||
self.distribution_analyzer.feed(self._last_char, char_len)
|
||||
else:
|
||||
self.context_analyzer.feed(byte_str[i - 1:i + 1],
|
||||
char_len)
|
||||
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
|
||||
char_len)
|
||||
self.context_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
|
||||
self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
|
||||
|
||||
self._last_char[0] = byte_str[-1]
|
||||
|
||||
if self.state == ProbingState.DETECTING:
|
||||
if (self.context_analyzer.got_enough_data() and
|
||||
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
||||
if self.context_analyzer.got_enough_data() and (
|
||||
self.get_confidence() > self.SHORTCUT_THRESHOLD
|
||||
):
|
||||
self._state = ProbingState.FOUND_IT
|
||||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
assert self.distribution_analyzer is not None
|
||||
|
||||
context_conf = self.context_analyzer.get_confidence()
|
||||
distrib_conf = self.distribution_analyzer.get_confidence()
|
||||
return max(context_conf, distrib_conf)
|
||||
|
||||
@@ -43,6 +43,7 @@ EUCKR_TYPICAL_DISTRIBUTION_RATIO = 6.0
|
||||
EUCKR_TABLE_SIZE = 2352
|
||||
|
||||
# Char to FreqOrder table ,
|
||||
# fmt: off
|
||||
EUCKR_CHAR_TO_FREQ_ORDER = (
|
||||
13, 130, 120,1396, 481,1719,1720, 328, 609, 212,1721, 707, 400, 299,1722, 87,
|
||||
1397,1723, 104, 536,1117,1203,1724,1267, 685,1268, 508,1725,1726,1727,1728,1398,
|
||||
@@ -192,4 +193,4 @@ EUCKR_CHAR_TO_FREQ_ORDER = (
|
||||
2629,2630,2631, 924, 648, 863, 603,2632,2633, 934,1540, 864, 865,2634, 642,1042,
|
||||
670,1190,2635,2636,2637,2638, 168,2639, 652, 873, 542,1054,1541,2640,2641,2642, # 512, 256
|
||||
)
|
||||
|
||||
# fmt: on
|
||||
|
||||
@@ -25,23 +25,23 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import EUCKRDistributionAnalysis
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .mbcssm import EUCKR_SM_MODEL
|
||||
|
||||
|
||||
class EUCKRProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
super(EUCKRProber, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
|
||||
self.distribution_analyzer = EUCKRDistributionAnalysis()
|
||||
self.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "EUC-KR"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return "Korean"
|
||||
|
||||
@@ -43,345 +43,346 @@
|
||||
|
||||
EUCTW_TYPICAL_DISTRIBUTION_RATIO = 0.75
|
||||
|
||||
# Char to FreqOrder table ,
|
||||
# Char to FreqOrder table
|
||||
EUCTW_TABLE_SIZE = 5376
|
||||
|
||||
# fmt: off
|
||||
EUCTW_CHAR_TO_FREQ_ORDER = (
|
||||
1,1800,1506, 255,1431, 198, 9, 82, 6,7310, 177, 202,3615,1256,2808, 110, # 2742
|
||||
3735, 33,3241, 261, 76, 44,2113, 16,2931,2184,1176, 659,3868, 26,3404,2643, # 2758
|
||||
1198,3869,3313,4060, 410,2211, 302, 590, 361,1963, 8, 204, 58,4296,7311,1931, # 2774
|
||||
63,7312,7313, 317,1614, 75, 222, 159,4061,2412,1480,7314,3500,3068, 224,2809, # 2790
|
||||
3616, 3, 10,3870,1471, 29,2774,1135,2852,1939, 873, 130,3242,1123, 312,7315, # 2806
|
||||
4297,2051, 507, 252, 682,7316, 142,1914, 124, 206,2932, 34,3501,3173, 64, 604, # 2822
|
||||
7317,2494,1976,1977, 155,1990, 645, 641,1606,7318,3405, 337, 72, 406,7319, 80, # 2838
|
||||
630, 238,3174,1509, 263, 939,1092,2644, 756,1440,1094,3406, 449, 69,2969, 591, # 2854
|
||||
179,2095, 471, 115,2034,1843, 60, 50,2970, 134, 806,1868, 734,2035,3407, 180, # 2870
|
||||
995,1607, 156, 537,2893, 688,7320, 319,1305, 779,2144, 514,2374, 298,4298, 359, # 2886
|
||||
2495, 90,2707,1338, 663, 11, 906,1099,2545, 20,2436, 182, 532,1716,7321, 732, # 2902
|
||||
1376,4062,1311,1420,3175, 25,2312,1056, 113, 399, 382,1949, 242,3408,2467, 529, # 2918
|
||||
3243, 475,1447,3617,7322, 117, 21, 656, 810,1297,2295,2329,3502,7323, 126,4063, # 2934
|
||||
706, 456, 150, 613,4299, 71,1118,2036,4064, 145,3069, 85, 835, 486,2114,1246, # 2950
|
||||
1426, 428, 727,1285,1015, 800, 106, 623, 303,1281,7324,2127,2354, 347,3736, 221, # 2966
|
||||
3503,3110,7325,1955,1153,4065, 83, 296,1199,3070, 192, 624, 93,7326, 822,1897, # 2982
|
||||
2810,3111, 795,2064, 991,1554,1542,1592, 27, 43,2853, 859, 139,1456, 860,4300, # 2998
|
||||
437, 712,3871, 164,2392,3112, 695, 211,3017,2096, 195,3872,1608,3504,3505,3618, # 3014
|
||||
3873, 234, 811,2971,2097,3874,2229,1441,3506,1615,2375, 668,2076,1638, 305, 228, # 3030
|
||||
1664,4301, 467, 415,7327, 262,2098,1593, 239, 108, 300, 200,1033, 512,1247,2077, # 3046
|
||||
7328,7329,2173,3176,3619,2673, 593, 845,1062,3244, 88,1723,2037,3875,1950, 212, # 3062
|
||||
266, 152, 149, 468,1898,4066,4302, 77, 187,7330,3018, 37, 5,2972,7331,3876, # 3078
|
||||
7332,7333, 39,2517,4303,2894,3177,2078, 55, 148, 74,4304, 545, 483,1474,1029, # 3094
|
||||
1665, 217,1869,1531,3113,1104,2645,4067, 24, 172,3507, 900,3877,3508,3509,4305, # 3110
|
||||
32,1408,2811,1312, 329, 487,2355,2247,2708, 784,2674, 4,3019,3314,1427,1788, # 3126
|
||||
188, 109, 499,7334,3620,1717,1789, 888,1217,3020,4306,7335,3510,7336,3315,1520, # 3142
|
||||
3621,3878, 196,1034, 775,7337,7338, 929,1815, 249, 439, 38,7339,1063,7340, 794, # 3158
|
||||
3879,1435,2296, 46, 178,3245,2065,7341,2376,7342, 214,1709,4307, 804, 35, 707, # 3174
|
||||
324,3622,1601,2546, 140, 459,4068,7343,7344,1365, 839, 272, 978,2257,2572,3409, # 3190
|
||||
2128,1363,3623,1423, 697, 100,3071, 48, 70,1231, 495,3114,2193,7345,1294,7346, # 3206
|
||||
2079, 462, 586,1042,3246, 853, 256, 988, 185,2377,3410,1698, 434,1084,7347,3411, # 3222
|
||||
314,2615,2775,4308,2330,2331, 569,2280, 637,1816,2518, 757,1162,1878,1616,3412, # 3238
|
||||
287,1577,2115, 768,4309,1671,2854,3511,2519,1321,3737, 909,2413,7348,4069, 933, # 3254
|
||||
3738,7349,2052,2356,1222,4310, 765,2414,1322, 786,4311,7350,1919,1462,1677,2895, # 3270
|
||||
1699,7351,4312,1424,2437,3115,3624,2590,3316,1774,1940,3413,3880,4070, 309,1369, # 3286
|
||||
1130,2812, 364,2230,1653,1299,3881,3512,3882,3883,2646, 525,1085,3021, 902,2000, # 3302
|
||||
1475, 964,4313, 421,1844,1415,1057,2281, 940,1364,3116, 376,4314,4315,1381, 7, # 3318
|
||||
2520, 983,2378, 336,1710,2675,1845, 321,3414, 559,1131,3022,2742,1808,1132,1313, # 3334
|
||||
265,1481,1857,7352, 352,1203,2813,3247, 167,1089, 420,2814, 776, 792,1724,3513, # 3350
|
||||
4071,2438,3248,7353,4072,7354, 446, 229, 333,2743, 901,3739,1200,1557,4316,2647, # 3366
|
||||
1920, 395,2744,2676,3740,4073,1835, 125, 916,3178,2616,4317,7355,7356,3741,7357, # 3382
|
||||
7358,7359,4318,3117,3625,1133,2547,1757,3415,1510,2313,1409,3514,7360,2145, 438, # 3398
|
||||
2591,2896,2379,3317,1068, 958,3023, 461, 311,2855,2677,4074,1915,3179,4075,1978, # 3414
|
||||
383, 750,2745,2617,4076, 274, 539, 385,1278,1442,7361,1154,1964, 384, 561, 210, # 3430
|
||||
98,1295,2548,3515,7362,1711,2415,1482,3416,3884,2897,1257, 129,7363,3742, 642, # 3446
|
||||
523,2776,2777,2648,7364, 141,2231,1333, 68, 176, 441, 876, 907,4077, 603,2592, # 3462
|
||||
710, 171,3417, 404, 549, 18,3118,2393,1410,3626,1666,7365,3516,4319,2898,4320, # 3478
|
||||
7366,2973, 368,7367, 146, 366, 99, 871,3627,1543, 748, 807,1586,1185, 22,2258, # 3494
|
||||
379,3743,3180,7368,3181, 505,1941,2618,1991,1382,2314,7369, 380,2357, 218, 702, # 3510
|
||||
1817,1248,3418,3024,3517,3318,3249,7370,2974,3628, 930,3250,3744,7371, 59,7372, # 3526
|
||||
585, 601,4078, 497,3419,1112,1314,4321,1801,7373,1223,1472,2174,7374, 749,1836, # 3542
|
||||
690,1899,3745,1772,3885,1476, 429,1043,1790,2232,2116, 917,4079, 447,1086,1629, # 3558
|
||||
7375, 556,7376,7377,2020,1654, 844,1090, 105, 550, 966,1758,2815,1008,1782, 686, # 3574
|
||||
1095,7378,2282, 793,1602,7379,3518,2593,4322,4080,2933,2297,4323,3746, 980,2496, # 3590
|
||||
544, 353, 527,4324, 908,2678,2899,7380, 381,2619,1942,1348,7381,1341,1252, 560, # 3606
|
||||
3072,7382,3420,2856,7383,2053, 973, 886,2080, 143,4325,7384,7385, 157,3886, 496, # 3622
|
||||
4081, 57, 840, 540,2038,4326,4327,3421,2117,1445, 970,2259,1748,1965,2081,4082, # 3638
|
||||
3119,1234,1775,3251,2816,3629, 773,1206,2129,1066,2039,1326,3887,1738,1725,4083, # 3654
|
||||
279,3120, 51,1544,2594, 423,1578,2130,2066, 173,4328,1879,7386,7387,1583, 264, # 3670
|
||||
610,3630,4329,2439, 280, 154,7388,7389,7390,1739, 338,1282,3073, 693,2857,1411, # 3686
|
||||
1074,3747,2440,7391,4330,7392,7393,1240, 952,2394,7394,2900,1538,2679, 685,1483, # 3702
|
||||
4084,2468,1436, 953,4085,2054,4331, 671,2395, 79,4086,2441,3252, 608, 567,2680, # 3718
|
||||
3422,4087,4088,1691, 393,1261,1791,2396,7395,4332,7396,7397,7398,7399,1383,1672, # 3734
|
||||
3748,3182,1464, 522,1119, 661,1150, 216, 675,4333,3888,1432,3519, 609,4334,2681, # 3750
|
||||
2397,7400,7401,7402,4089,3025, 0,7403,2469, 315, 231,2442, 301,3319,4335,2380, # 3766
|
||||
7404, 233,4090,3631,1818,4336,4337,7405, 96,1776,1315,2082,7406, 257,7407,1809, # 3782
|
||||
3632,2709,1139,1819,4091,2021,1124,2163,2778,1777,2649,7408,3074, 363,1655,3183, # 3798
|
||||
7409,2975,7410,7411,7412,3889,1567,3890, 718, 103,3184, 849,1443, 341,3320,2934, # 3814
|
||||
1484,7413,1712, 127, 67, 339,4092,2398, 679,1412, 821,7414,7415, 834, 738, 351, # 3830
|
||||
2976,2146, 846, 235,1497,1880, 418,1992,3749,2710, 186,1100,2147,2746,3520,1545, # 3846
|
||||
1355,2935,2858,1377, 583,3891,4093,2573,2977,7416,1298,3633,1078,2549,3634,2358, # 3862
|
||||
78,3750,3751, 267,1289,2099,2001,1594,4094, 348, 369,1274,2194,2175,1837,4338, # 3878
|
||||
1820,2817,3635,2747,2283,2002,4339,2936,2748, 144,3321, 882,4340,3892,2749,3423, # 3894
|
||||
4341,2901,7417,4095,1726, 320,7418,3893,3026, 788,2978,7419,2818,1773,1327,2859, # 3910
|
||||
3894,2819,7420,1306,4342,2003,1700,3752,3521,2359,2650, 787,2022, 506, 824,3636, # 3926
|
||||
534, 323,4343,1044,3322,2023,1900, 946,3424,7421,1778,1500,1678,7422,1881,4344, # 3942
|
||||
165, 243,4345,3637,2521, 123, 683,4096, 764,4346, 36,3895,1792, 589,2902, 816, # 3958
|
||||
626,1667,3027,2233,1639,1555,1622,3753,3896,7423,3897,2860,1370,1228,1932, 891, # 3974
|
||||
2083,2903, 304,4097,7424, 292,2979,2711,3522, 691,2100,4098,1115,4347, 118, 662, # 3990
|
||||
7425, 611,1156, 854,2381,1316,2861, 2, 386, 515,2904,7426,7427,3253, 868,2234, # 4006
|
||||
1486, 855,2651, 785,2212,3028,7428,1040,3185,3523,7429,3121, 448,7430,1525,7431, # 4022
|
||||
2164,4348,7432,3754,7433,4099,2820,3524,3122, 503, 818,3898,3123,1568, 814, 676, # 4038
|
||||
1444, 306,1749,7434,3755,1416,1030, 197,1428, 805,2821,1501,4349,7435,7436,7437, # 4054
|
||||
1993,7438,4350,7439,7440,2195, 13,2779,3638,2980,3124,1229,1916,7441,3756,2131, # 4070
|
||||
7442,4100,4351,2399,3525,7443,2213,1511,1727,1120,7444,7445, 646,3757,2443, 307, # 4086
|
||||
7446,7447,1595,3186,7448,7449,7450,3639,1113,1356,3899,1465,2522,2523,7451, 519, # 4102
|
||||
7452, 128,2132, 92,2284,1979,7453,3900,1512, 342,3125,2196,7454,2780,2214,1980, # 4118
|
||||
3323,7455, 290,1656,1317, 789, 827,2360,7456,3758,4352, 562, 581,3901,7457, 401, # 4134
|
||||
4353,2248, 94,4354,1399,2781,7458,1463,2024,4355,3187,1943,7459, 828,1105,4101, # 4150
|
||||
1262,1394,7460,4102, 605,4356,7461,1783,2862,7462,2822, 819,2101, 578,2197,2937, # 4166
|
||||
7463,1502, 436,3254,4103,3255,2823,3902,2905,3425,3426,7464,2712,2315,7465,7466, # 4182
|
||||
2332,2067, 23,4357, 193, 826,3759,2102, 699,1630,4104,3075, 390,1793,1064,3526, # 4198
|
||||
7467,1579,3076,3077,1400,7468,4105,1838,1640,2863,7469,4358,4359, 137,4106, 598, # 4214
|
||||
3078,1966, 780, 104, 974,2938,7470, 278, 899, 253, 402, 572, 504, 493,1339,7471, # 4230
|
||||
3903,1275,4360,2574,2550,7472,3640,3029,3079,2249, 565,1334,2713, 863, 41,7473, # 4246
|
||||
7474,4361,7475,1657,2333, 19, 463,2750,4107, 606,7476,2981,3256,1087,2084,1323, # 4262
|
||||
2652,2982,7477,1631,1623,1750,4108,2682,7478,2864, 791,2714,2653,2334, 232,2416, # 4278
|
||||
7479,2983,1498,7480,2654,2620, 755,1366,3641,3257,3126,2025,1609, 119,1917,3427, # 4294
|
||||
862,1026,4109,7481,3904,3760,4362,3905,4363,2260,1951,2470,7482,1125, 817,4110, # 4310
|
||||
4111,3906,1513,1766,2040,1487,4112,3030,3258,2824,3761,3127,7483,7484,1507,7485, # 4326
|
||||
2683, 733, 40,1632,1106,2865, 345,4113, 841,2524, 230,4364,2984,1846,3259,3428, # 4342
|
||||
7486,1263, 986,3429,7487, 735, 879, 254,1137, 857, 622,1300,1180,1388,1562,3907, # 4358
|
||||
3908,2939, 967,2751,2655,1349, 592,2133,1692,3324,2985,1994,4114,1679,3909,1901, # 4374
|
||||
2185,7488, 739,3642,2715,1296,1290,7489,4115,2198,2199,1921,1563,2595,2551,1870, # 4390
|
||||
2752,2986,7490, 435,7491, 343,1108, 596, 17,1751,4365,2235,3430,3643,7492,4366, # 4406
|
||||
294,3527,2940,1693, 477, 979, 281,2041,3528, 643,2042,3644,2621,2782,2261,1031, # 4422
|
||||
2335,2134,2298,3529,4367, 367,1249,2552,7493,3530,7494,4368,1283,3325,2004, 240, # 4438
|
||||
1762,3326,4369,4370, 836,1069,3128, 474,7495,2148,2525, 268,3531,7496,3188,1521, # 4454
|
||||
1284,7497,1658,1546,4116,7498,3532,3533,7499,4117,3327,2684,1685,4118, 961,1673, # 4470
|
||||
2622, 190,2005,2200,3762,4371,4372,7500, 570,2497,3645,1490,7501,4373,2623,3260, # 4486
|
||||
1956,4374, 584,1514, 396,1045,1944,7502,4375,1967,2444,7503,7504,4376,3910, 619, # 4502
|
||||
7505,3129,3261, 215,2006,2783,2553,3189,4377,3190,4378, 763,4119,3763,4379,7506, # 4518
|
||||
7507,1957,1767,2941,3328,3646,1174, 452,1477,4380,3329,3130,7508,2825,1253,2382, # 4534
|
||||
2186,1091,2285,4120, 492,7509, 638,1169,1824,2135,1752,3911, 648, 926,1021,1324, # 4550
|
||||
4381, 520,4382, 997, 847,1007, 892,4383,3764,2262,1871,3647,7510,2400,1784,4384, # 4566
|
||||
1952,2942,3080,3191,1728,4121,2043,3648,4385,2007,1701,3131,1551, 30,2263,4122, # 4582
|
||||
7511,2026,4386,3534,7512, 501,7513,4123, 594,3431,2165,1821,3535,3432,3536,3192, # 4598
|
||||
829,2826,4124,7514,1680,3132,1225,4125,7515,3262,4387,4126,3133,2336,7516,4388, # 4614
|
||||
4127,7517,3912,3913,7518,1847,2383,2596,3330,7519,4389, 374,3914, 652,4128,4129, # 4630
|
||||
375,1140, 798,7520,7521,7522,2361,4390,2264, 546,1659, 138,3031,2445,4391,7523, # 4646
|
||||
2250, 612,1848, 910, 796,3765,1740,1371, 825,3766,3767,7524,2906,2554,7525, 692, # 4662
|
||||
444,3032,2624, 801,4392,4130,7526,1491, 244,1053,3033,4131,4132, 340,7527,3915, # 4678
|
||||
1041,2987, 293,1168, 87,1357,7528,1539, 959,7529,2236, 721, 694,4133,3768, 219, # 4694
|
||||
1478, 644,1417,3331,2656,1413,1401,1335,1389,3916,7530,7531,2988,2362,3134,1825, # 4710
|
||||
730,1515, 184,2827, 66,4393,7532,1660,2943, 246,3332, 378,1457, 226,3433, 975, # 4726
|
||||
3917,2944,1264,3537, 674, 696,7533, 163,7534,1141,2417,2166, 713,3538,3333,4394, # 4742
|
||||
3918,7535,7536,1186, 15,7537,1079,1070,7538,1522,3193,3539, 276,1050,2716, 758, # 4758
|
||||
1126, 653,2945,3263,7539,2337, 889,3540,3919,3081,2989, 903,1250,4395,3920,3434, # 4774
|
||||
3541,1342,1681,1718, 766,3264, 286, 89,2946,3649,7540,1713,7541,2597,3334,2990, # 4790
|
||||
7542,2947,2215,3194,2866,7543,4396,2498,2526, 181, 387,1075,3921, 731,2187,3335, # 4806
|
||||
7544,3265, 310, 313,3435,2299, 770,4134, 54,3034, 189,4397,3082,3769,3922,7545, # 4822
|
||||
1230,1617,1849, 355,3542,4135,4398,3336, 111,4136,3650,1350,3135,3436,3035,4137, # 4838
|
||||
2149,3266,3543,7546,2784,3923,3924,2991, 722,2008,7547,1071, 247,1207,2338,2471, # 4854
|
||||
1378,4399,2009, 864,1437,1214,4400, 373,3770,1142,2216, 667,4401, 442,2753,2555, # 4870
|
||||
3771,3925,1968,4138,3267,1839, 837, 170,1107, 934,1336,1882,7548,7549,2118,4139, # 4886
|
||||
2828, 743,1569,7550,4402,4140, 582,2384,1418,3437,7551,1802,7552, 357,1395,1729, # 4902
|
||||
3651,3268,2418,1564,2237,7553,3083,3772,1633,4403,1114,2085,4141,1532,7554, 482, # 4918
|
||||
2446,4404,7555,7556,1492, 833,1466,7557,2717,3544,1641,2829,7558,1526,1272,3652, # 4934
|
||||
4142,1686,1794, 416,2556,1902,1953,1803,7559,3773,2785,3774,1159,2316,7560,2867, # 4950
|
||||
4405,1610,1584,3036,2419,2754, 443,3269,1163,3136,7561,7562,3926,7563,4143,2499, # 4966
|
||||
3037,4406,3927,3137,2103,1647,3545,2010,1872,4144,7564,4145, 431,3438,7565, 250, # 4982
|
||||
97, 81,4146,7566,1648,1850,1558, 160, 848,7567, 866, 740,1694,7568,2201,2830, # 4998
|
||||
3195,4147,4407,3653,1687, 950,2472, 426, 469,3196,3654,3655,3928,7569,7570,1188, # 5014
|
||||
424,1995, 861,3546,4148,3775,2202,2685, 168,1235,3547,4149,7571,2086,1674,4408, # 5030
|
||||
3337,3270, 220,2557,1009,7572,3776, 670,2992, 332,1208, 717,7573,7574,3548,2447, # 5046
|
||||
3929,3338,7575, 513,7576,1209,2868,3339,3138,4409,1080,7577,7578,7579,7580,2527, # 5062
|
||||
3656,3549, 815,1587,3930,3931,7581,3550,3439,3777,1254,4410,1328,3038,1390,3932, # 5078
|
||||
1741,3933,3778,3934,7582, 236,3779,2448,3271,7583,7584,3657,3780,1273,3781,4411, # 5094
|
||||
7585, 308,7586,4412, 245,4413,1851,2473,1307,2575, 430, 715,2136,2449,7587, 270, # 5110
|
||||
199,2869,3935,7588,3551,2718,1753, 761,1754, 725,1661,1840,4414,3440,3658,7589, # 5126
|
||||
7590, 587, 14,3272, 227,2598, 326, 480,2265, 943,2755,3552, 291, 650,1883,7591, # 5142
|
||||
1702,1226, 102,1547, 62,3441, 904,4415,3442,1164,4150,7592,7593,1224,1548,2756, # 5158
|
||||
391, 498,1493,7594,1386,1419,7595,2055,1177,4416, 813, 880,1081,2363, 566,1145, # 5174
|
||||
4417,2286,1001,1035,2558,2599,2238, 394,1286,7596,7597,2068,7598, 86,1494,1730, # 5190
|
||||
3936, 491,1588, 745, 897,2948, 843,3340,3937,2757,2870,3273,1768, 998,2217,2069, # 5206
|
||||
397,1826,1195,1969,3659,2993,3341, 284,7599,3782,2500,2137,2119,1903,7600,3938, # 5222
|
||||
2150,3939,4151,1036,3443,1904, 114,2559,4152, 209,1527,7601,7602,2949,2831,2625, # 5238
|
||||
2385,2719,3139, 812,2560,7603,3274,7604,1559, 737,1884,3660,1210, 885, 28,2686, # 5254
|
||||
3553,3783,7605,4153,1004,1779,4418,7606, 346,1981,2218,2687,4419,3784,1742, 797, # 5270
|
||||
1642,3940,1933,1072,1384,2151, 896,3941,3275,3661,3197,2871,3554,7607,2561,1958, # 5286
|
||||
4420,2450,1785,7608,7609,7610,3942,4154,1005,1308,3662,4155,2720,4421,4422,1528, # 5302
|
||||
2600, 161,1178,4156,1982, 987,4423,1101,4157, 631,3943,1157,3198,2420,1343,1241, # 5318
|
||||
1016,2239,2562, 372, 877,2339,2501,1160, 555,1934, 911,3944,7611, 466,1170, 169, # 5334
|
||||
1051,2907,2688,3663,2474,2994,1182,2011,2563,1251,2626,7612, 992,2340,3444,1540, # 5350
|
||||
2721,1201,2070,2401,1996,2475,7613,4424, 528,1922,2188,1503,1873,1570,2364,3342, # 5366
|
||||
3276,7614, 557,1073,7615,1827,3445,2087,2266,3140,3039,3084, 767,3085,2786,4425, # 5382
|
||||
1006,4158,4426,2341,1267,2176,3664,3199, 778,3945,3200,2722,1597,2657,7616,4427, # 5398
|
||||
7617,3446,7618,7619,7620,3277,2689,1433,3278, 131, 95,1504,3946, 723,4159,3141, # 5414
|
||||
1841,3555,2758,2189,3947,2027,2104,3665,7621,2995,3948,1218,7622,3343,3201,3949, # 5430
|
||||
4160,2576, 248,1634,3785, 912,7623,2832,3666,3040,3786, 654, 53,7624,2996,7625, # 5446
|
||||
1688,4428, 777,3447,1032,3950,1425,7626, 191, 820,2120,2833, 971,4429, 931,3202, # 5462
|
||||
135, 664, 783,3787,1997, 772,2908,1935,3951,3788,4430,2909,3203, 282,2723, 640, # 5478
|
||||
1372,3448,1127, 922, 325,3344,7627,7628, 711,2044,7629,7630,3952,2219,2787,1936, # 5494
|
||||
3953,3345,2220,2251,3789,2300,7631,4431,3790,1258,3279,3954,3204,2138,2950,3955, # 5510
|
||||
3956,7632,2221, 258,3205,4432, 101,1227,7633,3280,1755,7634,1391,3281,7635,2910, # 5526
|
||||
2056, 893,7636,7637,7638,1402,4161,2342,7639,7640,3206,3556,7641,7642, 878,1325, # 5542
|
||||
1780,2788,4433, 259,1385,2577, 744,1183,2267,4434,7643,3957,2502,7644, 684,1024, # 5558
|
||||
4162,7645, 472,3557,3449,1165,3282,3958,3959, 322,2152, 881, 455,1695,1152,1340, # 5574
|
||||
660, 554,2153,4435,1058,4436,4163, 830,1065,3346,3960,4437,1923,7646,1703,1918, # 5590
|
||||
7647, 932,2268, 122,7648,4438, 947, 677,7649,3791,2627, 297,1905,1924,2269,4439, # 5606
|
||||
2317,3283,7650,7651,4164,7652,4165, 84,4166, 112, 989,7653, 547,1059,3961, 701, # 5622
|
||||
3558,1019,7654,4167,7655,3450, 942, 639, 457,2301,2451, 993,2951, 407, 851, 494, # 5638
|
||||
4440,3347, 927,7656,1237,7657,2421,3348, 573,4168, 680, 921,2911,1279,1874, 285, # 5654
|
||||
790,1448,1983, 719,2167,7658,7659,4441,3962,3963,1649,7660,1541, 563,7661,1077, # 5670
|
||||
7662,3349,3041,3451, 511,2997,3964,3965,3667,3966,1268,2564,3350,3207,4442,4443, # 5686
|
||||
7663, 535,1048,1276,1189,2912,2028,3142,1438,1373,2834,2952,1134,2012,7664,4169, # 5702
|
||||
1238,2578,3086,1259,7665, 700,7666,2953,3143,3668,4170,7667,4171,1146,1875,1906, # 5718
|
||||
4444,2601,3967, 781,2422, 132,1589, 203, 147, 273,2789,2402, 898,1786,2154,3968, # 5734
|
||||
3969,7668,3792,2790,7669,7670,4445,4446,7671,3208,7672,1635,3793, 965,7673,1804, # 5750
|
||||
2690,1516,3559,1121,1082,1329,3284,3970,1449,3794, 65,1128,2835,2913,2759,1590, # 5766
|
||||
3795,7674,7675, 12,2658, 45, 976,2579,3144,4447, 517,2528,1013,1037,3209,7676, # 5782
|
||||
3796,2836,7677,3797,7678,3452,7679,2602, 614,1998,2318,3798,3087,2724,2628,7680, # 5798
|
||||
2580,4172, 599,1269,7681,1810,3669,7682,2691,3088, 759,1060, 489,1805,3351,3285, # 5814
|
||||
1358,7683,7684,2386,1387,1215,2629,2252, 490,7685,7686,4173,1759,2387,2343,7687, # 5830
|
||||
4448,3799,1907,3971,2630,1806,3210,4449,3453,3286,2760,2344, 874,7688,7689,3454, # 5846
|
||||
3670,1858, 91,2914,3671,3042,3800,4450,7690,3145,3972,2659,7691,3455,1202,1403, # 5862
|
||||
3801,2954,2529,1517,2503,4451,3456,2504,7692,4452,7693,2692,1885,1495,1731,3973, # 5878
|
||||
2365,4453,7694,2029,7695,7696,3974,2693,1216, 237,2581,4174,2319,3975,3802,4454, # 5894
|
||||
4455,2694,3560,3457, 445,4456,7697,7698,7699,7700,2761, 61,3976,3672,1822,3977, # 5910
|
||||
7701, 687,2045, 935, 925, 405,2660, 703,1096,1859,2725,4457,3978,1876,1367,2695, # 5926
|
||||
3352, 918,2105,1781,2476, 334,3287,1611,1093,4458, 564,3146,3458,3673,3353, 945, # 5942
|
||||
2631,2057,4459,7702,1925, 872,4175,7703,3459,2696,3089, 349,4176,3674,3979,4460, # 5958
|
||||
3803,4177,3675,2155,3980,4461,4462,4178,4463,2403,2046, 782,3981, 400, 251,4179, # 5974
|
||||
1624,7704,7705, 277,3676, 299,1265, 476,1191,3804,2121,4180,4181,1109, 205,7706, # 5990
|
||||
2582,1000,2156,3561,1860,7707,7708,7709,4464,7710,4465,2565, 107,2477,2157,3982, # 6006
|
||||
3460,3147,7711,1533, 541,1301, 158, 753,4182,2872,3562,7712,1696, 370,1088,4183, # 6022
|
||||
4466,3563, 579, 327, 440, 162,2240, 269,1937,1374,3461, 968,3043, 56,1396,3090, # 6038
|
||||
2106,3288,3354,7713,1926,2158,4467,2998,7714,3564,7715,7716,3677,4468,2478,7717, # 6054
|
||||
2791,7718,1650,4469,7719,2603,7720,7721,3983,2661,3355,1149,3356,3984,3805,3985, # 6070
|
||||
7722,1076, 49,7723, 951,3211,3289,3290, 450,2837, 920,7724,1811,2792,2366,4184, # 6086
|
||||
1908,1138,2367,3806,3462,7725,3212,4470,1909,1147,1518,2423,4471,3807,7726,4472, # 6102
|
||||
2388,2604, 260,1795,3213,7727,7728,3808,3291, 708,7729,3565,1704,7730,3566,1351, # 6118
|
||||
1618,3357,2999,1886, 944,4185,3358,4186,3044,3359,4187,7731,3678, 422, 413,1714, # 6134
|
||||
3292, 500,2058,2345,4188,2479,7732,1344,1910, 954,7733,1668,7734,7735,3986,2404, # 6150
|
||||
4189,3567,3809,4190,7736,2302,1318,2505,3091, 133,3092,2873,4473, 629, 31,2838, # 6166
|
||||
2697,3810,4474, 850, 949,4475,3987,2955,1732,2088,4191,1496,1852,7737,3988, 620, # 6182
|
||||
3214, 981,1242,3679,3360,1619,3680,1643,3293,2139,2452,1970,1719,3463,2168,7738, # 6198
|
||||
3215,7739,7740,3361,1828,7741,1277,4476,1565,2047,7742,1636,3568,3093,7743, 869, # 6214
|
||||
2839, 655,3811,3812,3094,3989,3000,3813,1310,3569,4477,7744,7745,7746,1733, 558, # 6230
|
||||
4478,3681, 335,1549,3045,1756,4192,3682,1945,3464,1829,1291,1192, 470,2726,2107, # 6246
|
||||
2793, 913,1054,3990,7747,1027,7748,3046,3991,4479, 982,2662,3362,3148,3465,3216, # 6262
|
||||
3217,1946,2794,7749, 571,4480,7750,1830,7751,3570,2583,1523,2424,7752,2089, 984, # 6278
|
||||
4481,3683,1959,7753,3684, 852, 923,2795,3466,3685, 969,1519, 999,2048,2320,1705, # 6294
|
||||
7754,3095, 615,1662, 151, 597,3992,2405,2321,1049, 275,4482,3686,4193, 568,3687, # 6310
|
||||
3571,2480,4194,3688,7755,2425,2270, 409,3218,7756,1566,2874,3467,1002, 769,2840, # 6326
|
||||
194,2090,3149,3689,2222,3294,4195, 628,1505,7757,7758,1763,2177,3001,3993, 521, # 6342
|
||||
1161,2584,1787,2203,2406,4483,3994,1625,4196,4197, 412, 42,3096, 464,7759,2632, # 6358
|
||||
4484,3363,1760,1571,2875,3468,2530,1219,2204,3814,2633,2140,2368,4485,4486,3295, # 6374
|
||||
1651,3364,3572,7760,7761,3573,2481,3469,7762,3690,7763,7764,2271,2091, 460,7765, # 6390
|
||||
4487,7766,3002, 962, 588,3574, 289,3219,2634,1116, 52,7767,3047,1796,7768,7769, # 6406
|
||||
7770,1467,7771,1598,1143,3691,4198,1984,1734,1067,4488,1280,3365, 465,4489,1572, # 6422
|
||||
510,7772,1927,2241,1812,1644,3575,7773,4490,3692,7774,7775,2663,1573,1534,7776, # 6438
|
||||
7777,4199, 536,1807,1761,3470,3815,3150,2635,7778,7779,7780,4491,3471,2915,1911, # 6454
|
||||
2796,7781,3296,1122, 377,3220,7782, 360,7783,7784,4200,1529, 551,7785,2059,3693, # 6470
|
||||
1769,2426,7786,2916,4201,3297,3097,2322,2108,2030,4492,1404, 136,1468,1479, 672, # 6486
|
||||
1171,3221,2303, 271,3151,7787,2762,7788,2049, 678,2727, 865,1947,4493,7789,2013, # 6502
|
||||
3995,2956,7790,2728,2223,1397,3048,3694,4494,4495,1735,2917,3366,3576,7791,3816, # 6518
|
||||
509,2841,2453,2876,3817,7792,7793,3152,3153,4496,4202,2531,4497,2304,1166,1010, # 6534
|
||||
552, 681,1887,7794,7795,2957,2958,3996,1287,1596,1861,3154, 358, 453, 736, 175, # 6550
|
||||
478,1117, 905,1167,1097,7796,1853,1530,7797,1706,7798,2178,3472,2287,3695,3473, # 6566
|
||||
3577,4203,2092,4204,7799,3367,1193,2482,4205,1458,2190,2205,1862,1888,1421,3298, # 6582
|
||||
2918,3049,2179,3474, 595,2122,7800,3997,7801,7802,4206,1707,2636, 223,3696,1359, # 6598
|
||||
751,3098, 183,3475,7803,2797,3003, 419,2369, 633, 704,3818,2389, 241,7804,7805, # 6614
|
||||
7806, 838,3004,3697,2272,2763,2454,3819,1938,2050,3998,1309,3099,2242,1181,7807, # 6630
|
||||
1136,2206,3820,2370,1446,4207,2305,4498,7808,7809,4208,1055,2605, 484,3698,7810, # 6646
|
||||
3999, 625,4209,2273,3368,1499,4210,4000,7811,4001,4211,3222,2274,2275,3476,7812, # 6662
|
||||
7813,2764, 808,2606,3699,3369,4002,4212,3100,2532, 526,3370,3821,4213, 955,7814, # 6678
|
||||
1620,4214,2637,2427,7815,1429,3700,1669,1831, 994, 928,7816,3578,1260,7817,7818, # 6694
|
||||
7819,1948,2288, 741,2919,1626,4215,2729,2455, 867,1184, 362,3371,1392,7820,7821, # 6710
|
||||
4003,4216,1770,1736,3223,2920,4499,4500,1928,2698,1459,1158,7822,3050,3372,2877, # 6726
|
||||
1292,1929,2506,2842,3701,1985,1187,2071,2014,2607,4217,7823,2566,2507,2169,3702, # 6742
|
||||
2483,3299,7824,3703,4501,7825,7826, 666,1003,3005,1022,3579,4218,7827,4502,1813, # 6758
|
||||
2253, 574,3822,1603, 295,1535, 705,3823,4219, 283, 858, 417,7828,7829,3224,4503, # 6774
|
||||
4504,3051,1220,1889,1046,2276,2456,4004,1393,1599, 689,2567, 388,4220,7830,2484, # 6790
|
||||
802,7831,2798,3824,2060,1405,2254,7832,4505,3825,2109,1052,1345,3225,1585,7833, # 6806
|
||||
809,7834,7835,7836, 575,2730,3477, 956,1552,1469,1144,2323,7837,2324,1560,2457, # 6822
|
||||
3580,3226,4005, 616,2207,3155,2180,2289,7838,1832,7839,3478,4506,7840,1319,3704, # 6838
|
||||
3705,1211,3581,1023,3227,1293,2799,7841,7842,7843,3826, 607,2306,3827, 762,2878, # 6854
|
||||
1439,4221,1360,7844,1485,3052,7845,4507,1038,4222,1450,2061,2638,4223,1379,4508, # 6870
|
||||
2585,7846,7847,4224,1352,1414,2325,2921,1172,7848,7849,3828,3829,7850,1797,1451, # 6886
|
||||
7851,7852,7853,7854,2922,4006,4007,2485,2346, 411,4008,4009,3582,3300,3101,4509, # 6902
|
||||
1561,2664,1452,4010,1375,7855,7856, 47,2959, 316,7857,1406,1591,2923,3156,7858, # 6918
|
||||
1025,2141,3102,3157, 354,2731, 884,2224,4225,2407, 508,3706, 726,3583, 996,2428, # 6934
|
||||
3584, 729,7859, 392,2191,1453,4011,4510,3707,7860,7861,2458,3585,2608,1675,2800, # 6950
|
||||
919,2347,2960,2348,1270,4511,4012, 73,7862,7863, 647,7864,3228,2843,2255,1550, # 6966
|
||||
1346,3006,7865,1332, 883,3479,7866,7867,7868,7869,3301,2765,7870,1212, 831,1347, # 6982
|
||||
4226,4512,2326,3830,1863,3053, 720,3831,4513,4514,3832,7871,4227,7872,7873,4515, # 6998
|
||||
7874,7875,1798,4516,3708,2609,4517,3586,1645,2371,7876,7877,2924, 669,2208,2665, # 7014
|
||||
2429,7878,2879,7879,7880,1028,3229,7881,4228,2408,7882,2256,1353,7883,7884,4518, # 7030
|
||||
3158, 518,7885,4013,7886,4229,1960,7887,2142,4230,7888,7889,3007,2349,2350,3833, # 7046
|
||||
516,1833,1454,4014,2699,4231,4519,2225,2610,1971,1129,3587,7890,2766,7891,2961, # 7062
|
||||
1422, 577,1470,3008,1524,3373,7892,7893, 432,4232,3054,3480,7894,2586,1455,2508, # 7078
|
||||
2226,1972,1175,7895,1020,2732,4015,3481,4520,7896,2733,7897,1743,1361,3055,3482, # 7094
|
||||
2639,4016,4233,4521,2290, 895, 924,4234,2170, 331,2243,3056, 166,1627,3057,1098, # 7110
|
||||
7898,1232,2880,2227,3374,4522, 657, 403,1196,2372, 542,3709,3375,1600,4235,3483, # 7126
|
||||
7899,4523,2767,3230, 576, 530,1362,7900,4524,2533,2666,3710,4017,7901, 842,3834, # 7142
|
||||
7902,2801,2031,1014,4018, 213,2700,3376, 665, 621,4236,7903,3711,2925,2430,7904, # 7158
|
||||
2431,3302,3588,3377,7905,4237,2534,4238,4525,3589,1682,4239,3484,1380,7906, 724, # 7174
|
||||
2277, 600,1670,7907,1337,1233,4526,3103,2244,7908,1621,4527,7909, 651,4240,7910, # 7190
|
||||
1612,4241,2611,7911,2844,7912,2734,2307,3058,7913, 716,2459,3059, 174,1255,2701, # 7206
|
||||
4019,3590, 548,1320,1398, 728,4020,1574,7914,1890,1197,3060,4021,7915,3061,3062, # 7222
|
||||
3712,3591,3713, 747,7916, 635,4242,4528,7917,7918,7919,4243,7920,7921,4529,7922, # 7238
|
||||
3378,4530,2432, 451,7923,3714,2535,2072,4244,2735,4245,4022,7924,1764,4531,7925, # 7254
|
||||
4246, 350,7926,2278,2390,2486,7927,4247,4023,2245,1434,4024, 488,4532, 458,4248, # 7270
|
||||
4025,3715, 771,1330,2391,3835,2568,3159,2159,2409,1553,2667,3160,4249,7928,2487, # 7286
|
||||
2881,2612,1720,2702,4250,3379,4533,7929,2536,4251,7930,3231,4252,2768,7931,2015, # 7302
|
||||
2736,7932,1155,1017,3716,3836,7933,3303,2308, 201,1864,4253,1430,7934,4026,7935, # 7318
|
||||
7936,7937,7938,7939,4254,1604,7940, 414,1865, 371,2587,4534,4535,3485,2016,3104, # 7334
|
||||
4536,1708, 960,4255, 887, 389,2171,1536,1663,1721,7941,2228,4027,2351,2926,1580, # 7350
|
||||
7942,7943,7944,1744,7945,2537,4537,4538,7946,4539,7947,2073,7948,7949,3592,3380, # 7366
|
||||
2882,4256,7950,4257,2640,3381,2802, 673,2703,2460, 709,3486,4028,3593,4258,7951, # 7382
|
||||
1148, 502, 634,7952,7953,1204,4540,3594,1575,4541,2613,3717,7954,3718,3105, 948, # 7398
|
||||
3232, 121,1745,3837,1110,7955,4259,3063,2509,3009,4029,3719,1151,1771,3838,1488, # 7414
|
||||
4030,1986,7956,2433,3487,7957,7958,2093,7959,4260,3839,1213,1407,2803, 531,2737, # 7430
|
||||
2538,3233,1011,1537,7960,2769,4261,3106,1061,7961,3720,3721,1866,2883,7962,2017, # 7446
|
||||
120,4262,4263,2062,3595,3234,2309,3840,2668,3382,1954,4542,7963,7964,3488,1047, # 7462
|
||||
2704,1266,7965,1368,4543,2845, 649,3383,3841,2539,2738,1102,2846,2669,7966,7967, # 7478
|
||||
1999,7968,1111,3596,2962,7969,2488,3842,3597,2804,1854,3384,3722,7970,7971,3385, # 7494
|
||||
2410,2884,3304,3235,3598,7972,2569,7973,3599,2805,4031,1460, 856,7974,3600,7975, # 7510
|
||||
2885,2963,7976,2886,3843,7977,4264, 632,2510, 875,3844,1697,3845,2291,7978,7979, # 7526
|
||||
4544,3010,1239, 580,4545,4265,7980, 914, 936,2074,1190,4032,1039,2123,7981,7982, # 7542
|
||||
7983,3386,1473,7984,1354,4266,3846,7985,2172,3064,4033, 915,3305,4267,4268,3306, # 7558
|
||||
1605,1834,7986,2739, 398,3601,4269,3847,4034, 328,1912,2847,4035,3848,1331,4270, # 7574
|
||||
3011, 937,4271,7987,3602,4036,4037,3387,2160,4546,3388, 524, 742, 538,3065,1012, # 7590
|
||||
7988,7989,3849,2461,7990, 658,1103, 225,3850,7991,7992,4547,7993,4548,7994,3236, # 7606
|
||||
1243,7995,4038, 963,2246,4549,7996,2705,3603,3161,7997,7998,2588,2327,7999,4550, # 7622
|
||||
8000,8001,8002,3489,3307, 957,3389,2540,2032,1930,2927,2462, 870,2018,3604,1746, # 7638
|
||||
2770,2771,2434,2463,8003,3851,8004,3723,3107,3724,3490,3390,3725,8005,1179,3066, # 7654
|
||||
8006,3162,2373,4272,3726,2541,3163,3108,2740,4039,8007,3391,1556,2542,2292, 977, # 7670
|
||||
2887,2033,4040,1205,3392,8008,1765,3393,3164,2124,1271,1689, 714,4551,3491,8009, # 7686
|
||||
2328,3852, 533,4273,3605,2181, 617,8010,2464,3308,3492,2310,8011,8012,3165,8013, # 7702
|
||||
8014,3853,1987, 618, 427,2641,3493,3394,8015,8016,1244,1690,8017,2806,4274,4552, # 7718
|
||||
8018,3494,8019,8020,2279,1576, 473,3606,4275,3395, 972,8021,3607,8022,3067,8023, # 7734
|
||||
8024,4553,4554,8025,3727,4041,4042,8026, 153,4555, 356,8027,1891,2888,4276,2143, # 7750
|
||||
408, 803,2352,8028,3854,8029,4277,1646,2570,2511,4556,4557,3855,8030,3856,4278, # 7766
|
||||
8031,2411,3396, 752,8032,8033,1961,2964,8034, 746,3012,2465,8035,4279,3728, 698, # 7782
|
||||
4558,1892,4280,3608,2543,4559,3609,3857,8036,3166,3397,8037,1823,1302,4043,2706, # 7798
|
||||
3858,1973,4281,8038,4282,3167, 823,1303,1288,1236,2848,3495,4044,3398, 774,3859, # 7814
|
||||
8039,1581,4560,1304,2849,3860,4561,8040,2435,2161,1083,3237,4283,4045,4284, 344, # 7830
|
||||
1173, 288,2311, 454,1683,8041,8042,1461,4562,4046,2589,8043,8044,4563, 985, 894, # 7846
|
||||
8045,3399,3168,8046,1913,2928,3729,1988,8047,2110,1974,8048,4047,8049,2571,1194, # 7862
|
||||
425,8050,4564,3169,1245,3730,4285,8051,8052,2850,8053, 636,4565,1855,3861, 760, # 7878
|
||||
1799,8054,4286,2209,1508,4566,4048,1893,1684,2293,8055,8056,8057,4287,4288,2210, # 7894
|
||||
479,8058,8059, 832,8060,4049,2489,8061,2965,2490,3731, 990,3109, 627,1814,2642, # 7910
|
||||
4289,1582,4290,2125,2111,3496,4567,8062, 799,4291,3170,8063,4568,2112,1737,3013, # 7926
|
||||
1018, 543, 754,4292,3309,1676,4569,4570,4050,8064,1489,8065,3497,8066,2614,2889, # 7942
|
||||
4051,8067,8068,2966,8069,8070,8071,8072,3171,4571,4572,2182,1722,8073,3238,3239, # 7958
|
||||
1842,3610,1715, 481, 365,1975,1856,8074,8075,1962,2491,4573,8076,2126,3611,3240, # 7974
|
||||
433,1894,2063,2075,8077, 602,2741,8078,8079,8080,8081,8082,3014,1628,3400,8083, # 7990
|
||||
3172,4574,4052,2890,4575,2512,8084,2544,2772,8085,8086,8087,3310,4576,2891,8088, # 8006
|
||||
4577,8089,2851,4578,4579,1221,2967,4053,2513,8090,8091,8092,1867,1989,8093,8094, # 8022
|
||||
8095,1895,8096,8097,4580,1896,4054, 318,8098,2094,4055,4293,8099,8100, 485,8101, # 8038
|
||||
938,3862, 553,2670, 116,8102,3863,3612,8103,3498,2671,2773,3401,3311,2807,8104, # 8054
|
||||
3613,2929,4056,1747,2930,2968,8105,8106, 207,8107,8108,2672,4581,2514,8109,3015, # 8070
|
||||
890,3614,3864,8110,1877,3732,3402,8111,2183,2353,3403,1652,8112,8113,8114, 941, # 8086
|
||||
2294, 208,3499,4057,2019, 330,4294,3865,2892,2492,3733,4295,8115,8116,8117,8118, # 8102
|
||||
1, 1800, 1506, 255, 1431, 198, 9, 82, 6, 7310, 177, 202, 3615, 1256, 2808, 110, # 2742
|
||||
3735, 33, 3241, 261, 76, 44, 2113, 16, 2931, 2184, 1176, 659, 3868, 26, 3404, 2643, # 2758
|
||||
1198, 3869, 3313, 4060, 410, 2211, 302, 590, 361, 1963, 8, 204, 58, 4296, 7311, 1931, # 2774
|
||||
63, 7312, 7313, 317, 1614, 75, 222, 159, 4061, 2412, 1480, 7314, 3500, 3068, 224, 2809, # 2790
|
||||
3616, 3, 10, 3870, 1471, 29, 2774, 1135, 2852, 1939, 873, 130, 3242, 1123, 312, 7315, # 2806
|
||||
4297, 2051, 507, 252, 682, 7316, 142, 1914, 124, 206, 2932, 34, 3501, 3173, 64, 604, # 2822
|
||||
7317, 2494, 1976, 1977, 155, 1990, 645, 641, 1606, 7318, 3405, 337, 72, 406, 7319, 80, # 2838
|
||||
630, 238, 3174, 1509, 263, 939, 1092, 2644, 756, 1440, 1094, 3406, 449, 69, 2969, 591, # 2854
|
||||
179, 2095, 471, 115, 2034, 1843, 60, 50, 2970, 134, 806, 1868, 734, 2035, 3407, 180, # 2870
|
||||
995, 1607, 156, 537, 2893, 688, 7320, 319, 1305, 779, 2144, 514, 2374, 298, 4298, 359, # 2886
|
||||
2495, 90, 2707, 1338, 663, 11, 906, 1099, 2545, 20, 2436, 182, 532, 1716, 7321, 732, # 2902
|
||||
1376, 4062, 1311, 1420, 3175, 25, 2312, 1056, 113, 399, 382, 1949, 242, 3408, 2467, 529, # 2918
|
||||
3243, 475, 1447, 3617, 7322, 117, 21, 656, 810, 1297, 2295, 2329, 3502, 7323, 126, 4063, # 2934
|
||||
706, 456, 150, 613, 4299, 71, 1118, 2036, 4064, 145, 3069, 85, 835, 486, 2114, 1246, # 2950
|
||||
1426, 428, 727, 1285, 1015, 800, 106, 623, 303, 1281, 7324, 2127, 2354, 347, 3736, 221, # 2966
|
||||
3503, 3110, 7325, 1955, 1153, 4065, 83, 296, 1199, 3070, 192, 624, 93, 7326, 822, 1897, # 2982
|
||||
2810, 3111, 795, 2064, 991, 1554, 1542, 1592, 27, 43, 2853, 859, 139, 1456, 860, 4300, # 2998
|
||||
437, 712, 3871, 164, 2392, 3112, 695, 211, 3017, 2096, 195, 3872, 1608, 3504, 3505, 3618, # 3014
|
||||
3873, 234, 811, 2971, 2097, 3874, 2229, 1441, 3506, 1615, 2375, 668, 2076, 1638, 305, 228, # 3030
|
||||
1664, 4301, 467, 415, 7327, 262, 2098, 1593, 239, 108, 300, 200, 1033, 512, 1247, 2077, # 3046
|
||||
7328, 7329, 2173, 3176, 3619, 2673, 593, 845, 1062, 3244, 88, 1723, 2037, 3875, 1950, 212, # 3062
|
||||
266, 152, 149, 468, 1898, 4066, 4302, 77, 187, 7330, 3018, 37, 5, 2972, 7331, 3876, # 3078
|
||||
7332, 7333, 39, 2517, 4303, 2894, 3177, 2078, 55, 148, 74, 4304, 545, 483, 1474, 1029, # 3094
|
||||
1665, 217, 1869, 1531, 3113, 1104, 2645, 4067, 24, 172, 3507, 900, 3877, 3508, 3509, 4305, # 3110
|
||||
32, 1408, 2811, 1312, 329, 487, 2355, 2247, 2708, 784, 2674, 4, 3019, 3314, 1427, 1788, # 3126
|
||||
188, 109, 499, 7334, 3620, 1717, 1789, 888, 1217, 3020, 4306, 7335, 3510, 7336, 3315, 1520, # 3142
|
||||
3621, 3878, 196, 1034, 775, 7337, 7338, 929, 1815, 249, 439, 38, 7339, 1063, 7340, 794, # 3158
|
||||
3879, 1435, 2296, 46, 178, 3245, 2065, 7341, 2376, 7342, 214, 1709, 4307, 804, 35, 707, # 3174
|
||||
324, 3622, 1601, 2546, 140, 459, 4068, 7343, 7344, 1365, 839, 272, 978, 2257, 2572, 3409, # 3190
|
||||
2128, 1363, 3623, 1423, 697, 100, 3071, 48, 70, 1231, 495, 3114, 2193, 7345, 1294, 7346, # 3206
|
||||
2079, 462, 586, 1042, 3246, 853, 256, 988, 185, 2377, 3410, 1698, 434, 1084, 7347, 3411, # 3222
|
||||
314, 2615, 2775, 4308, 2330, 2331, 569, 2280, 637, 1816, 2518, 757, 1162, 1878, 1616, 3412, # 3238
|
||||
287, 1577, 2115, 768, 4309, 1671, 2854, 3511, 2519, 1321, 3737, 909, 2413, 7348, 4069, 933, # 3254
|
||||
3738, 7349, 2052, 2356, 1222, 4310, 765, 2414, 1322, 786, 4311, 7350, 1919, 1462, 1677, 2895, # 3270
|
||||
1699, 7351, 4312, 1424, 2437, 3115, 3624, 2590, 3316, 1774, 1940, 3413, 3880, 4070, 309, 1369, # 3286
|
||||
1130, 2812, 364, 2230, 1653, 1299, 3881, 3512, 3882, 3883, 2646, 525, 1085, 3021, 902, 2000, # 3302
|
||||
1475, 964, 4313, 421, 1844, 1415, 1057, 2281, 940, 1364, 3116, 376, 4314, 4315, 1381, 7, # 3318
|
||||
2520, 983, 2378, 336, 1710, 2675, 1845, 321, 3414, 559, 1131, 3022, 2742, 1808, 1132, 1313, # 3334
|
||||
265, 1481, 1857, 7352, 352, 1203, 2813, 3247, 167, 1089, 420, 2814, 776, 792, 1724, 3513, # 3350
|
||||
4071, 2438, 3248, 7353, 4072, 7354, 446, 229, 333, 2743, 901, 3739, 1200, 1557, 4316, 2647, # 3366
|
||||
1920, 395, 2744, 2676, 3740, 4073, 1835, 125, 916, 3178, 2616, 4317, 7355, 7356, 3741, 7357, # 3382
|
||||
7358, 7359, 4318, 3117, 3625, 1133, 2547, 1757, 3415, 1510, 2313, 1409, 3514, 7360, 2145, 438, # 3398
|
||||
2591, 2896, 2379, 3317, 1068, 958, 3023, 461, 311, 2855, 2677, 4074, 1915, 3179, 4075, 1978, # 3414
|
||||
383, 750, 2745, 2617, 4076, 274, 539, 385, 1278, 1442, 7361, 1154, 1964, 384, 561, 210, # 3430
|
||||
98, 1295, 2548, 3515, 7362, 1711, 2415, 1482, 3416, 3884, 2897, 1257, 129, 7363, 3742, 642, # 3446
|
||||
523, 2776, 2777, 2648, 7364, 141, 2231, 1333, 68, 176, 441, 876, 907, 4077, 603, 2592, # 3462
|
||||
710, 171, 3417, 404, 549, 18, 3118, 2393, 1410, 3626, 1666, 7365, 3516, 4319, 2898, 4320, # 3478
|
||||
7366, 2973, 368, 7367, 146, 366, 99, 871, 3627, 1543, 748, 807, 1586, 1185, 22, 2258, # 3494
|
||||
379, 3743, 3180, 7368, 3181, 505, 1941, 2618, 1991, 1382, 2314, 7369, 380, 2357, 218, 702, # 3510
|
||||
1817, 1248, 3418, 3024, 3517, 3318, 3249, 7370, 2974, 3628, 930, 3250, 3744, 7371, 59, 7372, # 3526
|
||||
585, 601, 4078, 497, 3419, 1112, 1314, 4321, 1801, 7373, 1223, 1472, 2174, 7374, 749, 1836, # 3542
|
||||
690, 1899, 3745, 1772, 3885, 1476, 429, 1043, 1790, 2232, 2116, 917, 4079, 447, 1086, 1629, # 3558
|
||||
7375, 556, 7376, 7377, 2020, 1654, 844, 1090, 105, 550, 966, 1758, 2815, 1008, 1782, 686, # 3574
|
||||
1095, 7378, 2282, 793, 1602, 7379, 3518, 2593, 4322, 4080, 2933, 2297, 4323, 3746, 980, 2496, # 3590
|
||||
544, 353, 527, 4324, 908, 2678, 2899, 7380, 381, 2619, 1942, 1348, 7381, 1341, 1252, 560, # 3606
|
||||
3072, 7382, 3420, 2856, 7383, 2053, 973, 886, 2080, 143, 4325, 7384, 7385, 157, 3886, 496, # 3622
|
||||
4081, 57, 840, 540, 2038, 4326, 4327, 3421, 2117, 1445, 970, 2259, 1748, 1965, 2081, 4082, # 3638
|
||||
3119, 1234, 1775, 3251, 2816, 3629, 773, 1206, 2129, 1066, 2039, 1326, 3887, 1738, 1725, 4083, # 3654
|
||||
279, 3120, 51, 1544, 2594, 423, 1578, 2130, 2066, 173, 4328, 1879, 7386, 7387, 1583, 264, # 3670
|
||||
610, 3630, 4329, 2439, 280, 154, 7388, 7389, 7390, 1739, 338, 1282, 3073, 693, 2857, 1411, # 3686
|
||||
1074, 3747, 2440, 7391, 4330, 7392, 7393, 1240, 952, 2394, 7394, 2900, 1538, 2679, 685, 1483, # 3702
|
||||
4084, 2468, 1436, 953, 4085, 2054, 4331, 671, 2395, 79, 4086, 2441, 3252, 608, 567, 2680, # 3718
|
||||
3422, 4087, 4088, 1691, 393, 1261, 1791, 2396, 7395, 4332, 7396, 7397, 7398, 7399, 1383, 1672, # 3734
|
||||
3748, 3182, 1464, 522, 1119, 661, 1150, 216, 675, 4333, 3888, 1432, 3519, 609, 4334, 2681, # 3750
|
||||
2397, 7400, 7401, 7402, 4089, 3025, 0, 7403, 2469, 315, 231, 2442, 301, 3319, 4335, 2380, # 3766
|
||||
7404, 233, 4090, 3631, 1818, 4336, 4337, 7405, 96, 1776, 1315, 2082, 7406, 257, 7407, 1809, # 3782
|
||||
3632, 2709, 1139, 1819, 4091, 2021, 1124, 2163, 2778, 1777, 2649, 7408, 3074, 363, 1655, 3183, # 3798
|
||||
7409, 2975, 7410, 7411, 7412, 3889, 1567, 3890, 718, 103, 3184, 849, 1443, 341, 3320, 2934, # 3814
|
||||
1484, 7413, 1712, 127, 67, 339, 4092, 2398, 679, 1412, 821, 7414, 7415, 834, 738, 351, # 3830
|
||||
2976, 2146, 846, 235, 1497, 1880, 418, 1992, 3749, 2710, 186, 1100, 2147, 2746, 3520, 1545, # 3846
|
||||
1355, 2935, 2858, 1377, 583, 3891, 4093, 2573, 2977, 7416, 1298, 3633, 1078, 2549, 3634, 2358, # 3862
|
||||
78, 3750, 3751, 267, 1289, 2099, 2001, 1594, 4094, 348, 369, 1274, 2194, 2175, 1837, 4338, # 3878
|
||||
1820, 2817, 3635, 2747, 2283, 2002, 4339, 2936, 2748, 144, 3321, 882, 4340, 3892, 2749, 3423, # 3894
|
||||
4341, 2901, 7417, 4095, 1726, 320, 7418, 3893, 3026, 788, 2978, 7419, 2818, 1773, 1327, 2859, # 3910
|
||||
3894, 2819, 7420, 1306, 4342, 2003, 1700, 3752, 3521, 2359, 2650, 787, 2022, 506, 824, 3636, # 3926
|
||||
534, 323, 4343, 1044, 3322, 2023, 1900, 946, 3424, 7421, 1778, 1500, 1678, 7422, 1881, 4344, # 3942
|
||||
165, 243, 4345, 3637, 2521, 123, 683, 4096, 764, 4346, 36, 3895, 1792, 589, 2902, 816, # 3958
|
||||
626, 1667, 3027, 2233, 1639, 1555, 1622, 3753, 3896, 7423, 3897, 2860, 1370, 1228, 1932, 891, # 3974
|
||||
2083, 2903, 304, 4097, 7424, 292, 2979, 2711, 3522, 691, 2100, 4098, 1115, 4347, 118, 662, # 3990
|
||||
7425, 611, 1156, 854, 2381, 1316, 2861, 2, 386, 515, 2904, 7426, 7427, 3253, 868, 2234, # 4006
|
||||
1486, 855, 2651, 785, 2212, 3028, 7428, 1040, 3185, 3523, 7429, 3121, 448, 7430, 1525, 7431, # 4022
|
||||
2164, 4348, 7432, 3754, 7433, 4099, 2820, 3524, 3122, 503, 818, 3898, 3123, 1568, 814, 676, # 4038
|
||||
1444, 306, 1749, 7434, 3755, 1416, 1030, 197, 1428, 805, 2821, 1501, 4349, 7435, 7436, 7437, # 4054
|
||||
1993, 7438, 4350, 7439, 7440, 2195, 13, 2779, 3638, 2980, 3124, 1229, 1916, 7441, 3756, 2131, # 4070
|
||||
7442, 4100, 4351, 2399, 3525, 7443, 2213, 1511, 1727, 1120, 7444, 7445, 646, 3757, 2443, 307, # 4086
|
||||
7446, 7447, 1595, 3186, 7448, 7449, 7450, 3639, 1113, 1356, 3899, 1465, 2522, 2523, 7451, 519, # 4102
|
||||
7452, 128, 2132, 92, 2284, 1979, 7453, 3900, 1512, 342, 3125, 2196, 7454, 2780, 2214, 1980, # 4118
|
||||
3323, 7455, 290, 1656, 1317, 789, 827, 2360, 7456, 3758, 4352, 562, 581, 3901, 7457, 401, # 4134
|
||||
4353, 2248, 94, 4354, 1399, 2781, 7458, 1463, 2024, 4355, 3187, 1943, 7459, 828, 1105, 4101, # 4150
|
||||
1262, 1394, 7460, 4102, 605, 4356, 7461, 1783, 2862, 7462, 2822, 819, 2101, 578, 2197, 2937, # 4166
|
||||
7463, 1502, 436, 3254, 4103, 3255, 2823, 3902, 2905, 3425, 3426, 7464, 2712, 2315, 7465, 7466, # 4182
|
||||
2332, 2067, 23, 4357, 193, 826, 3759, 2102, 699, 1630, 4104, 3075, 390, 1793, 1064, 3526, # 4198
|
||||
7467, 1579, 3076, 3077, 1400, 7468, 4105, 1838, 1640, 2863, 7469, 4358, 4359, 137, 4106, 598, # 4214
|
||||
3078, 1966, 780, 104, 974, 2938, 7470, 278, 899, 253, 402, 572, 504, 493, 1339, 7471, # 4230
|
||||
3903, 1275, 4360, 2574, 2550, 7472, 3640, 3029, 3079, 2249, 565, 1334, 2713, 863, 41, 7473, # 4246
|
||||
7474, 4361, 7475, 1657, 2333, 19, 463, 2750, 4107, 606, 7476, 2981, 3256, 1087, 2084, 1323, # 4262
|
||||
2652, 2982, 7477, 1631, 1623, 1750, 4108, 2682, 7478, 2864, 791, 2714, 2653, 2334, 232, 2416, # 4278
|
||||
7479, 2983, 1498, 7480, 2654, 2620, 755, 1366, 3641, 3257, 3126, 2025, 1609, 119, 1917, 3427, # 4294
|
||||
862, 1026, 4109, 7481, 3904, 3760, 4362, 3905, 4363, 2260, 1951, 2470, 7482, 1125, 817, 4110, # 4310
|
||||
4111, 3906, 1513, 1766, 2040, 1487, 4112, 3030, 3258, 2824, 3761, 3127, 7483, 7484, 1507, 7485, # 4326
|
||||
2683, 733, 40, 1632, 1106, 2865, 345, 4113, 841, 2524, 230, 4364, 2984, 1846, 3259, 3428, # 4342
|
||||
7486, 1263, 986, 3429, 7487, 735, 879, 254, 1137, 857, 622, 1300, 1180, 1388, 1562, 3907, # 4358
|
||||
3908, 2939, 967, 2751, 2655, 1349, 592, 2133, 1692, 3324, 2985, 1994, 4114, 1679, 3909, 1901, # 4374
|
||||
2185, 7488, 739, 3642, 2715, 1296, 1290, 7489, 4115, 2198, 2199, 1921, 1563, 2595, 2551, 1870, # 4390
|
||||
2752, 2986, 7490, 435, 7491, 343, 1108, 596, 17, 1751, 4365, 2235, 3430, 3643, 7492, 4366, # 4406
|
||||
294, 3527, 2940, 1693, 477, 979, 281, 2041, 3528, 643, 2042, 3644, 2621, 2782, 2261, 1031, # 4422
|
||||
2335, 2134, 2298, 3529, 4367, 367, 1249, 2552, 7493, 3530, 7494, 4368, 1283, 3325, 2004, 240, # 4438
|
||||
1762, 3326, 4369, 4370, 836, 1069, 3128, 474, 7495, 2148, 2525, 268, 3531, 7496, 3188, 1521, # 4454
|
||||
1284, 7497, 1658, 1546, 4116, 7498, 3532, 3533, 7499, 4117, 3327, 2684, 1685, 4118, 961, 1673, # 4470
|
||||
2622, 190, 2005, 2200, 3762, 4371, 4372, 7500, 570, 2497, 3645, 1490, 7501, 4373, 2623, 3260, # 4486
|
||||
1956, 4374, 584, 1514, 396, 1045, 1944, 7502, 4375, 1967, 2444, 7503, 7504, 4376, 3910, 619, # 4502
|
||||
7505, 3129, 3261, 215, 2006, 2783, 2553, 3189, 4377, 3190, 4378, 763, 4119, 3763, 4379, 7506, # 4518
|
||||
7507, 1957, 1767, 2941, 3328, 3646, 1174, 452, 1477, 4380, 3329, 3130, 7508, 2825, 1253, 2382, # 4534
|
||||
2186, 1091, 2285, 4120, 492, 7509, 638, 1169, 1824, 2135, 1752, 3911, 648, 926, 1021, 1324, # 4550
|
||||
4381, 520, 4382, 997, 847, 1007, 892, 4383, 3764, 2262, 1871, 3647, 7510, 2400, 1784, 4384, # 4566
|
||||
1952, 2942, 3080, 3191, 1728, 4121, 2043, 3648, 4385, 2007, 1701, 3131, 1551, 30, 2263, 4122, # 4582
|
||||
7511, 2026, 4386, 3534, 7512, 501, 7513, 4123, 594, 3431, 2165, 1821, 3535, 3432, 3536, 3192, # 4598
|
||||
829, 2826, 4124, 7514, 1680, 3132, 1225, 4125, 7515, 3262, 4387, 4126, 3133, 2336, 7516, 4388, # 4614
|
||||
4127, 7517, 3912, 3913, 7518, 1847, 2383, 2596, 3330, 7519, 4389, 374, 3914, 652, 4128, 4129, # 4630
|
||||
375, 1140, 798, 7520, 7521, 7522, 2361, 4390, 2264, 546, 1659, 138, 3031, 2445, 4391, 7523, # 4646
|
||||
2250, 612, 1848, 910, 796, 3765, 1740, 1371, 825, 3766, 3767, 7524, 2906, 2554, 7525, 692, # 4662
|
||||
444, 3032, 2624, 801, 4392, 4130, 7526, 1491, 244, 1053, 3033, 4131, 4132, 340, 7527, 3915, # 4678
|
||||
1041, 2987, 293, 1168, 87, 1357, 7528, 1539, 959, 7529, 2236, 721, 694, 4133, 3768, 219, # 4694
|
||||
1478, 644, 1417, 3331, 2656, 1413, 1401, 1335, 1389, 3916, 7530, 7531, 2988, 2362, 3134, 1825, # 4710
|
||||
730, 1515, 184, 2827, 66, 4393, 7532, 1660, 2943, 246, 3332, 378, 1457, 226, 3433, 975, # 4726
|
||||
3917, 2944, 1264, 3537, 674, 696, 7533, 163, 7534, 1141, 2417, 2166, 713, 3538, 3333, 4394, # 4742
|
||||
3918, 7535, 7536, 1186, 15, 7537, 1079, 1070, 7538, 1522, 3193, 3539, 276, 1050, 2716, 758, # 4758
|
||||
1126, 653, 2945, 3263, 7539, 2337, 889, 3540, 3919, 3081, 2989, 903, 1250, 4395, 3920, 3434, # 4774
|
||||
3541, 1342, 1681, 1718, 766, 3264, 286, 89, 2946, 3649, 7540, 1713, 7541, 2597, 3334, 2990, # 4790
|
||||
7542, 2947, 2215, 3194, 2866, 7543, 4396, 2498, 2526, 181, 387, 1075, 3921, 731, 2187, 3335, # 4806
|
||||
7544, 3265, 310, 313, 3435, 2299, 770, 4134, 54, 3034, 189, 4397, 3082, 3769, 3922, 7545, # 4822
|
||||
1230, 1617, 1849, 355, 3542, 4135, 4398, 3336, 111, 4136, 3650, 1350, 3135, 3436, 3035, 4137, # 4838
|
||||
2149, 3266, 3543, 7546, 2784, 3923, 3924, 2991, 722, 2008, 7547, 1071, 247, 1207, 2338, 2471, # 4854
|
||||
1378, 4399, 2009, 864, 1437, 1214, 4400, 373, 3770, 1142, 2216, 667, 4401, 442, 2753, 2555, # 4870
|
||||
3771, 3925, 1968, 4138, 3267, 1839, 837, 170, 1107, 934, 1336, 1882, 7548, 7549, 2118, 4139, # 4886
|
||||
2828, 743, 1569, 7550, 4402, 4140, 582, 2384, 1418, 3437, 7551, 1802, 7552, 357, 1395, 1729, # 4902
|
||||
3651, 3268, 2418, 1564, 2237, 7553, 3083, 3772, 1633, 4403, 1114, 2085, 4141, 1532, 7554, 482, # 4918
|
||||
2446, 4404, 7555, 7556, 1492, 833, 1466, 7557, 2717, 3544, 1641, 2829, 7558, 1526, 1272, 3652, # 4934
|
||||
4142, 1686, 1794, 416, 2556, 1902, 1953, 1803, 7559, 3773, 2785, 3774, 1159, 2316, 7560, 2867, # 4950
|
||||
4405, 1610, 1584, 3036, 2419, 2754, 443, 3269, 1163, 3136, 7561, 7562, 3926, 7563, 4143, 2499, # 4966
|
||||
3037, 4406, 3927, 3137, 2103, 1647, 3545, 2010, 1872, 4144, 7564, 4145, 431, 3438, 7565, 250, # 4982
|
||||
97, 81, 4146, 7566, 1648, 1850, 1558, 160, 848, 7567, 866, 740, 1694, 7568, 2201, 2830, # 4998
|
||||
3195, 4147, 4407, 3653, 1687, 950, 2472, 426, 469, 3196, 3654, 3655, 3928, 7569, 7570, 1188, # 5014
|
||||
424, 1995, 861, 3546, 4148, 3775, 2202, 2685, 168, 1235, 3547, 4149, 7571, 2086, 1674, 4408, # 5030
|
||||
3337, 3270, 220, 2557, 1009, 7572, 3776, 670, 2992, 332, 1208, 717, 7573, 7574, 3548, 2447, # 5046
|
||||
3929, 3338, 7575, 513, 7576, 1209, 2868, 3339, 3138, 4409, 1080, 7577, 7578, 7579, 7580, 2527, # 5062
|
||||
3656, 3549, 815, 1587, 3930, 3931, 7581, 3550, 3439, 3777, 1254, 4410, 1328, 3038, 1390, 3932, # 5078
|
||||
1741, 3933, 3778, 3934, 7582, 236, 3779, 2448, 3271, 7583, 7584, 3657, 3780, 1273, 3781, 4411, # 5094
|
||||
7585, 308, 7586, 4412, 245, 4413, 1851, 2473, 1307, 2575, 430, 715, 2136, 2449, 7587, 270, # 5110
|
||||
199, 2869, 3935, 7588, 3551, 2718, 1753, 761, 1754, 725, 1661, 1840, 4414, 3440, 3658, 7589, # 5126
|
||||
7590, 587, 14, 3272, 227, 2598, 326, 480, 2265, 943, 2755, 3552, 291, 650, 1883, 7591, # 5142
|
||||
1702, 1226, 102, 1547, 62, 3441, 904, 4415, 3442, 1164, 4150, 7592, 7593, 1224, 1548, 2756, # 5158
|
||||
391, 498, 1493, 7594, 1386, 1419, 7595, 2055, 1177, 4416, 813, 880, 1081, 2363, 566, 1145, # 5174
|
||||
4417, 2286, 1001, 1035, 2558, 2599, 2238, 394, 1286, 7596, 7597, 2068, 7598, 86, 1494, 1730, # 5190
|
||||
3936, 491, 1588, 745, 897, 2948, 843, 3340, 3937, 2757, 2870, 3273, 1768, 998, 2217, 2069, # 5206
|
||||
397, 1826, 1195, 1969, 3659, 2993, 3341, 284, 7599, 3782, 2500, 2137, 2119, 1903, 7600, 3938, # 5222
|
||||
2150, 3939, 4151, 1036, 3443, 1904, 114, 2559, 4152, 209, 1527, 7601, 7602, 2949, 2831, 2625, # 5238
|
||||
2385, 2719, 3139, 812, 2560, 7603, 3274, 7604, 1559, 737, 1884, 3660, 1210, 885, 28, 2686, # 5254
|
||||
3553, 3783, 7605, 4153, 1004, 1779, 4418, 7606, 346, 1981, 2218, 2687, 4419, 3784, 1742, 797, # 5270
|
||||
1642, 3940, 1933, 1072, 1384, 2151, 896, 3941, 3275, 3661, 3197, 2871, 3554, 7607, 2561, 1958, # 5286
|
||||
4420, 2450, 1785, 7608, 7609, 7610, 3942, 4154, 1005, 1308, 3662, 4155, 2720, 4421, 4422, 1528, # 5302
|
||||
2600, 161, 1178, 4156, 1982, 987, 4423, 1101, 4157, 631, 3943, 1157, 3198, 2420, 1343, 1241, # 5318
|
||||
1016, 2239, 2562, 372, 877, 2339, 2501, 1160, 555, 1934, 911, 3944, 7611, 466, 1170, 169, # 5334
|
||||
1051, 2907, 2688, 3663, 2474, 2994, 1182, 2011, 2563, 1251, 2626, 7612, 992, 2340, 3444, 1540, # 5350
|
||||
2721, 1201, 2070, 2401, 1996, 2475, 7613, 4424, 528, 1922, 2188, 1503, 1873, 1570, 2364, 3342, # 5366
|
||||
3276, 7614, 557, 1073, 7615, 1827, 3445, 2087, 2266, 3140, 3039, 3084, 767, 3085, 2786, 4425, # 5382
|
||||
1006, 4158, 4426, 2341, 1267, 2176, 3664, 3199, 778, 3945, 3200, 2722, 1597, 2657, 7616, 4427, # 5398
|
||||
7617, 3446, 7618, 7619, 7620, 3277, 2689, 1433, 3278, 131, 95, 1504, 3946, 723, 4159, 3141, # 5414
|
||||
1841, 3555, 2758, 2189, 3947, 2027, 2104, 3665, 7621, 2995, 3948, 1218, 7622, 3343, 3201, 3949, # 5430
|
||||
4160, 2576, 248, 1634, 3785, 912, 7623, 2832, 3666, 3040, 3786, 654, 53, 7624, 2996, 7625, # 5446
|
||||
1688, 4428, 777, 3447, 1032, 3950, 1425, 7626, 191, 820, 2120, 2833, 971, 4429, 931, 3202, # 5462
|
||||
135, 664, 783, 3787, 1997, 772, 2908, 1935, 3951, 3788, 4430, 2909, 3203, 282, 2723, 640, # 5478
|
||||
1372, 3448, 1127, 922, 325, 3344, 7627, 7628, 711, 2044, 7629, 7630, 3952, 2219, 2787, 1936, # 5494
|
||||
3953, 3345, 2220, 2251, 3789, 2300, 7631, 4431, 3790, 1258, 3279, 3954, 3204, 2138, 2950, 3955, # 5510
|
||||
3956, 7632, 2221, 258, 3205, 4432, 101, 1227, 7633, 3280, 1755, 7634, 1391, 3281, 7635, 2910, # 5526
|
||||
2056, 893, 7636, 7637, 7638, 1402, 4161, 2342, 7639, 7640, 3206, 3556, 7641, 7642, 878, 1325, # 5542
|
||||
1780, 2788, 4433, 259, 1385, 2577, 744, 1183, 2267, 4434, 7643, 3957, 2502, 7644, 684, 1024, # 5558
|
||||
4162, 7645, 472, 3557, 3449, 1165, 3282, 3958, 3959, 322, 2152, 881, 455, 1695, 1152, 1340, # 5574
|
||||
660, 554, 2153, 4435, 1058, 4436, 4163, 830, 1065, 3346, 3960, 4437, 1923, 7646, 1703, 1918, # 5590
|
||||
7647, 932, 2268, 122, 7648, 4438, 947, 677, 7649, 3791, 2627, 297, 1905, 1924, 2269, 4439, # 5606
|
||||
2317, 3283, 7650, 7651, 4164, 7652, 4165, 84, 4166, 112, 989, 7653, 547, 1059, 3961, 701, # 5622
|
||||
3558, 1019, 7654, 4167, 7655, 3450, 942, 639, 457, 2301, 2451, 993, 2951, 407, 851, 494, # 5638
|
||||
4440, 3347, 927, 7656, 1237, 7657, 2421, 3348, 573, 4168, 680, 921, 2911, 1279, 1874, 285, # 5654
|
||||
790, 1448, 1983, 719, 2167, 7658, 7659, 4441, 3962, 3963, 1649, 7660, 1541, 563, 7661, 1077, # 5670
|
||||
7662, 3349, 3041, 3451, 511, 2997, 3964, 3965, 3667, 3966, 1268, 2564, 3350, 3207, 4442, 4443, # 5686
|
||||
7663, 535, 1048, 1276, 1189, 2912, 2028, 3142, 1438, 1373, 2834, 2952, 1134, 2012, 7664, 4169, # 5702
|
||||
1238, 2578, 3086, 1259, 7665, 700, 7666, 2953, 3143, 3668, 4170, 7667, 4171, 1146, 1875, 1906, # 5718
|
||||
4444, 2601, 3967, 781, 2422, 132, 1589, 203, 147, 273, 2789, 2402, 898, 1786, 2154, 3968, # 5734
|
||||
3969, 7668, 3792, 2790, 7669, 7670, 4445, 4446, 7671, 3208, 7672, 1635, 3793, 965, 7673, 1804, # 5750
|
||||
2690, 1516, 3559, 1121, 1082, 1329, 3284, 3970, 1449, 3794, 65, 1128, 2835, 2913, 2759, 1590, # 5766
|
||||
3795, 7674, 7675, 12, 2658, 45, 976, 2579, 3144, 4447, 517, 2528, 1013, 1037, 3209, 7676, # 5782
|
||||
3796, 2836, 7677, 3797, 7678, 3452, 7679, 2602, 614, 1998, 2318, 3798, 3087, 2724, 2628, 7680, # 5798
|
||||
2580, 4172, 599, 1269, 7681, 1810, 3669, 7682, 2691, 3088, 759, 1060, 489, 1805, 3351, 3285, # 5814
|
||||
1358, 7683, 7684, 2386, 1387, 1215, 2629, 2252, 490, 7685, 7686, 4173, 1759, 2387, 2343, 7687, # 5830
|
||||
4448, 3799, 1907, 3971, 2630, 1806, 3210, 4449, 3453, 3286, 2760, 2344, 874, 7688, 7689, 3454, # 5846
|
||||
3670, 1858, 91, 2914, 3671, 3042, 3800, 4450, 7690, 3145, 3972, 2659, 7691, 3455, 1202, 1403, # 5862
|
||||
3801, 2954, 2529, 1517, 2503, 4451, 3456, 2504, 7692, 4452, 7693, 2692, 1885, 1495, 1731, 3973, # 5878
|
||||
2365, 4453, 7694, 2029, 7695, 7696, 3974, 2693, 1216, 237, 2581, 4174, 2319, 3975, 3802, 4454, # 5894
|
||||
4455, 2694, 3560, 3457, 445, 4456, 7697, 7698, 7699, 7700, 2761, 61, 3976, 3672, 1822, 3977, # 5910
|
||||
7701, 687, 2045, 935, 925, 405, 2660, 703, 1096, 1859, 2725, 4457, 3978, 1876, 1367, 2695, # 5926
|
||||
3352, 918, 2105, 1781, 2476, 334, 3287, 1611, 1093, 4458, 564, 3146, 3458, 3673, 3353, 945, # 5942
|
||||
2631, 2057, 4459, 7702, 1925, 872, 4175, 7703, 3459, 2696, 3089, 349, 4176, 3674, 3979, 4460, # 5958
|
||||
3803, 4177, 3675, 2155, 3980, 4461, 4462, 4178, 4463, 2403, 2046, 782, 3981, 400, 251, 4179, # 5974
|
||||
1624, 7704, 7705, 277, 3676, 299, 1265, 476, 1191, 3804, 2121, 4180, 4181, 1109, 205, 7706, # 5990
|
||||
2582, 1000, 2156, 3561, 1860, 7707, 7708, 7709, 4464, 7710, 4465, 2565, 107, 2477, 2157, 3982, # 6006
|
||||
3460, 3147, 7711, 1533, 541, 1301, 158, 753, 4182, 2872, 3562, 7712, 1696, 370, 1088, 4183, # 6022
|
||||
4466, 3563, 579, 327, 440, 162, 2240, 269, 1937, 1374, 3461, 968, 3043, 56, 1396, 3090, # 6038
|
||||
2106, 3288, 3354, 7713, 1926, 2158, 4467, 2998, 7714, 3564, 7715, 7716, 3677, 4468, 2478, 7717, # 6054
|
||||
2791, 7718, 1650, 4469, 7719, 2603, 7720, 7721, 3983, 2661, 3355, 1149, 3356, 3984, 3805, 3985, # 6070
|
||||
7722, 1076, 49, 7723, 951, 3211, 3289, 3290, 450, 2837, 920, 7724, 1811, 2792, 2366, 4184, # 6086
|
||||
1908, 1138, 2367, 3806, 3462, 7725, 3212, 4470, 1909, 1147, 1518, 2423, 4471, 3807, 7726, 4472, # 6102
|
||||
2388, 2604, 260, 1795, 3213, 7727, 7728, 3808, 3291, 708, 7729, 3565, 1704, 7730, 3566, 1351, # 6118
|
||||
1618, 3357, 2999, 1886, 944, 4185, 3358, 4186, 3044, 3359, 4187, 7731, 3678, 422, 413, 1714, # 6134
|
||||
3292, 500, 2058, 2345, 4188, 2479, 7732, 1344, 1910, 954, 7733, 1668, 7734, 7735, 3986, 2404, # 6150
|
||||
4189, 3567, 3809, 4190, 7736, 2302, 1318, 2505, 3091, 133, 3092, 2873, 4473, 629, 31, 2838, # 6166
|
||||
2697, 3810, 4474, 850, 949, 4475, 3987, 2955, 1732, 2088, 4191, 1496, 1852, 7737, 3988, 620, # 6182
|
||||
3214, 981, 1242, 3679, 3360, 1619, 3680, 1643, 3293, 2139, 2452, 1970, 1719, 3463, 2168, 7738, # 6198
|
||||
3215, 7739, 7740, 3361, 1828, 7741, 1277, 4476, 1565, 2047, 7742, 1636, 3568, 3093, 7743, 869, # 6214
|
||||
2839, 655, 3811, 3812, 3094, 3989, 3000, 3813, 1310, 3569, 4477, 7744, 7745, 7746, 1733, 558, # 6230
|
||||
4478, 3681, 335, 1549, 3045, 1756, 4192, 3682, 1945, 3464, 1829, 1291, 1192, 470, 2726, 2107, # 6246
|
||||
2793, 913, 1054, 3990, 7747, 1027, 7748, 3046, 3991, 4479, 982, 2662, 3362, 3148, 3465, 3216, # 6262
|
||||
3217, 1946, 2794, 7749, 571, 4480, 7750, 1830, 7751, 3570, 2583, 1523, 2424, 7752, 2089, 984, # 6278
|
||||
4481, 3683, 1959, 7753, 3684, 852, 923, 2795, 3466, 3685, 969, 1519, 999, 2048, 2320, 1705, # 6294
|
||||
7754, 3095, 615, 1662, 151, 597, 3992, 2405, 2321, 1049, 275, 4482, 3686, 4193, 568, 3687, # 6310
|
||||
3571, 2480, 4194, 3688, 7755, 2425, 2270, 409, 3218, 7756, 1566, 2874, 3467, 1002, 769, 2840, # 6326
|
||||
194, 2090, 3149, 3689, 2222, 3294, 4195, 628, 1505, 7757, 7758, 1763, 2177, 3001, 3993, 521, # 6342
|
||||
1161, 2584, 1787, 2203, 2406, 4483, 3994, 1625, 4196, 4197, 412, 42, 3096, 464, 7759, 2632, # 6358
|
||||
4484, 3363, 1760, 1571, 2875, 3468, 2530, 1219, 2204, 3814, 2633, 2140, 2368, 4485, 4486, 3295, # 6374
|
||||
1651, 3364, 3572, 7760, 7761, 3573, 2481, 3469, 7762, 3690, 7763, 7764, 2271, 2091, 460, 7765, # 6390
|
||||
4487, 7766, 3002, 962, 588, 3574, 289, 3219, 2634, 1116, 52, 7767, 3047, 1796, 7768, 7769, # 6406
|
||||
7770, 1467, 7771, 1598, 1143, 3691, 4198, 1984, 1734, 1067, 4488, 1280, 3365, 465, 4489, 1572, # 6422
|
||||
510, 7772, 1927, 2241, 1812, 1644, 3575, 7773, 4490, 3692, 7774, 7775, 2663, 1573, 1534, 7776, # 6438
|
||||
7777, 4199, 536, 1807, 1761, 3470, 3815, 3150, 2635, 7778, 7779, 7780, 4491, 3471, 2915, 1911, # 6454
|
||||
2796, 7781, 3296, 1122, 377, 3220, 7782, 360, 7783, 7784, 4200, 1529, 551, 7785, 2059, 3693, # 6470
|
||||
1769, 2426, 7786, 2916, 4201, 3297, 3097, 2322, 2108, 2030, 4492, 1404, 136, 1468, 1479, 672, # 6486
|
||||
1171, 3221, 2303, 271, 3151, 7787, 2762, 7788, 2049, 678, 2727, 865, 1947, 4493, 7789, 2013, # 6502
|
||||
3995, 2956, 7790, 2728, 2223, 1397, 3048, 3694, 4494, 4495, 1735, 2917, 3366, 3576, 7791, 3816, # 6518
|
||||
509, 2841, 2453, 2876, 3817, 7792, 7793, 3152, 3153, 4496, 4202, 2531, 4497, 2304, 1166, 1010, # 6534
|
||||
552, 681, 1887, 7794, 7795, 2957, 2958, 3996, 1287, 1596, 1861, 3154, 358, 453, 736, 175, # 6550
|
||||
478, 1117, 905, 1167, 1097, 7796, 1853, 1530, 7797, 1706, 7798, 2178, 3472, 2287, 3695, 3473, # 6566
|
||||
3577, 4203, 2092, 4204, 7799, 3367, 1193, 2482, 4205, 1458, 2190, 2205, 1862, 1888, 1421, 3298, # 6582
|
||||
2918, 3049, 2179, 3474, 595, 2122, 7800, 3997, 7801, 7802, 4206, 1707, 2636, 223, 3696, 1359, # 6598
|
||||
751, 3098, 183, 3475, 7803, 2797, 3003, 419, 2369, 633, 704, 3818, 2389, 241, 7804, 7805, # 6614
|
||||
7806, 838, 3004, 3697, 2272, 2763, 2454, 3819, 1938, 2050, 3998, 1309, 3099, 2242, 1181, 7807, # 6630
|
||||
1136, 2206, 3820, 2370, 1446, 4207, 2305, 4498, 7808, 7809, 4208, 1055, 2605, 484, 3698, 7810, # 6646
|
||||
3999, 625, 4209, 2273, 3368, 1499, 4210, 4000, 7811, 4001, 4211, 3222, 2274, 2275, 3476, 7812, # 6662
|
||||
7813, 2764, 808, 2606, 3699, 3369, 4002, 4212, 3100, 2532, 526, 3370, 3821, 4213, 955, 7814, # 6678
|
||||
1620, 4214, 2637, 2427, 7815, 1429, 3700, 1669, 1831, 994, 928, 7816, 3578, 1260, 7817, 7818, # 6694
|
||||
7819, 1948, 2288, 741, 2919, 1626, 4215, 2729, 2455, 867, 1184, 362, 3371, 1392, 7820, 7821, # 6710
|
||||
4003, 4216, 1770, 1736, 3223, 2920, 4499, 4500, 1928, 2698, 1459, 1158, 7822, 3050, 3372, 2877, # 6726
|
||||
1292, 1929, 2506, 2842, 3701, 1985, 1187, 2071, 2014, 2607, 4217, 7823, 2566, 2507, 2169, 3702, # 6742
|
||||
2483, 3299, 7824, 3703, 4501, 7825, 7826, 666, 1003, 3005, 1022, 3579, 4218, 7827, 4502, 1813, # 6758
|
||||
2253, 574, 3822, 1603, 295, 1535, 705, 3823, 4219, 283, 858, 417, 7828, 7829, 3224, 4503, # 6774
|
||||
4504, 3051, 1220, 1889, 1046, 2276, 2456, 4004, 1393, 1599, 689, 2567, 388, 4220, 7830, 2484, # 6790
|
||||
802, 7831, 2798, 3824, 2060, 1405, 2254, 7832, 4505, 3825, 2109, 1052, 1345, 3225, 1585, 7833, # 6806
|
||||
809, 7834, 7835, 7836, 575, 2730, 3477, 956, 1552, 1469, 1144, 2323, 7837, 2324, 1560, 2457, # 6822
|
||||
3580, 3226, 4005, 616, 2207, 3155, 2180, 2289, 7838, 1832, 7839, 3478, 4506, 7840, 1319, 3704, # 6838
|
||||
3705, 1211, 3581, 1023, 3227, 1293, 2799, 7841, 7842, 7843, 3826, 607, 2306, 3827, 762, 2878, # 6854
|
||||
1439, 4221, 1360, 7844, 1485, 3052, 7845, 4507, 1038, 4222, 1450, 2061, 2638, 4223, 1379, 4508, # 6870
|
||||
2585, 7846, 7847, 4224, 1352, 1414, 2325, 2921, 1172, 7848, 7849, 3828, 3829, 7850, 1797, 1451, # 6886
|
||||
7851, 7852, 7853, 7854, 2922, 4006, 4007, 2485, 2346, 411, 4008, 4009, 3582, 3300, 3101, 4509, # 6902
|
||||
1561, 2664, 1452, 4010, 1375, 7855, 7856, 47, 2959, 316, 7857, 1406, 1591, 2923, 3156, 7858, # 6918
|
||||
1025, 2141, 3102, 3157, 354, 2731, 884, 2224, 4225, 2407, 508, 3706, 726, 3583, 996, 2428, # 6934
|
||||
3584, 729, 7859, 392, 2191, 1453, 4011, 4510, 3707, 7860, 7861, 2458, 3585, 2608, 1675, 2800, # 6950
|
||||
919, 2347, 2960, 2348, 1270, 4511, 4012, 73, 7862, 7863, 647, 7864, 3228, 2843, 2255, 1550, # 6966
|
||||
1346, 3006, 7865, 1332, 883, 3479, 7866, 7867, 7868, 7869, 3301, 2765, 7870, 1212, 831, 1347, # 6982
|
||||
4226, 4512, 2326, 3830, 1863, 3053, 720, 3831, 4513, 4514, 3832, 7871, 4227, 7872, 7873, 4515, # 6998
|
||||
7874, 7875, 1798, 4516, 3708, 2609, 4517, 3586, 1645, 2371, 7876, 7877, 2924, 669, 2208, 2665, # 7014
|
||||
2429, 7878, 2879, 7879, 7880, 1028, 3229, 7881, 4228, 2408, 7882, 2256, 1353, 7883, 7884, 4518, # 7030
|
||||
3158, 518, 7885, 4013, 7886, 4229, 1960, 7887, 2142, 4230, 7888, 7889, 3007, 2349, 2350, 3833, # 7046
|
||||
516, 1833, 1454, 4014, 2699, 4231, 4519, 2225, 2610, 1971, 1129, 3587, 7890, 2766, 7891, 2961, # 7062
|
||||
1422, 577, 1470, 3008, 1524, 3373, 7892, 7893, 432, 4232, 3054, 3480, 7894, 2586, 1455, 2508, # 7078
|
||||
2226, 1972, 1175, 7895, 1020, 2732, 4015, 3481, 4520, 7896, 2733, 7897, 1743, 1361, 3055, 3482, # 7094
|
||||
2639, 4016, 4233, 4521, 2290, 895, 924, 4234, 2170, 331, 2243, 3056, 166, 1627, 3057, 1098, # 7110
|
||||
7898, 1232, 2880, 2227, 3374, 4522, 657, 403, 1196, 2372, 542, 3709, 3375, 1600, 4235, 3483, # 7126
|
||||
7899, 4523, 2767, 3230, 576, 530, 1362, 7900, 4524, 2533, 2666, 3710, 4017, 7901, 842, 3834, # 7142
|
||||
7902, 2801, 2031, 1014, 4018, 213, 2700, 3376, 665, 621, 4236, 7903, 3711, 2925, 2430, 7904, # 7158
|
||||
2431, 3302, 3588, 3377, 7905, 4237, 2534, 4238, 4525, 3589, 1682, 4239, 3484, 1380, 7906, 724, # 7174
|
||||
2277, 600, 1670, 7907, 1337, 1233, 4526, 3103, 2244, 7908, 1621, 4527, 7909, 651, 4240, 7910, # 7190
|
||||
1612, 4241, 2611, 7911, 2844, 7912, 2734, 2307, 3058, 7913, 716, 2459, 3059, 174, 1255, 2701, # 7206
|
||||
4019, 3590, 548, 1320, 1398, 728, 4020, 1574, 7914, 1890, 1197, 3060, 4021, 7915, 3061, 3062, # 7222
|
||||
3712, 3591, 3713, 747, 7916, 635, 4242, 4528, 7917, 7918, 7919, 4243, 7920, 7921, 4529, 7922, # 7238
|
||||
3378, 4530, 2432, 451, 7923, 3714, 2535, 2072, 4244, 2735, 4245, 4022, 7924, 1764, 4531, 7925, # 7254
|
||||
4246, 350, 7926, 2278, 2390, 2486, 7927, 4247, 4023, 2245, 1434, 4024, 488, 4532, 458, 4248, # 7270
|
||||
4025, 3715, 771, 1330, 2391, 3835, 2568, 3159, 2159, 2409, 1553, 2667, 3160, 4249, 7928, 2487, # 7286
|
||||
2881, 2612, 1720, 2702, 4250, 3379, 4533, 7929, 2536, 4251, 7930, 3231, 4252, 2768, 7931, 2015, # 7302
|
||||
2736, 7932, 1155, 1017, 3716, 3836, 7933, 3303, 2308, 201, 1864, 4253, 1430, 7934, 4026, 7935, # 7318
|
||||
7936, 7937, 7938, 7939, 4254, 1604, 7940, 414, 1865, 371, 2587, 4534, 4535, 3485, 2016, 3104, # 7334
|
||||
4536, 1708, 960, 4255, 887, 389, 2171, 1536, 1663, 1721, 7941, 2228, 4027, 2351, 2926, 1580, # 7350
|
||||
7942, 7943, 7944, 1744, 7945, 2537, 4537, 4538, 7946, 4539, 7947, 2073, 7948, 7949, 3592, 3380, # 7366
|
||||
2882, 4256, 7950, 4257, 2640, 3381, 2802, 673, 2703, 2460, 709, 3486, 4028, 3593, 4258, 7951, # 7382
|
||||
1148, 502, 634, 7952, 7953, 1204, 4540, 3594, 1575, 4541, 2613, 3717, 7954, 3718, 3105, 948, # 7398
|
||||
3232, 121, 1745, 3837, 1110, 7955, 4259, 3063, 2509, 3009, 4029, 3719, 1151, 1771, 3838, 1488, # 7414
|
||||
4030, 1986, 7956, 2433, 3487, 7957, 7958, 2093, 7959, 4260, 3839, 1213, 1407, 2803, 531, 2737, # 7430
|
||||
2538, 3233, 1011, 1537, 7960, 2769, 4261, 3106, 1061, 7961, 3720, 3721, 1866, 2883, 7962, 2017, # 7446
|
||||
120, 4262, 4263, 2062, 3595, 3234, 2309, 3840, 2668, 3382, 1954, 4542, 7963, 7964, 3488, 1047, # 7462
|
||||
2704, 1266, 7965, 1368, 4543, 2845, 649, 3383, 3841, 2539, 2738, 1102, 2846, 2669, 7966, 7967, # 7478
|
||||
1999, 7968, 1111, 3596, 2962, 7969, 2488, 3842, 3597, 2804, 1854, 3384, 3722, 7970, 7971, 3385, # 7494
|
||||
2410, 2884, 3304, 3235, 3598, 7972, 2569, 7973, 3599, 2805, 4031, 1460, 856, 7974, 3600, 7975, # 7510
|
||||
2885, 2963, 7976, 2886, 3843, 7977, 4264, 632, 2510, 875, 3844, 1697, 3845, 2291, 7978, 7979, # 7526
|
||||
4544, 3010, 1239, 580, 4545, 4265, 7980, 914, 936, 2074, 1190, 4032, 1039, 2123, 7981, 7982, # 7542
|
||||
7983, 3386, 1473, 7984, 1354, 4266, 3846, 7985, 2172, 3064, 4033, 915, 3305, 4267, 4268, 3306, # 7558
|
||||
1605, 1834, 7986, 2739, 398, 3601, 4269, 3847, 4034, 328, 1912, 2847, 4035, 3848, 1331, 4270, # 7574
|
||||
3011, 937, 4271, 7987, 3602, 4036, 4037, 3387, 2160, 4546, 3388, 524, 742, 538, 3065, 1012, # 7590
|
||||
7988, 7989, 3849, 2461, 7990, 658, 1103, 225, 3850, 7991, 7992, 4547, 7993, 4548, 7994, 3236, # 7606
|
||||
1243, 7995, 4038, 963, 2246, 4549, 7996, 2705, 3603, 3161, 7997, 7998, 2588, 2327, 7999, 4550, # 7622
|
||||
8000, 8001, 8002, 3489, 3307, 957, 3389, 2540, 2032, 1930, 2927, 2462, 870, 2018, 3604, 1746, # 7638
|
||||
2770, 2771, 2434, 2463, 8003, 3851, 8004, 3723, 3107, 3724, 3490, 3390, 3725, 8005, 1179, 3066, # 7654
|
||||
8006, 3162, 2373, 4272, 3726, 2541, 3163, 3108, 2740, 4039, 8007, 3391, 1556, 2542, 2292, 977, # 7670
|
||||
2887, 2033, 4040, 1205, 3392, 8008, 1765, 3393, 3164, 2124, 1271, 1689, 714, 4551, 3491, 8009, # 7686
|
||||
2328, 3852, 533, 4273, 3605, 2181, 617, 8010, 2464, 3308, 3492, 2310, 8011, 8012, 3165, 8013, # 7702
|
||||
8014, 3853, 1987, 618, 427, 2641, 3493, 3394, 8015, 8016, 1244, 1690, 8017, 2806, 4274, 4552, # 7718
|
||||
8018, 3494, 8019, 8020, 2279, 1576, 473, 3606, 4275, 3395, 972, 8021, 3607, 8022, 3067, 8023, # 7734
|
||||
8024, 4553, 4554, 8025, 3727, 4041, 4042, 8026, 153, 4555, 356, 8027, 1891, 2888, 4276, 2143, # 7750
|
||||
408, 803, 2352, 8028, 3854, 8029, 4277, 1646, 2570, 2511, 4556, 4557, 3855, 8030, 3856, 4278, # 7766
|
||||
8031, 2411, 3396, 752, 8032, 8033, 1961, 2964, 8034, 746, 3012, 2465, 8035, 4279, 3728, 698, # 7782
|
||||
4558, 1892, 4280, 3608, 2543, 4559, 3609, 3857, 8036, 3166, 3397, 8037, 1823, 1302, 4043, 2706, # 7798
|
||||
3858, 1973, 4281, 8038, 4282, 3167, 823, 1303, 1288, 1236, 2848, 3495, 4044, 3398, 774, 3859, # 7814
|
||||
8039, 1581, 4560, 1304, 2849, 3860, 4561, 8040, 2435, 2161, 1083, 3237, 4283, 4045, 4284, 344, # 7830
|
||||
1173, 288, 2311, 454, 1683, 8041, 8042, 1461, 4562, 4046, 2589, 8043, 8044, 4563, 985, 894, # 7846
|
||||
8045, 3399, 3168, 8046, 1913, 2928, 3729, 1988, 8047, 2110, 1974, 8048, 4047, 8049, 2571, 1194, # 7862
|
||||
425, 8050, 4564, 3169, 1245, 3730, 4285, 8051, 8052, 2850, 8053, 636, 4565, 1855, 3861, 760, # 7878
|
||||
1799, 8054, 4286, 2209, 1508, 4566, 4048, 1893, 1684, 2293, 8055, 8056, 8057, 4287, 4288, 2210, # 7894
|
||||
479, 8058, 8059, 832, 8060, 4049, 2489, 8061, 2965, 2490, 3731, 990, 3109, 627, 1814, 2642, # 7910
|
||||
4289, 1582, 4290, 2125, 2111, 3496, 4567, 8062, 799, 4291, 3170, 8063, 4568, 2112, 1737, 3013, # 7926
|
||||
1018, 543, 754, 4292, 3309, 1676, 4569, 4570, 4050, 8064, 1489, 8065, 3497, 8066, 2614, 2889, # 7942
|
||||
4051, 8067, 8068, 2966, 8069, 8070, 8071, 8072, 3171, 4571, 4572, 2182, 1722, 8073, 3238, 3239, # 7958
|
||||
1842, 3610, 1715, 481, 365, 1975, 1856, 8074, 8075, 1962, 2491, 4573, 8076, 2126, 3611, 3240, # 7974
|
||||
433, 1894, 2063, 2075, 8077, 602, 2741, 8078, 8079, 8080, 8081, 8082, 3014, 1628, 3400, 8083, # 7990
|
||||
3172, 4574, 4052, 2890, 4575, 2512, 8084, 2544, 2772, 8085, 8086, 8087, 3310, 4576, 2891, 8088, # 8006
|
||||
4577, 8089, 2851, 4578, 4579, 1221, 2967, 4053, 2513, 8090, 8091, 8092, 1867, 1989, 8093, 8094, # 8022
|
||||
8095, 1895, 8096, 8097, 4580, 1896, 4054, 318, 8098, 2094, 4055, 4293, 8099, 8100, 485, 8101, # 8038
|
||||
938, 3862, 553, 2670, 116, 8102, 3863, 3612, 8103, 3498, 2671, 2773, 3401, 3311, 2807, 8104, # 8054
|
||||
3613, 2929, 4056, 1747, 2930, 2968, 8105, 8106, 207, 8107, 8108, 2672, 4581, 2514, 8109, 3015, # 8070
|
||||
890, 3614, 3864, 8110, 1877, 3732, 3402, 8111, 2183, 2353, 3403, 1652, 8112, 8113, 8114, 941, # 8086
|
||||
2294, 208, 3499, 4057, 2019, 330, 4294, 3865, 2892, 2492, 3733, 4295, 8115, 8116, 8117, 8118, # 8102
|
||||
)
|
||||
|
||||
# fmt: on
|
||||
|
||||
@@ -25,22 +25,23 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import EUCTWDistributionAnalysis
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .mbcssm import EUCTW_SM_MODEL
|
||||
|
||||
|
||||
class EUCTWProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
super(EUCTWProber, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
|
||||
self.distribution_analyzer = EUCTWDistributionAnalysis()
|
||||
self.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "EUC-TW"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return "Taiwan"
|
||||
|
||||
@@ -43,6 +43,7 @@ GB2312_TYPICAL_DISTRIBUTION_RATIO = 0.9
|
||||
|
||||
GB2312_TABLE_SIZE = 3760
|
||||
|
||||
# fmt: off
|
||||
GB2312_CHAR_TO_FREQ_ORDER = (
|
||||
1671, 749,1443,2364,3924,3807,2330,3921,1704,3463,2691,1511,1515, 572,3191,2205,
|
||||
2361, 224,2558, 479,1711, 963,3162, 440,4060,1905,2966,2947,3580,2647,3961,3842,
|
||||
@@ -280,4 +281,4 @@ GB2312_CHAR_TO_FREQ_ORDER = (
|
||||
381,1638,4592,1020, 516,3214, 458, 947,4575,1432, 211,1514,2926,1865,2142, 189,
|
||||
852,1221,1400,1486, 882,2299,4036, 351, 28,1122, 700,6479,6480,6481,6482,6483, #last 512
|
||||
)
|
||||
|
||||
# fmt: on
|
||||
|
||||
@@ -25,22 +25,23 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .chardistribution import GB2312DistributionAnalysis
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .mbcssm import GB2312_SM_MODEL
|
||||
|
||||
|
||||
class GB2312Prober(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
super(GB2312Prober, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
|
||||
self.distribution_analyzer = GB2312DistributionAnalysis()
|
||||
self.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "GB2312"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return "Chinese"
|
||||
|
||||
@@ -25,8 +25,11 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import ProbingState
|
||||
from .sbcharsetprober import SingleByteCharSetProber
|
||||
|
||||
# This prober doesn't actually recognize a language or a charset.
|
||||
# It is a helper prober for the use of the Hebrew model probers
|
||||
@@ -125,18 +128,20 @@ from .enums import ProbingState
|
||||
# model probers scores. The answer is returned in the form of the name of the
|
||||
# charset identified, either "windows-1255" or "ISO-8859-8".
|
||||
|
||||
|
||||
class HebrewProber(CharSetProber):
|
||||
SPACE = 0x20
|
||||
# windows-1255 / ISO-8859-8 code points of interest
|
||||
FINAL_KAF = 0xea
|
||||
NORMAL_KAF = 0xeb
|
||||
FINAL_MEM = 0xed
|
||||
NORMAL_MEM = 0xee
|
||||
FINAL_NUN = 0xef
|
||||
NORMAL_NUN = 0xf0
|
||||
FINAL_PE = 0xf3
|
||||
NORMAL_PE = 0xf4
|
||||
FINAL_TSADI = 0xf5
|
||||
NORMAL_TSADI = 0xf6
|
||||
FINAL_KAF = 0xEA
|
||||
NORMAL_KAF = 0xEB
|
||||
FINAL_MEM = 0xED
|
||||
NORMAL_MEM = 0xEE
|
||||
FINAL_NUN = 0xEF
|
||||
NORMAL_NUN = 0xF0
|
||||
FINAL_PE = 0xF3
|
||||
NORMAL_PE = 0xF4
|
||||
FINAL_TSADI = 0xF5
|
||||
NORMAL_TSADI = 0xF6
|
||||
|
||||
# Minimum Visual vs Logical final letter score difference.
|
||||
# If the difference is below this, don't rely solely on the final letter score
|
||||
@@ -151,35 +156,44 @@ class HebrewProber(CharSetProber):
|
||||
VISUAL_HEBREW_NAME = "ISO-8859-8"
|
||||
LOGICAL_HEBREW_NAME = "windows-1255"
|
||||
|
||||
def __init__(self):
|
||||
super(HebrewProber, self).__init__()
|
||||
self._final_char_logical_score = None
|
||||
self._final_char_visual_score = None
|
||||
self._prev = None
|
||||
self._before_prev = None
|
||||
self._logical_prober = None
|
||||
self._visual_prober = None
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._final_char_logical_score = 0
|
||||
self._final_char_visual_score = 0
|
||||
self._prev = self.SPACE
|
||||
self._before_prev = self.SPACE
|
||||
self._logical_prober: Optional[SingleByteCharSetProber] = None
|
||||
self._visual_prober: Optional[SingleByteCharSetProber] = None
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
self._final_char_logical_score = 0
|
||||
self._final_char_visual_score = 0
|
||||
# The two last characters seen in the previous buffer,
|
||||
# mPrev and mBeforePrev are initialized to space in order to simulate
|
||||
# a word delimiter at the beginning of the data
|
||||
self._prev = ' '
|
||||
self._before_prev = ' '
|
||||
self._prev = self.SPACE
|
||||
self._before_prev = self.SPACE
|
||||
# These probers are owned by the group prober.
|
||||
|
||||
def set_model_probers(self, logicalProber, visualProber):
|
||||
self._logical_prober = logicalProber
|
||||
self._visual_prober = visualProber
|
||||
def set_model_probers(
|
||||
self,
|
||||
logical_prober: SingleByteCharSetProber,
|
||||
visual_prober: SingleByteCharSetProber,
|
||||
) -> None:
|
||||
self._logical_prober = logical_prober
|
||||
self._visual_prober = visual_prober
|
||||
|
||||
def is_final(self, c):
|
||||
return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
|
||||
self.FINAL_PE, self.FINAL_TSADI]
|
||||
def is_final(self, c: int) -> bool:
|
||||
return c in [
|
||||
self.FINAL_KAF,
|
||||
self.FINAL_MEM,
|
||||
self.FINAL_NUN,
|
||||
self.FINAL_PE,
|
||||
self.FINAL_TSADI,
|
||||
]
|
||||
|
||||
def is_non_final(self, c):
|
||||
def is_non_final(self, c: int) -> bool:
|
||||
# The normal Tsadi is not a good Non-Final letter due to words like
|
||||
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
|
||||
# apostrophe is converted to a space in FilterWithoutEnglishLetters
|
||||
@@ -190,10 +204,9 @@ class HebrewProber(CharSetProber):
|
||||
# for example legally end with a Non-Final Pe or Kaf. However, the
|
||||
# benefit of these letters as Non-Final letters outweighs the damage
|
||||
# since these words are quite rare.
|
||||
return c in [self.NORMAL_KAF, self.NORMAL_MEM,
|
||||
self.NORMAL_NUN, self.NORMAL_PE]
|
||||
return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE]
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
# Final letter analysis for logical-visual decision.
|
||||
# Look for evidence that the received buffer is either logical Hebrew
|
||||
# or visual Hebrew.
|
||||
@@ -227,9 +240,9 @@ class HebrewProber(CharSetProber):
|
||||
byte_str = self.filter_high_byte_only(byte_str)
|
||||
|
||||
for cur in byte_str:
|
||||
if cur == ' ':
|
||||
if cur == self.SPACE:
|
||||
# We stand on a space - a word just ended
|
||||
if self._before_prev != ' ':
|
||||
if self._before_prev != self.SPACE:
|
||||
# next-to-last char was not a space so self._prev is not a
|
||||
# 1 letter word
|
||||
if self.is_final(self._prev):
|
||||
@@ -241,8 +254,11 @@ class HebrewProber(CharSetProber):
|
||||
self._final_char_visual_score += 1
|
||||
else:
|
||||
# Not standing on a space
|
||||
if ((self._before_prev == ' ') and
|
||||
(self.is_final(self._prev)) and (cur != ' ')):
|
||||
if (
|
||||
(self._before_prev == self.SPACE)
|
||||
and (self.is_final(self._prev))
|
||||
and (cur != self.SPACE)
|
||||
):
|
||||
# case (3) [-2:space][-1:final letter][cur:not space]
|
||||
self._final_char_visual_score += 1
|
||||
self._before_prev = self._prev
|
||||
@@ -253,7 +269,10 @@ class HebrewProber(CharSetProber):
|
||||
return ProbingState.DETECTING
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
assert self._logical_prober is not None
|
||||
assert self._visual_prober is not None
|
||||
|
||||
# Make the decision: is it Logical or Visual?
|
||||
# If the final letter score distance is dominant enough, rely on it.
|
||||
finalsub = self._final_char_logical_score - self._final_char_visual_score
|
||||
@@ -263,8 +282,9 @@ class HebrewProber(CharSetProber):
|
||||
return self.VISUAL_HEBREW_NAME
|
||||
|
||||
# It's not dominant enough, try to rely on the model scores instead.
|
||||
modelsub = (self._logical_prober.get_confidence()
|
||||
- self._visual_prober.get_confidence())
|
||||
modelsub = (
|
||||
self._logical_prober.get_confidence() - self._visual_prober.get_confidence()
|
||||
)
|
||||
if modelsub > self.MIN_MODEL_DISTANCE:
|
||||
return self.LOGICAL_HEBREW_NAME
|
||||
if modelsub < -self.MIN_MODEL_DISTANCE:
|
||||
@@ -280,13 +300,17 @@ class HebrewProber(CharSetProber):
|
||||
return self.LOGICAL_HEBREW_NAME
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
return 'Hebrew'
|
||||
def language(self) -> str:
|
||||
return "Hebrew"
|
||||
|
||||
@property
|
||||
def state(self):
|
||||
def state(self) -> ProbingState:
|
||||
assert self._logical_prober is not None
|
||||
assert self._visual_prober is not None
|
||||
|
||||
# Remain active as long as any of the model probers are active.
|
||||
if (self._logical_prober.state == ProbingState.NOT_ME) and \
|
||||
(self._visual_prober.state == ProbingState.NOT_ME):
|
||||
if (self._logical_prober.state == ProbingState.NOT_ME) and (
|
||||
self._visual_prober.state == ProbingState.NOT_ME
|
||||
):
|
||||
return ProbingState.NOT_ME
|
||||
return ProbingState.DETECTING
|
||||
|
||||
@@ -46,6 +46,7 @@ JIS_TYPICAL_DISTRIBUTION_RATIO = 3.0
|
||||
# Char to FreqOrder table ,
|
||||
JIS_TABLE_SIZE = 4368
|
||||
|
||||
# fmt: off
|
||||
JIS_CHAR_TO_FREQ_ORDER = (
|
||||
40, 1, 6, 182, 152, 180, 295,2127, 285, 381,3295,4304,3068,4606,3165,3510, # 16
|
||||
3511,1822,2785,4607,1193,2226,5070,4608, 171,2996,1247, 18, 179,5071, 856,1661, # 32
|
||||
@@ -321,5 +322,4 @@ JIS_CHAR_TO_FREQ_ORDER = (
|
||||
1444,1698,2385,2251,3729,1365,2281,2235,1717,6188, 864,3841,2515, 444, 527,2767, # 4352
|
||||
2922,3625, 544, 461,6189, 566, 209,2437,3398,2098,1065,2068,3331,3626,3257,2137, # 4368 #last 512
|
||||
)
|
||||
|
||||
|
||||
# fmt: on
|
||||
|
||||
@@ -25,110 +25,114 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import List, Tuple, Union
|
||||
|
||||
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
||||
jp2CharContext = (
|
||||
(0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1),
|
||||
(2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4),
|
||||
(0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2),
|
||||
(0,4,0,5,0,5,0,4,0,4,5,4,4,3,5,3,5,1,5,3,4,3,4,4,3,4,3,3,4,3,5,4,4,3,5,5,3,5,5,5,3,5,5,3,4,5,5,3,1,3,2,0,3,4,0,4,2,0,4,2,1,5,3,2,3,5,0,4,0,2,0,5,4,4,5,4,5,0,4,0,0,4,4),
|
||||
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
|
||||
(0,3,0,4,0,3,0,3,0,4,5,4,3,3,3,3,4,3,5,4,4,3,5,4,4,3,4,3,4,4,4,4,5,3,4,4,3,4,5,5,4,5,5,1,4,5,4,3,0,3,3,1,3,3,0,4,4,0,3,3,1,5,3,3,3,5,0,4,0,3,0,4,4,3,4,3,3,0,4,1,1,3,4),
|
||||
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
|
||||
(0,4,0,3,0,3,0,4,0,3,4,4,3,2,2,1,2,1,3,1,3,3,3,3,3,4,3,1,3,3,5,3,3,0,4,3,0,5,4,3,3,5,4,4,3,4,4,5,0,1,2,0,1,2,0,2,2,0,1,0,0,5,2,2,1,4,0,3,0,1,0,4,4,3,5,4,3,0,2,1,0,4,3),
|
||||
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
|
||||
(0,3,0,5,0,4,0,2,1,4,4,2,4,1,4,2,4,2,4,3,3,3,4,3,3,3,3,1,4,2,3,3,3,1,4,4,1,1,1,4,3,3,2,0,2,4,3,2,0,3,3,0,3,1,1,0,0,0,3,3,0,4,2,2,3,4,0,4,0,3,0,4,4,5,3,4,4,0,3,0,0,1,4),
|
||||
(1,4,0,4,0,4,0,4,0,3,5,4,4,3,4,3,5,4,3,3,4,3,5,4,4,4,4,3,4,2,4,3,3,1,5,4,3,2,4,5,4,5,5,4,4,5,4,4,0,3,2,2,3,3,0,4,3,1,3,2,1,4,3,3,4,5,0,3,0,2,0,4,5,5,4,5,4,0,4,0,0,5,4),
|
||||
(0,5,0,5,0,4,0,3,0,4,4,3,4,3,3,3,4,0,4,4,4,3,4,3,4,3,3,1,4,2,4,3,4,0,5,4,1,4,5,4,4,5,3,2,4,3,4,3,2,4,1,3,3,3,2,3,2,0,4,3,3,4,3,3,3,4,0,4,0,3,0,4,5,4,4,4,3,0,4,1,0,1,3),
|
||||
(0,3,1,4,0,3,0,2,0,3,4,4,3,1,4,2,3,3,4,3,4,3,4,3,4,4,3,2,3,1,5,4,4,1,4,4,3,5,4,4,3,5,5,4,3,4,4,3,1,2,3,1,2,2,0,3,2,0,3,1,0,5,3,3,3,4,3,3,3,3,4,4,4,4,5,4,2,0,3,3,2,4,3),
|
||||
(0,2,0,3,0,1,0,1,0,0,3,2,0,0,2,0,1,0,2,1,3,3,3,1,2,3,1,0,1,0,4,2,1,1,3,3,0,4,3,3,1,4,3,3,0,3,3,2,0,0,0,0,1,0,0,2,0,0,0,0,0,4,1,0,2,3,2,2,2,1,3,3,3,4,4,3,2,0,3,1,0,3,3),
|
||||
(0,4,0,4,0,3,0,3,0,4,4,4,3,3,3,3,3,3,4,3,4,2,4,3,4,3,3,2,4,3,4,5,4,1,4,5,3,5,4,5,3,5,4,0,3,5,5,3,1,3,3,2,2,3,0,3,4,1,3,3,2,4,3,3,3,4,0,4,0,3,0,4,5,4,4,5,3,0,4,1,0,3,4),
|
||||
(0,2,0,3,0,3,0,0,0,2,2,2,1,0,1,0,0,0,3,0,3,0,3,0,1,3,1,0,3,1,3,3,3,1,3,3,3,0,1,3,1,3,4,0,0,3,1,1,0,3,2,0,0,0,0,1,3,0,1,0,0,3,3,2,0,3,0,0,0,0,0,3,4,3,4,3,3,0,3,0,0,2,3),
|
||||
(2,3,0,3,0,2,0,1,0,3,3,4,3,1,3,1,1,1,3,1,4,3,4,3,3,3,0,0,3,1,5,4,3,1,4,3,2,5,5,4,4,4,4,3,3,4,4,4,0,2,1,1,3,2,0,1,2,0,0,1,0,4,1,3,3,3,0,3,0,1,0,4,4,4,5,5,3,0,2,0,0,4,4),
|
||||
(0,2,0,1,0,3,1,3,0,2,3,3,3,0,3,1,0,0,3,0,3,2,3,1,3,2,1,1,0,0,4,2,1,0,2,3,1,4,3,2,0,4,4,3,1,3,1,3,0,1,0,0,1,0,0,0,1,0,0,0,0,4,1,1,1,2,0,3,0,0,0,3,4,2,4,3,2,0,1,0,0,3,3),
|
||||
(0,1,0,4,0,5,0,4,0,2,4,4,2,3,3,2,3,3,5,3,3,3,4,3,4,2,3,0,4,3,3,3,4,1,4,3,2,1,5,5,3,4,5,1,3,5,4,2,0,3,3,0,1,3,0,4,2,0,1,3,1,4,3,3,3,3,0,3,0,1,0,3,4,4,4,5,5,0,3,0,1,4,5),
|
||||
(0,2,0,3,0,3,0,0,0,2,3,1,3,0,4,0,1,1,3,0,3,4,3,2,3,1,0,3,3,2,3,1,3,0,2,3,0,2,1,4,1,2,2,0,0,3,3,0,0,2,0,0,0,1,0,0,0,0,2,2,0,3,2,1,3,3,0,2,0,2,0,0,3,3,1,2,4,0,3,0,2,2,3),
|
||||
(2,4,0,5,0,4,0,4,0,2,4,4,4,3,4,3,3,3,1,2,4,3,4,3,4,4,5,0,3,3,3,3,2,0,4,3,1,4,3,4,1,4,4,3,3,4,4,3,1,2,3,0,4,2,0,4,1,0,3,3,0,4,3,3,3,4,0,4,0,2,0,3,5,3,4,5,2,0,3,0,0,4,5),
|
||||
(0,3,0,4,0,1,0,1,0,1,3,2,2,1,3,0,3,0,2,0,2,0,3,0,2,0,0,0,1,0,1,1,0,0,3,1,0,0,0,4,0,3,1,0,2,1,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,2,2,3,1,0,3,0,0,0,1,4,4,4,3,0,0,4,0,0,1,4),
|
||||
(1,4,1,5,0,3,0,3,0,4,5,4,4,3,5,3,3,4,4,3,4,1,3,3,3,3,2,1,4,1,5,4,3,1,4,4,3,5,4,4,3,5,4,3,3,4,4,4,0,3,3,1,2,3,0,3,1,0,3,3,0,5,4,4,4,4,4,4,3,3,5,4,4,3,3,5,4,0,3,2,0,4,4),
|
||||
(0,2,0,3,0,1,0,0,0,1,3,3,3,2,4,1,3,0,3,1,3,0,2,2,1,1,0,0,2,0,4,3,1,0,4,3,0,4,4,4,1,4,3,1,1,3,3,1,0,2,0,0,1,3,0,0,0,0,2,0,0,4,3,2,4,3,5,4,3,3,3,4,3,3,4,3,3,0,2,1,0,3,3),
|
||||
(0,2,0,4,0,3,0,2,0,2,5,5,3,4,4,4,4,1,4,3,3,0,4,3,4,3,1,3,3,2,4,3,0,3,4,3,0,3,4,4,2,4,4,0,4,5,3,3,2,2,1,1,1,2,0,1,5,0,3,3,2,4,3,3,3,4,0,3,0,2,0,4,4,3,5,5,0,0,3,0,2,3,3),
|
||||
(0,3,0,4,0,3,0,1,0,3,4,3,3,1,3,3,3,0,3,1,3,0,4,3,3,1,1,0,3,0,3,3,0,0,4,4,0,1,5,4,3,3,5,0,3,3,4,3,0,2,0,1,1,1,0,1,3,0,1,2,1,3,3,2,3,3,0,3,0,1,0,1,3,3,4,4,1,0,1,2,2,1,3),
|
||||
(0,1,0,4,0,4,0,3,0,1,3,3,3,2,3,1,1,0,3,0,3,3,4,3,2,4,2,0,1,0,4,3,2,0,4,3,0,5,3,3,2,4,4,4,3,3,3,4,0,1,3,0,0,1,0,0,1,0,0,0,0,4,2,3,3,3,0,3,0,0,0,4,4,4,5,3,2,0,3,3,0,3,5),
|
||||
(0,2,0,3,0,0,0,3,0,1,3,0,2,0,0,0,1,0,3,1,1,3,3,0,0,3,0,0,3,0,2,3,1,0,3,1,0,3,3,2,0,4,2,2,0,2,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,2,1,2,0,1,0,1,0,0,0,1,3,1,2,0,0,0,1,0,0,1,4),
|
||||
(0,3,0,3,0,5,0,1,0,2,4,3,1,3,3,2,1,1,5,2,1,0,5,1,2,0,0,0,3,3,2,2,3,2,4,3,0,0,3,3,1,3,3,0,2,5,3,4,0,3,3,0,1,2,0,2,2,0,3,2,0,2,2,3,3,3,0,2,0,1,0,3,4,4,2,5,4,0,3,0,0,3,5),
|
||||
(0,3,0,3,0,3,0,1,0,3,3,3,3,0,3,0,2,0,2,1,1,0,2,0,1,0,0,0,2,1,0,0,1,0,3,2,0,0,3,3,1,2,3,1,0,3,3,0,0,1,0,0,0,0,0,2,0,0,0,0,0,2,3,1,2,3,0,3,0,1,0,3,2,1,0,4,3,0,1,1,0,3,3),
|
||||
(0,4,0,5,0,3,0,3,0,4,5,5,4,3,5,3,4,3,5,3,3,2,5,3,4,4,4,3,4,3,4,5,5,3,4,4,3,4,4,5,4,4,4,3,4,5,5,4,2,3,4,2,3,4,0,3,3,1,4,3,2,4,3,3,5,5,0,3,0,3,0,5,5,5,5,4,4,0,4,0,1,4,4),
|
||||
(0,4,0,4,0,3,0,3,0,3,5,4,4,2,3,2,5,1,3,2,5,1,4,2,3,2,3,3,4,3,3,3,3,2,5,4,1,3,3,5,3,4,4,0,4,4,3,1,1,3,1,0,2,3,0,2,3,0,3,0,0,4,3,1,3,4,0,3,0,2,0,4,4,4,3,4,5,0,4,0,0,3,4),
|
||||
(0,3,0,3,0,3,1,2,0,3,4,4,3,3,3,0,2,2,4,3,3,1,3,3,3,1,1,0,3,1,4,3,2,3,4,4,2,4,4,4,3,4,4,3,2,4,4,3,1,3,3,1,3,3,0,4,1,0,2,2,1,4,3,2,3,3,5,4,3,3,5,4,4,3,3,0,4,0,3,2,2,4,4),
|
||||
(0,2,0,1,0,0,0,0,0,1,2,1,3,0,0,0,0,0,2,0,1,2,1,0,0,1,0,0,0,0,3,0,0,1,0,1,1,3,1,0,0,0,1,1,0,1,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,1,2,2,0,3,4,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1),
|
||||
(0,1,0,0,0,1,0,0,0,0,4,0,4,1,4,0,3,0,4,0,3,0,4,0,3,0,3,0,4,1,5,1,4,0,0,3,0,5,0,5,2,0,1,0,0,0,2,1,4,0,1,3,0,0,3,0,0,3,1,1,4,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0),
|
||||
(1,4,0,5,0,3,0,2,0,3,5,4,4,3,4,3,5,3,4,3,3,0,4,3,3,3,3,3,3,2,4,4,3,1,3,4,4,5,4,4,3,4,4,1,3,5,4,3,3,3,1,2,2,3,3,1,3,1,3,3,3,5,3,3,4,5,0,3,0,3,0,3,4,3,4,4,3,0,3,0,2,4,3),
|
||||
(0,1,0,4,0,0,0,0,0,1,4,0,4,1,4,2,4,0,3,0,1,0,1,0,0,0,0,0,2,0,3,1,1,1,0,3,0,0,0,1,2,1,0,0,1,1,1,1,0,1,0,0,0,1,0,0,3,0,0,0,0,3,2,0,2,2,0,1,0,0,0,2,3,2,3,3,0,0,0,0,2,1,0),
|
||||
(0,5,1,5,0,3,0,3,0,5,4,4,5,1,5,3,3,0,4,3,4,3,5,3,4,3,3,2,4,3,4,3,3,0,3,3,1,4,4,3,4,4,4,3,4,5,5,3,2,3,1,1,3,3,1,3,1,1,3,3,2,4,5,3,3,5,0,4,0,3,0,4,4,3,5,3,3,0,3,4,0,4,3),
|
||||
(0,5,0,5,0,3,0,2,0,4,4,3,5,2,4,3,3,3,4,4,4,3,5,3,5,3,3,1,4,0,4,3,3,0,3,3,0,4,4,4,4,5,4,3,3,5,5,3,2,3,1,2,3,2,0,1,0,0,3,2,2,4,4,3,1,5,0,4,0,3,0,4,3,1,3,2,1,0,3,3,0,3,3),
|
||||
(0,4,0,5,0,5,0,4,0,4,5,5,5,3,4,3,3,2,5,4,4,3,5,3,5,3,4,0,4,3,4,4,3,2,4,4,3,4,5,4,4,5,5,0,3,5,5,4,1,3,3,2,3,3,1,3,1,0,4,3,1,4,4,3,4,5,0,4,0,2,0,4,3,4,4,3,3,0,4,0,0,5,5),
|
||||
(0,4,0,4,0,5,0,1,1,3,3,4,4,3,4,1,3,0,5,1,3,0,3,1,3,1,1,0,3,0,3,3,4,0,4,3,0,4,4,4,3,4,4,0,3,5,4,1,0,3,0,0,2,3,0,3,1,0,3,1,0,3,2,1,3,5,0,3,0,1,0,3,2,3,3,4,4,0,2,2,0,4,4),
|
||||
(2,4,0,5,0,4,0,3,0,4,5,5,4,3,5,3,5,3,5,3,5,2,5,3,4,3,3,4,3,4,5,3,2,1,5,4,3,2,3,4,5,3,4,1,2,5,4,3,0,3,3,0,3,2,0,2,3,0,4,1,0,3,4,3,3,5,0,3,0,1,0,4,5,5,5,4,3,0,4,2,0,3,5),
|
||||
(0,5,0,4,0,4,0,2,0,5,4,3,4,3,4,3,3,3,4,3,4,2,5,3,5,3,4,1,4,3,4,4,4,0,3,5,0,4,4,4,4,5,3,1,3,4,5,3,3,3,3,3,3,3,0,2,2,0,3,3,2,4,3,3,3,5,3,4,1,3,3,5,3,2,0,0,0,0,4,3,1,3,3),
|
||||
(0,1,0,3,0,3,0,1,0,1,3,3,3,2,3,3,3,0,3,0,0,0,3,1,3,0,0,0,2,2,2,3,0,0,3,2,0,1,2,4,1,3,3,0,0,3,3,3,0,1,0,0,2,1,0,0,3,0,3,1,0,3,0,0,1,3,0,2,0,1,0,3,3,1,3,3,0,0,1,1,0,3,3),
|
||||
(0,2,0,3,0,2,1,4,0,2,2,3,1,1,3,1,1,0,2,0,3,1,2,3,1,3,0,0,1,0,4,3,2,3,3,3,1,4,2,3,3,3,3,1,0,3,1,4,0,1,1,0,1,2,0,1,1,0,1,1,0,3,1,3,2,2,0,1,0,0,0,2,3,3,3,1,0,0,0,0,0,2,3),
|
||||
(0,5,0,4,0,5,0,2,0,4,5,5,3,3,4,3,3,1,5,4,4,2,4,4,4,3,4,2,4,3,5,5,4,3,3,4,3,3,5,5,4,5,5,1,3,4,5,3,1,4,3,1,3,3,0,3,3,1,4,3,1,4,5,3,3,5,0,4,0,3,0,5,3,3,1,4,3,0,4,0,1,5,3),
|
||||
(0,5,0,5,0,4,0,2,0,4,4,3,4,3,3,3,3,3,5,4,4,4,4,4,4,5,3,3,5,2,4,4,4,3,4,4,3,3,4,4,5,5,3,3,4,3,4,3,3,4,3,3,3,3,1,2,2,1,4,3,3,5,4,4,3,4,0,4,0,3,0,4,4,4,4,4,1,0,4,2,0,2,4),
|
||||
(0,4,0,4,0,3,0,1,0,3,5,2,3,0,3,0,2,1,4,2,3,3,4,1,4,3,3,2,4,1,3,3,3,0,3,3,0,0,3,3,3,5,3,3,3,3,3,2,0,2,0,0,2,0,0,2,0,0,1,0,0,3,1,2,2,3,0,3,0,2,0,4,4,3,3,4,1,0,3,0,0,2,4),
|
||||
(0,0,0,4,0,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,1,0,2,0,1,0,0,0,0,0,3,1,3,0,3,2,0,0,0,1,0,3,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,4,0,2,0,0,0,0,0,0,2),
|
||||
(0,2,1,3,0,2,0,2,0,3,3,3,3,1,3,1,3,3,3,3,3,3,4,2,2,1,2,1,4,0,4,3,1,3,3,3,2,4,3,5,4,3,3,3,3,3,3,3,0,1,3,0,2,0,0,1,0,0,1,0,0,4,2,0,2,3,0,3,3,0,3,3,4,2,3,1,4,0,1,2,0,2,3),
|
||||
(0,3,0,3,0,1,0,3,0,2,3,3,3,0,3,1,2,0,3,3,2,3,3,2,3,2,3,1,3,0,4,3,2,0,3,3,1,4,3,3,2,3,4,3,1,3,3,1,1,0,1,1,0,1,0,1,0,1,0,0,0,4,1,1,0,3,0,3,1,0,2,3,3,3,3,3,1,0,0,2,0,3,3),
|
||||
(0,0,0,0,0,0,0,0,0,0,3,0,2,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,3,0,3,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,2,3,0,0,0,0,0,0,0,0,3),
|
||||
(0,2,0,3,1,3,0,3,0,2,3,3,3,1,3,1,3,1,3,1,3,3,3,1,3,0,2,3,1,1,4,3,3,2,3,3,1,2,2,4,1,3,3,0,1,4,2,3,0,1,3,0,3,0,0,1,3,0,2,0,0,3,3,2,1,3,0,3,0,2,0,3,4,4,4,3,1,0,3,0,0,3,3),
|
||||
(0,2,0,1,0,2,0,0,0,1,3,2,2,1,3,0,1,1,3,0,3,2,3,1,2,0,2,0,1,1,3,3,3,0,3,3,1,1,2,3,2,3,3,1,2,3,2,0,0,1,0,0,0,0,0,0,3,0,1,0,0,2,1,2,1,3,0,3,0,0,0,3,4,4,4,3,2,0,2,0,0,2,4),
|
||||
(0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,3,1,0,0,0,0,0,0,0,3),
|
||||
(0,3,0,3,0,2,0,3,0,3,3,3,2,3,2,2,2,0,3,1,3,3,3,2,3,3,0,0,3,0,3,2,2,0,2,3,1,4,3,4,3,3,2,3,1,5,4,4,0,3,1,2,1,3,0,3,1,1,2,0,2,3,1,3,1,3,0,3,0,1,0,3,3,4,4,2,1,0,2,1,0,2,4),
|
||||
(0,1,0,3,0,1,0,2,0,1,4,2,5,1,4,0,2,0,2,1,3,1,4,0,2,1,0,0,2,1,4,1,1,0,3,3,0,5,1,3,2,3,3,1,0,3,2,3,0,1,0,0,0,0,0,0,1,0,0,0,0,4,0,1,0,3,0,2,0,1,0,3,3,3,4,3,3,0,0,0,0,2,3),
|
||||
(0,0,0,1,0,0,0,0,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,1,0,0,0,0,0,3),
|
||||
(0,1,0,3,0,4,0,3,0,2,4,3,1,0,3,2,2,1,3,1,2,2,3,1,1,1,2,1,3,0,1,2,0,1,3,2,1,3,0,5,5,1,0,0,1,3,2,1,0,3,0,0,1,0,0,0,0,0,3,4,0,1,1,1,3,2,0,2,0,1,0,2,3,3,1,2,3,0,1,0,1,0,4),
|
||||
(0,0,0,1,0,3,0,3,0,2,2,1,0,0,4,0,3,0,3,1,3,0,3,0,3,0,1,0,3,0,3,1,3,0,3,3,0,0,1,2,1,1,1,0,1,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,2,2,1,2,0,0,2,0,0,0,0,2,3,3,3,3,0,0,0,0,1,4),
|
||||
(0,0,0,3,0,3,0,0,0,0,3,1,1,0,3,0,1,0,2,0,1,0,0,0,0,0,0,0,1,0,3,0,2,0,2,3,0,0,2,2,3,1,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,2,3),
|
||||
(2,4,0,5,0,5,0,4,0,3,4,3,3,3,4,3,3,3,4,3,4,4,5,4,5,5,5,2,3,0,5,5,4,1,5,4,3,1,5,4,3,4,4,3,3,4,3,3,0,3,2,0,2,3,0,3,0,0,3,3,0,5,3,2,3,3,0,3,0,3,0,3,4,5,4,5,3,0,4,3,0,3,4),
|
||||
(0,3,0,3,0,3,0,3,0,3,3,4,3,2,3,2,3,0,4,3,3,3,3,3,3,3,3,0,3,2,4,3,3,1,3,4,3,4,4,4,3,4,4,3,2,4,4,1,0,2,0,0,1,1,0,2,0,0,3,1,0,5,3,2,1,3,0,3,0,1,2,4,3,2,4,3,3,0,3,2,0,4,4),
|
||||
(0,3,0,3,0,1,0,0,0,1,4,3,3,2,3,1,3,1,4,2,3,2,4,2,3,4,3,0,2,2,3,3,3,0,3,3,3,0,3,4,1,3,3,0,3,4,3,3,0,1,1,0,1,0,0,0,4,0,3,0,0,3,1,2,1,3,0,4,0,1,0,4,3,3,4,3,3,0,2,0,0,3,3),
|
||||
(0,3,0,4,0,1,0,3,0,3,4,3,3,0,3,3,3,1,3,1,3,3,4,3,3,3,0,0,3,1,5,3,3,1,3,3,2,5,4,3,3,4,5,3,2,5,3,4,0,1,0,0,0,0,0,2,0,0,1,1,0,4,2,2,1,3,0,3,0,2,0,4,4,3,5,3,2,0,1,1,0,3,4),
|
||||
(0,5,0,4,0,5,0,2,0,4,4,3,3,2,3,3,3,1,4,3,4,1,5,3,4,3,4,0,4,2,4,3,4,1,5,4,0,4,4,4,4,5,4,1,3,5,4,2,1,4,1,1,3,2,0,3,1,0,3,2,1,4,3,3,3,4,0,4,0,3,0,4,4,4,3,3,3,0,4,2,0,3,4),
|
||||
(1,4,0,4,0,3,0,1,0,3,3,3,1,1,3,3,2,2,3,3,1,0,3,2,2,1,2,0,3,1,2,1,2,0,3,2,0,2,2,3,3,4,3,0,3,3,1,2,0,1,1,3,1,2,0,0,3,0,1,1,0,3,2,2,3,3,0,3,0,0,0,2,3,3,4,3,3,0,1,0,0,1,4),
|
||||
(0,4,0,4,0,4,0,0,0,3,4,4,3,1,4,2,3,2,3,3,3,1,4,3,4,0,3,0,4,2,3,3,2,2,5,4,2,1,3,4,3,4,3,1,3,3,4,2,0,2,1,0,3,3,0,0,2,0,3,1,0,4,4,3,4,3,0,4,0,1,0,2,4,4,4,4,4,0,3,2,0,3,3),
|
||||
(0,0,0,1,0,4,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,3,2,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2),
|
||||
(0,2,0,3,0,4,0,4,0,1,3,3,3,0,4,0,2,1,2,1,1,1,2,0,3,1,1,0,1,0,3,1,0,0,3,3,2,0,1,1,0,0,0,0,0,1,0,2,0,2,2,0,3,1,0,0,1,0,1,1,0,1,2,0,3,0,0,0,0,1,0,0,3,3,4,3,1,0,1,0,3,0,2),
|
||||
(0,0,0,3,0,5,0,0,0,0,1,0,2,0,3,1,0,1,3,0,0,0,2,0,0,0,1,0,0,0,1,1,0,0,4,0,0,0,2,3,0,1,4,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,3),
|
||||
(0,2,0,5,0,5,0,1,0,2,4,3,3,2,5,1,3,2,3,3,3,0,4,1,2,0,3,0,4,0,2,2,1,1,5,3,0,0,1,4,2,3,2,0,3,3,3,2,0,2,4,1,1,2,0,1,1,0,3,1,0,1,3,1,2,3,0,2,0,0,0,1,3,5,4,4,4,0,3,0,0,1,3),
|
||||
(0,4,0,5,0,4,0,4,0,4,5,4,3,3,4,3,3,3,4,3,4,4,5,3,4,5,4,2,4,2,3,4,3,1,4,4,1,3,5,4,4,5,5,4,4,5,5,5,2,3,3,1,4,3,1,3,3,0,3,3,1,4,3,4,4,4,0,3,0,4,0,3,3,4,4,5,0,0,4,3,0,4,5),
|
||||
(0,4,0,4,0,3,0,3,0,3,4,4,4,3,3,2,4,3,4,3,4,3,5,3,4,3,2,1,4,2,4,4,3,1,3,4,2,4,5,5,3,4,5,4,1,5,4,3,0,3,2,2,3,2,1,3,1,0,3,3,3,5,3,3,3,5,4,4,2,3,3,4,3,3,3,2,1,0,3,2,1,4,3),
|
||||
(0,4,0,5,0,4,0,3,0,3,5,5,3,2,4,3,4,0,5,4,4,1,4,4,4,3,3,3,4,3,5,5,2,3,3,4,1,2,5,5,3,5,5,2,3,5,5,4,0,3,2,0,3,3,1,1,5,1,4,1,0,4,3,2,3,5,0,4,0,3,0,5,4,3,4,3,0,0,4,1,0,4,4),
|
||||
(1,3,0,4,0,2,0,2,0,2,5,5,3,3,3,3,3,0,4,2,3,4,4,4,3,4,0,0,3,4,5,4,3,3,3,3,2,5,5,4,5,5,5,4,3,5,5,5,1,3,1,0,1,0,0,3,2,0,4,2,0,5,2,3,2,4,1,3,0,3,0,4,5,4,5,4,3,0,4,2,0,5,4),
|
||||
(0,3,0,4,0,5,0,3,0,3,4,4,3,2,3,2,3,3,3,3,3,2,4,3,3,2,2,0,3,3,3,3,3,1,3,3,3,0,4,4,3,4,4,1,1,4,4,2,0,3,1,0,1,1,0,4,1,0,2,3,1,3,3,1,3,4,0,3,0,1,0,3,1,3,0,0,1,0,2,0,0,4,4),
|
||||
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
|
||||
(0,3,0,3,0,2,0,3,0,1,5,4,3,3,3,1,4,2,1,2,3,4,4,2,4,4,5,0,3,1,4,3,4,0,4,3,3,3,2,3,2,5,3,4,3,2,2,3,0,0,3,0,2,1,0,1,2,0,0,0,0,2,1,1,3,1,0,2,0,4,0,3,4,4,4,5,2,0,2,0,0,1,3),
|
||||
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,1,1,0,0,0,4,2,1,1,0,1,0,3,2,0,0,3,1,1,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,2,0,0,0,1,4,0,4,2,1,0,0,0,0,0,1),
|
||||
(0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,3,1,0,0,0,2,0,2,1,0,0,1,2,1,0,1,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,3,1,0,0,0,0,0,1,0,0,2,1,0,0,0,0,0,0,0,0,2),
|
||||
(0,4,0,4,0,4,0,3,0,4,4,3,4,2,4,3,2,0,4,4,4,3,5,3,5,3,3,2,4,2,4,3,4,3,1,4,0,2,3,4,4,4,3,3,3,4,4,4,3,4,1,3,4,3,2,1,2,1,3,3,3,4,4,3,3,5,0,4,0,3,0,4,3,3,3,2,1,0,3,0,0,3,3),
|
||||
(0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1),
|
||||
# fmt: off
|
||||
jp2_char_context = (
|
||||
(0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1),
|
||||
(2, 4, 0, 4, 0, 3, 0, 4, 0, 3, 4, 4, 4, 2, 4, 3, 3, 4, 3, 2, 3, 3, 4, 2, 3, 3, 3, 2, 4, 1, 4, 3, 3, 1, 5, 4, 3, 4, 3, 4, 3, 5, 3, 0, 3, 5, 4, 2, 0, 3, 1, 0, 3, 3, 0, 3, 3, 0, 1, 1, 0, 4, 3, 0, 3, 3, 0, 4, 0, 2, 0, 3, 5, 5, 5, 5, 4, 0, 4, 1, 0, 3, 4),
|
||||
(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2),
|
||||
(0, 4, 0, 5, 0, 5, 0, 4, 0, 4, 5, 4, 4, 3, 5, 3, 5, 1, 5, 3, 4, 3, 4, 4, 3, 4, 3, 3, 4, 3, 5, 4, 4, 3, 5, 5, 3, 5, 5, 5, 3, 5, 5, 3, 4, 5, 5, 3, 1, 3, 2, 0, 3, 4, 0, 4, 2, 0, 4, 2, 1, 5, 3, 2, 3, 5, 0, 4, 0, 2, 0, 5, 4, 4, 5, 4, 5, 0, 4, 0, 0, 4, 4),
|
||||
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
||||
(0, 3, 0, 4, 0, 3, 0, 3, 0, 4, 5, 4, 3, 3, 3, 3, 4, 3, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 4, 4, 4, 4, 5, 3, 4, 4, 3, 4, 5, 5, 4, 5, 5, 1, 4, 5, 4, 3, 0, 3, 3, 1, 3, 3, 0, 4, 4, 0, 3, 3, 1, 5, 3, 3, 3, 5, 0, 4, 0, 3, 0, 4, 4, 3, 4, 3, 3, 0, 4, 1, 1, 3, 4),
|
||||
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
||||
(0, 4, 0, 3, 0, 3, 0, 4, 0, 3, 4, 4, 3, 2, 2, 1, 2, 1, 3, 1, 3, 3, 3, 3, 3, 4, 3, 1, 3, 3, 5, 3, 3, 0, 4, 3, 0, 5, 4, 3, 3, 5, 4, 4, 3, 4, 4, 5, 0, 1, 2, 0, 1, 2, 0, 2, 2, 0, 1, 0, 0, 5, 2, 2, 1, 4, 0, 3, 0, 1, 0, 4, 4, 3, 5, 4, 3, 0, 2, 1, 0, 4, 3),
|
||||
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
||||
(0, 3, 0, 5, 0, 4, 0, 2, 1, 4, 4, 2, 4, 1, 4, 2, 4, 2, 4, 3, 3, 3, 4, 3, 3, 3, 3, 1, 4, 2, 3, 3, 3, 1, 4, 4, 1, 1, 1, 4, 3, 3, 2, 0, 2, 4, 3, 2, 0, 3, 3, 0, 3, 1, 1, 0, 0, 0, 3, 3, 0, 4, 2, 2, 3, 4, 0, 4, 0, 3, 0, 4, 4, 5, 3, 4, 4, 0, 3, 0, 0, 1, 4),
|
||||
(1, 4, 0, 4, 0, 4, 0, 4, 0, 3, 5, 4, 4, 3, 4, 3, 5, 4, 3, 3, 4, 3, 5, 4, 4, 4, 4, 3, 4, 2, 4, 3, 3, 1, 5, 4, 3, 2, 4, 5, 4, 5, 5, 4, 4, 5, 4, 4, 0, 3, 2, 2, 3, 3, 0, 4, 3, 1, 3, 2, 1, 4, 3, 3, 4, 5, 0, 3, 0, 2, 0, 4, 5, 5, 4, 5, 4, 0, 4, 0, 0, 5, 4),
|
||||
(0, 5, 0, 5, 0, 4, 0, 3, 0, 4, 4, 3, 4, 3, 3, 3, 4, 0, 4, 4, 4, 3, 4, 3, 4, 3, 3, 1, 4, 2, 4, 3, 4, 0, 5, 4, 1, 4, 5, 4, 4, 5, 3, 2, 4, 3, 4, 3, 2, 4, 1, 3, 3, 3, 2, 3, 2, 0, 4, 3, 3, 4, 3, 3, 3, 4, 0, 4, 0, 3, 0, 4, 5, 4, 4, 4, 3, 0, 4, 1, 0, 1, 3),
|
||||
(0, 3, 1, 4, 0, 3, 0, 2, 0, 3, 4, 4, 3, 1, 4, 2, 3, 3, 4, 3, 4, 3, 4, 3, 4, 4, 3, 2, 3, 1, 5, 4, 4, 1, 4, 4, 3, 5, 4, 4, 3, 5, 5, 4, 3, 4, 4, 3, 1, 2, 3, 1, 2, 2, 0, 3, 2, 0, 3, 1, 0, 5, 3, 3, 3, 4, 3, 3, 3, 3, 4, 4, 4, 4, 5, 4, 2, 0, 3, 3, 2, 4, 3),
|
||||
(0, 2, 0, 3, 0, 1, 0, 1, 0, 0, 3, 2, 0, 0, 2, 0, 1, 0, 2, 1, 3, 3, 3, 1, 2, 3, 1, 0, 1, 0, 4, 2, 1, 1, 3, 3, 0, 4, 3, 3, 1, 4, 3, 3, 0, 3, 3, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 4, 1, 0, 2, 3, 2, 2, 2, 1, 3, 3, 3, 4, 4, 3, 2, 0, 3, 1, 0, 3, 3),
|
||||
(0, 4, 0, 4, 0, 3, 0, 3, 0, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 3, 4, 2, 4, 3, 4, 3, 3, 2, 4, 3, 4, 5, 4, 1, 4, 5, 3, 5, 4, 5, 3, 5, 4, 0, 3, 5, 5, 3, 1, 3, 3, 2, 2, 3, 0, 3, 4, 1, 3, 3, 2, 4, 3, 3, 3, 4, 0, 4, 0, 3, 0, 4, 5, 4, 4, 5, 3, 0, 4, 1, 0, 3, 4),
|
||||
(0, 2, 0, 3, 0, 3, 0, 0, 0, 2, 2, 2, 1, 0, 1, 0, 0, 0, 3, 0, 3, 0, 3, 0, 1, 3, 1, 0, 3, 1, 3, 3, 3, 1, 3, 3, 3, 0, 1, 3, 1, 3, 4, 0, 0, 3, 1, 1, 0, 3, 2, 0, 0, 0, 0, 1, 3, 0, 1, 0, 0, 3, 3, 2, 0, 3, 0, 0, 0, 0, 0, 3, 4, 3, 4, 3, 3, 0, 3, 0, 0, 2, 3),
|
||||
(2, 3, 0, 3, 0, 2, 0, 1, 0, 3, 3, 4, 3, 1, 3, 1, 1, 1, 3, 1, 4, 3, 4, 3, 3, 3, 0, 0, 3, 1, 5, 4, 3, 1, 4, 3, 2, 5, 5, 4, 4, 4, 4, 3, 3, 4, 4, 4, 0, 2, 1, 1, 3, 2, 0, 1, 2, 0, 0, 1, 0, 4, 1, 3, 3, 3, 0, 3, 0, 1, 0, 4, 4, 4, 5, 5, 3, 0, 2, 0, 0, 4, 4),
|
||||
(0, 2, 0, 1, 0, 3, 1, 3, 0, 2, 3, 3, 3, 0, 3, 1, 0, 0, 3, 0, 3, 2, 3, 1, 3, 2, 1, 1, 0, 0, 4, 2, 1, 0, 2, 3, 1, 4, 3, 2, 0, 4, 4, 3, 1, 3, 1, 3, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 1, 1, 1, 2, 0, 3, 0, 0, 0, 3, 4, 2, 4, 3, 2, 0, 1, 0, 0, 3, 3),
|
||||
(0, 1, 0, 4, 0, 5, 0, 4, 0, 2, 4, 4, 2, 3, 3, 2, 3, 3, 5, 3, 3, 3, 4, 3, 4, 2, 3, 0, 4, 3, 3, 3, 4, 1, 4, 3, 2, 1, 5, 5, 3, 4, 5, 1, 3, 5, 4, 2, 0, 3, 3, 0, 1, 3, 0, 4, 2, 0, 1, 3, 1, 4, 3, 3, 3, 3, 0, 3, 0, 1, 0, 3, 4, 4, 4, 5, 5, 0, 3, 0, 1, 4, 5),
|
||||
(0, 2, 0, 3, 0, 3, 0, 0, 0, 2, 3, 1, 3, 0, 4, 0, 1, 1, 3, 0, 3, 4, 3, 2, 3, 1, 0, 3, 3, 2, 3, 1, 3, 0, 2, 3, 0, 2, 1, 4, 1, 2, 2, 0, 0, 3, 3, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 0, 3, 2, 1, 3, 3, 0, 2, 0, 2, 0, 0, 3, 3, 1, 2, 4, 0, 3, 0, 2, 2, 3),
|
||||
(2, 4, 0, 5, 0, 4, 0, 4, 0, 2, 4, 4, 4, 3, 4, 3, 3, 3, 1, 2, 4, 3, 4, 3, 4, 4, 5, 0, 3, 3, 3, 3, 2, 0, 4, 3, 1, 4, 3, 4, 1, 4, 4, 3, 3, 4, 4, 3, 1, 2, 3, 0, 4, 2, 0, 4, 1, 0, 3, 3, 0, 4, 3, 3, 3, 4, 0, 4, 0, 2, 0, 3, 5, 3, 4, 5, 2, 0, 3, 0, 0, 4, 5),
|
||||
(0, 3, 0, 4, 0, 1, 0, 1, 0, 1, 3, 2, 2, 1, 3, 0, 3, 0, 2, 0, 2, 0, 3, 0, 2, 0, 0, 0, 1, 0, 1, 1, 0, 0, 3, 1, 0, 0, 0, 4, 0, 3, 1, 0, 2, 1, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 3, 1, 0, 3, 0, 0, 0, 1, 4, 4, 4, 3, 0, 0, 4, 0, 0, 1, 4),
|
||||
(1, 4, 1, 5, 0, 3, 0, 3, 0, 4, 5, 4, 4, 3, 5, 3, 3, 4, 4, 3, 4, 1, 3, 3, 3, 3, 2, 1, 4, 1, 5, 4, 3, 1, 4, 4, 3, 5, 4, 4, 3, 5, 4, 3, 3, 4, 4, 4, 0, 3, 3, 1, 2, 3, 0, 3, 1, 0, 3, 3, 0, 5, 4, 4, 4, 4, 4, 4, 3, 3, 5, 4, 4, 3, 3, 5, 4, 0, 3, 2, 0, 4, 4),
|
||||
(0, 2, 0, 3, 0, 1, 0, 0, 0, 1, 3, 3, 3, 2, 4, 1, 3, 0, 3, 1, 3, 0, 2, 2, 1, 1, 0, 0, 2, 0, 4, 3, 1, 0, 4, 3, 0, 4, 4, 4, 1, 4, 3, 1, 1, 3, 3, 1, 0, 2, 0, 0, 1, 3, 0, 0, 0, 0, 2, 0, 0, 4, 3, 2, 4, 3, 5, 4, 3, 3, 3, 4, 3, 3, 4, 3, 3, 0, 2, 1, 0, 3, 3),
|
||||
(0, 2, 0, 4, 0, 3, 0, 2, 0, 2, 5, 5, 3, 4, 4, 4, 4, 1, 4, 3, 3, 0, 4, 3, 4, 3, 1, 3, 3, 2, 4, 3, 0, 3, 4, 3, 0, 3, 4, 4, 2, 4, 4, 0, 4, 5, 3, 3, 2, 2, 1, 1, 1, 2, 0, 1, 5, 0, 3, 3, 2, 4, 3, 3, 3, 4, 0, 3, 0, 2, 0, 4, 4, 3, 5, 5, 0, 0, 3, 0, 2, 3, 3),
|
||||
(0, 3, 0, 4, 0, 3, 0, 1, 0, 3, 4, 3, 3, 1, 3, 3, 3, 0, 3, 1, 3, 0, 4, 3, 3, 1, 1, 0, 3, 0, 3, 3, 0, 0, 4, 4, 0, 1, 5, 4, 3, 3, 5, 0, 3, 3, 4, 3, 0, 2, 0, 1, 1, 1, 0, 1, 3, 0, 1, 2, 1, 3, 3, 2, 3, 3, 0, 3, 0, 1, 0, 1, 3, 3, 4, 4, 1, 0, 1, 2, 2, 1, 3),
|
||||
(0, 1, 0, 4, 0, 4, 0, 3, 0, 1, 3, 3, 3, 2, 3, 1, 1, 0, 3, 0, 3, 3, 4, 3, 2, 4, 2, 0, 1, 0, 4, 3, 2, 0, 4, 3, 0, 5, 3, 3, 2, 4, 4, 4, 3, 3, 3, 4, 0, 1, 3, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 4, 2, 3, 3, 3, 0, 3, 0, 0, 0, 4, 4, 4, 5, 3, 2, 0, 3, 3, 0, 3, 5),
|
||||
(0, 2, 0, 3, 0, 0, 0, 3, 0, 1, 3, 0, 2, 0, 0, 0, 1, 0, 3, 1, 1, 3, 3, 0, 0, 3, 0, 0, 3, 0, 2, 3, 1, 0, 3, 1, 0, 3, 3, 2, 0, 4, 2, 2, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 0, 1, 0, 1, 0, 0, 0, 1, 3, 1, 2, 0, 0, 0, 1, 0, 0, 1, 4),
|
||||
(0, 3, 0, 3, 0, 5, 0, 1, 0, 2, 4, 3, 1, 3, 3, 2, 1, 1, 5, 2, 1, 0, 5, 1, 2, 0, 0, 0, 3, 3, 2, 2, 3, 2, 4, 3, 0, 0, 3, 3, 1, 3, 3, 0, 2, 5, 3, 4, 0, 3, 3, 0, 1, 2, 0, 2, 2, 0, 3, 2, 0, 2, 2, 3, 3, 3, 0, 2, 0, 1, 0, 3, 4, 4, 2, 5, 4, 0, 3, 0, 0, 3, 5),
|
||||
(0, 3, 0, 3, 0, 3, 0, 1, 0, 3, 3, 3, 3, 0, 3, 0, 2, 0, 2, 1, 1, 0, 2, 0, 1, 0, 0, 0, 2, 1, 0, 0, 1, 0, 3, 2, 0, 0, 3, 3, 1, 2, 3, 1, 0, 3, 3, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 3, 1, 2, 3, 0, 3, 0, 1, 0, 3, 2, 1, 0, 4, 3, 0, 1, 1, 0, 3, 3),
|
||||
(0, 4, 0, 5, 0, 3, 0, 3, 0, 4, 5, 5, 4, 3, 5, 3, 4, 3, 5, 3, 3, 2, 5, 3, 4, 4, 4, 3, 4, 3, 4, 5, 5, 3, 4, 4, 3, 4, 4, 5, 4, 4, 4, 3, 4, 5, 5, 4, 2, 3, 4, 2, 3, 4, 0, 3, 3, 1, 4, 3, 2, 4, 3, 3, 5, 5, 0, 3, 0, 3, 0, 5, 5, 5, 5, 4, 4, 0, 4, 0, 1, 4, 4),
|
||||
(0, 4, 0, 4, 0, 3, 0, 3, 0, 3, 5, 4, 4, 2, 3, 2, 5, 1, 3, 2, 5, 1, 4, 2, 3, 2, 3, 3, 4, 3, 3, 3, 3, 2, 5, 4, 1, 3, 3, 5, 3, 4, 4, 0, 4, 4, 3, 1, 1, 3, 1, 0, 2, 3, 0, 2, 3, 0, 3, 0, 0, 4, 3, 1, 3, 4, 0, 3, 0, 2, 0, 4, 4, 4, 3, 4, 5, 0, 4, 0, 0, 3, 4),
|
||||
(0, 3, 0, 3, 0, 3, 1, 2, 0, 3, 4, 4, 3, 3, 3, 0, 2, 2, 4, 3, 3, 1, 3, 3, 3, 1, 1, 0, 3, 1, 4, 3, 2, 3, 4, 4, 2, 4, 4, 4, 3, 4, 4, 3, 2, 4, 4, 3, 1, 3, 3, 1, 3, 3, 0, 4, 1, 0, 2, 2, 1, 4, 3, 2, 3, 3, 5, 4, 3, 3, 5, 4, 4, 3, 3, 0, 4, 0, 3, 2, 2, 4, 4),
|
||||
(0, 2, 0, 1, 0, 0, 0, 0, 0, 1, 2, 1, 3, 0, 0, 0, 0, 0, 2, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0, 0, 3, 0, 0, 1, 0, 1, 1, 3, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 0, 3, 4, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1),
|
||||
(0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 4, 1, 4, 0, 3, 0, 4, 0, 3, 0, 4, 0, 3, 0, 3, 0, 4, 1, 5, 1, 4, 0, 0, 3, 0, 5, 0, 5, 2, 0, 1, 0, 0, 0, 2, 1, 4, 0, 1, 3, 0, 0, 3, 0, 0, 3, 1, 1, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0),
|
||||
(1, 4, 0, 5, 0, 3, 0, 2, 0, 3, 5, 4, 4, 3, 4, 3, 5, 3, 4, 3, 3, 0, 4, 3, 3, 3, 3, 3, 3, 2, 4, 4, 3, 1, 3, 4, 4, 5, 4, 4, 3, 4, 4, 1, 3, 5, 4, 3, 3, 3, 1, 2, 2, 3, 3, 1, 3, 1, 3, 3, 3, 5, 3, 3, 4, 5, 0, 3, 0, 3, 0, 3, 4, 3, 4, 4, 3, 0, 3, 0, 2, 4, 3),
|
||||
(0, 1, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0, 4, 1, 4, 2, 4, 0, 3, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 0, 3, 1, 1, 1, 0, 3, 0, 0, 0, 1, 2, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 3, 0, 0, 0, 0, 3, 2, 0, 2, 2, 0, 1, 0, 0, 0, 2, 3, 2, 3, 3, 0, 0, 0, 0, 2, 1, 0),
|
||||
(0, 5, 1, 5, 0, 3, 0, 3, 0, 5, 4, 4, 5, 1, 5, 3, 3, 0, 4, 3, 4, 3, 5, 3, 4, 3, 3, 2, 4, 3, 4, 3, 3, 0, 3, 3, 1, 4, 4, 3, 4, 4, 4, 3, 4, 5, 5, 3, 2, 3, 1, 1, 3, 3, 1, 3, 1, 1, 3, 3, 2, 4, 5, 3, 3, 5, 0, 4, 0, 3, 0, 4, 4, 3, 5, 3, 3, 0, 3, 4, 0, 4, 3),
|
||||
(0, 5, 0, 5, 0, 3, 0, 2, 0, 4, 4, 3, 5, 2, 4, 3, 3, 3, 4, 4, 4, 3, 5, 3, 5, 3, 3, 1, 4, 0, 4, 3, 3, 0, 3, 3, 0, 4, 4, 4, 4, 5, 4, 3, 3, 5, 5, 3, 2, 3, 1, 2, 3, 2, 0, 1, 0, 0, 3, 2, 2, 4, 4, 3, 1, 5, 0, 4, 0, 3, 0, 4, 3, 1, 3, 2, 1, 0, 3, 3, 0, 3, 3),
|
||||
(0, 4, 0, 5, 0, 5, 0, 4, 0, 4, 5, 5, 5, 3, 4, 3, 3, 2, 5, 4, 4, 3, 5, 3, 5, 3, 4, 0, 4, 3, 4, 4, 3, 2, 4, 4, 3, 4, 5, 4, 4, 5, 5, 0, 3, 5, 5, 4, 1, 3, 3, 2, 3, 3, 1, 3, 1, 0, 4, 3, 1, 4, 4, 3, 4, 5, 0, 4, 0, 2, 0, 4, 3, 4, 4, 3, 3, 0, 4, 0, 0, 5, 5),
|
||||
(0, 4, 0, 4, 0, 5, 0, 1, 1, 3, 3, 4, 4, 3, 4, 1, 3, 0, 5, 1, 3, 0, 3, 1, 3, 1, 1, 0, 3, 0, 3, 3, 4, 0, 4, 3, 0, 4, 4, 4, 3, 4, 4, 0, 3, 5, 4, 1, 0, 3, 0, 0, 2, 3, 0, 3, 1, 0, 3, 1, 0, 3, 2, 1, 3, 5, 0, 3, 0, 1, 0, 3, 2, 3, 3, 4, 4, 0, 2, 2, 0, 4, 4),
|
||||
(2, 4, 0, 5, 0, 4, 0, 3, 0, 4, 5, 5, 4, 3, 5, 3, 5, 3, 5, 3, 5, 2, 5, 3, 4, 3, 3, 4, 3, 4, 5, 3, 2, 1, 5, 4, 3, 2, 3, 4, 5, 3, 4, 1, 2, 5, 4, 3, 0, 3, 3, 0, 3, 2, 0, 2, 3, 0, 4, 1, 0, 3, 4, 3, 3, 5, 0, 3, 0, 1, 0, 4, 5, 5, 5, 4, 3, 0, 4, 2, 0, 3, 5),
|
||||
(0, 5, 0, 4, 0, 4, 0, 2, 0, 5, 4, 3, 4, 3, 4, 3, 3, 3, 4, 3, 4, 2, 5, 3, 5, 3, 4, 1, 4, 3, 4, 4, 4, 0, 3, 5, 0, 4, 4, 4, 4, 5, 3, 1, 3, 4, 5, 3, 3, 3, 3, 3, 3, 3, 0, 2, 2, 0, 3, 3, 2, 4, 3, 3, 3, 5, 3, 4, 1, 3, 3, 5, 3, 2, 0, 0, 0, 0, 4, 3, 1, 3, 3),
|
||||
(0, 1, 0, 3, 0, 3, 0, 1, 0, 1, 3, 3, 3, 2, 3, 3, 3, 0, 3, 0, 0, 0, 3, 1, 3, 0, 0, 0, 2, 2, 2, 3, 0, 0, 3, 2, 0, 1, 2, 4, 1, 3, 3, 0, 0, 3, 3, 3, 0, 1, 0, 0, 2, 1, 0, 0, 3, 0, 3, 1, 0, 3, 0, 0, 1, 3, 0, 2, 0, 1, 0, 3, 3, 1, 3, 3, 0, 0, 1, 1, 0, 3, 3),
|
||||
(0, 2, 0, 3, 0, 2, 1, 4, 0, 2, 2, 3, 1, 1, 3, 1, 1, 0, 2, 0, 3, 1, 2, 3, 1, 3, 0, 0, 1, 0, 4, 3, 2, 3, 3, 3, 1, 4, 2, 3, 3, 3, 3, 1, 0, 3, 1, 4, 0, 1, 1, 0, 1, 2, 0, 1, 1, 0, 1, 1, 0, 3, 1, 3, 2, 2, 0, 1, 0, 0, 0, 2, 3, 3, 3, 1, 0, 0, 0, 0, 0, 2, 3),
|
||||
(0, 5, 0, 4, 0, 5, 0, 2, 0, 4, 5, 5, 3, 3, 4, 3, 3, 1, 5, 4, 4, 2, 4, 4, 4, 3, 4, 2, 4, 3, 5, 5, 4, 3, 3, 4, 3, 3, 5, 5, 4, 5, 5, 1, 3, 4, 5, 3, 1, 4, 3, 1, 3, 3, 0, 3, 3, 1, 4, 3, 1, 4, 5, 3, 3, 5, 0, 4, 0, 3, 0, 5, 3, 3, 1, 4, 3, 0, 4, 0, 1, 5, 3),
|
||||
(0, 5, 0, 5, 0, 4, 0, 2, 0, 4, 4, 3, 4, 3, 3, 3, 3, 3, 5, 4, 4, 4, 4, 4, 4, 5, 3, 3, 5, 2, 4, 4, 4, 3, 4, 4, 3, 3, 4, 4, 5, 5, 3, 3, 4, 3, 4, 3, 3, 4, 3, 3, 3, 3, 1, 2, 2, 1, 4, 3, 3, 5, 4, 4, 3, 4, 0, 4, 0, 3, 0, 4, 4, 4, 4, 4, 1, 0, 4, 2, 0, 2, 4),
|
||||
(0, 4, 0, 4, 0, 3, 0, 1, 0, 3, 5, 2, 3, 0, 3, 0, 2, 1, 4, 2, 3, 3, 4, 1, 4, 3, 3, 2, 4, 1, 3, 3, 3, 0, 3, 3, 0, 0, 3, 3, 3, 5, 3, 3, 3, 3, 3, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 1, 0, 0, 3, 1, 2, 2, 3, 0, 3, 0, 2, 0, 4, 4, 3, 3, 4, 1, 0, 3, 0, 0, 2, 4),
|
||||
(0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 3, 1, 3, 0, 3, 2, 0, 0, 0, 1, 0, 3, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 2, 0, 0, 0, 0, 0, 0, 2),
|
||||
(0, 2, 1, 3, 0, 2, 0, 2, 0, 3, 3, 3, 3, 1, 3, 1, 3, 3, 3, 3, 3, 3, 4, 2, 2, 1, 2, 1, 4, 0, 4, 3, 1, 3, 3, 3, 2, 4, 3, 5, 4, 3, 3, 3, 3, 3, 3, 3, 0, 1, 3, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 4, 2, 0, 2, 3, 0, 3, 3, 0, 3, 3, 4, 2, 3, 1, 4, 0, 1, 2, 0, 2, 3),
|
||||
(0, 3, 0, 3, 0, 1, 0, 3, 0, 2, 3, 3, 3, 0, 3, 1, 2, 0, 3, 3, 2, 3, 3, 2, 3, 2, 3, 1, 3, 0, 4, 3, 2, 0, 3, 3, 1, 4, 3, 3, 2, 3, 4, 3, 1, 3, 3, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 4, 1, 1, 0, 3, 0, 3, 1, 0, 2, 3, 3, 3, 3, 3, 1, 0, 0, 2, 0, 3, 3),
|
||||
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 3, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3),
|
||||
(0, 2, 0, 3, 1, 3, 0, 3, 0, 2, 3, 3, 3, 1, 3, 1, 3, 1, 3, 1, 3, 3, 3, 1, 3, 0, 2, 3, 1, 1, 4, 3, 3, 2, 3, 3, 1, 2, 2, 4, 1, 3, 3, 0, 1, 4, 2, 3, 0, 1, 3, 0, 3, 0, 0, 1, 3, 0, 2, 0, 0, 3, 3, 2, 1, 3, 0, 3, 0, 2, 0, 3, 4, 4, 4, 3, 1, 0, 3, 0, 0, 3, 3),
|
||||
(0, 2, 0, 1, 0, 2, 0, 0, 0, 1, 3, 2, 2, 1, 3, 0, 1, 1, 3, 0, 3, 2, 3, 1, 2, 0, 2, 0, 1, 1, 3, 3, 3, 0, 3, 3, 1, 1, 2, 3, 2, 3, 3, 1, 2, 3, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 2, 1, 2, 1, 3, 0, 3, 0, 0, 0, 3, 4, 4, 4, 3, 2, 0, 2, 0, 0, 2, 4),
|
||||
(0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 3, 1, 0, 0, 0, 0, 0, 0, 0, 3),
|
||||
(0, 3, 0, 3, 0, 2, 0, 3, 0, 3, 3, 3, 2, 3, 2, 2, 2, 0, 3, 1, 3, 3, 3, 2, 3, 3, 0, 0, 3, 0, 3, 2, 2, 0, 2, 3, 1, 4, 3, 4, 3, 3, 2, 3, 1, 5, 4, 4, 0, 3, 1, 2, 1, 3, 0, 3, 1, 1, 2, 0, 2, 3, 1, 3, 1, 3, 0, 3, 0, 1, 0, 3, 3, 4, 4, 2, 1, 0, 2, 1, 0, 2, 4),
|
||||
(0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 4, 2, 5, 1, 4, 0, 2, 0, 2, 1, 3, 1, 4, 0, 2, 1, 0, 0, 2, 1, 4, 1, 1, 0, 3, 3, 0, 5, 1, 3, 2, 3, 3, 1, 0, 3, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 1, 0, 3, 0, 2, 0, 1, 0, 3, 3, 3, 4, 3, 3, 0, 0, 0, 0, 2, 3),
|
||||
(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0, 3),
|
||||
(0, 1, 0, 3, 0, 4, 0, 3, 0, 2, 4, 3, 1, 0, 3, 2, 2, 1, 3, 1, 2, 2, 3, 1, 1, 1, 2, 1, 3, 0, 1, 2, 0, 1, 3, 2, 1, 3, 0, 5, 5, 1, 0, 0, 1, 3, 2, 1, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0, 3, 4, 0, 1, 1, 1, 3, 2, 0, 2, 0, 1, 0, 2, 3, 3, 1, 2, 3, 0, 1, 0, 1, 0, 4),
|
||||
(0, 0, 0, 1, 0, 3, 0, 3, 0, 2, 2, 1, 0, 0, 4, 0, 3, 0, 3, 1, 3, 0, 3, 0, 3, 0, 1, 0, 3, 0, 3, 1, 3, 0, 3, 3, 0, 0, 1, 2, 1, 1, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 2, 0, 0, 2, 0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 0, 0, 0, 1, 4),
|
||||
(0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 3, 1, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 0, 2, 0, 2, 3, 0, 0, 2, 2, 3, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 2, 0, 0, 0, 0, 2, 3),
|
||||
(2, 4, 0, 5, 0, 5, 0, 4, 0, 3, 4, 3, 3, 3, 4, 3, 3, 3, 4, 3, 4, 4, 5, 4, 5, 5, 5, 2, 3, 0, 5, 5, 4, 1, 5, 4, 3, 1, 5, 4, 3, 4, 4, 3, 3, 4, 3, 3, 0, 3, 2, 0, 2, 3, 0, 3, 0, 0, 3, 3, 0, 5, 3, 2, 3, 3, 0, 3, 0, 3, 0, 3, 4, 5, 4, 5, 3, 0, 4, 3, 0, 3, 4),
|
||||
(0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 3, 4, 3, 2, 3, 2, 3, 0, 4, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 2, 4, 3, 3, 1, 3, 4, 3, 4, 4, 4, 3, 4, 4, 3, 2, 4, 4, 1, 0, 2, 0, 0, 1, 1, 0, 2, 0, 0, 3, 1, 0, 5, 3, 2, 1, 3, 0, 3, 0, 1, 2, 4, 3, 2, 4, 3, 3, 0, 3, 2, 0, 4, 4),
|
||||
(0, 3, 0, 3, 0, 1, 0, 0, 0, 1, 4, 3, 3, 2, 3, 1, 3, 1, 4, 2, 3, 2, 4, 2, 3, 4, 3, 0, 2, 2, 3, 3, 3, 0, 3, 3, 3, 0, 3, 4, 1, 3, 3, 0, 3, 4, 3, 3, 0, 1, 1, 0, 1, 0, 0, 0, 4, 0, 3, 0, 0, 3, 1, 2, 1, 3, 0, 4, 0, 1, 0, 4, 3, 3, 4, 3, 3, 0, 2, 0, 0, 3, 3),
|
||||
(0, 3, 0, 4, 0, 1, 0, 3, 0, 3, 4, 3, 3, 0, 3, 3, 3, 1, 3, 1, 3, 3, 4, 3, 3, 3, 0, 0, 3, 1, 5, 3, 3, 1, 3, 3, 2, 5, 4, 3, 3, 4, 5, 3, 2, 5, 3, 4, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1, 1, 0, 4, 2, 2, 1, 3, 0, 3, 0, 2, 0, 4, 4, 3, 5, 3, 2, 0, 1, 1, 0, 3, 4),
|
||||
(0, 5, 0, 4, 0, 5, 0, 2, 0, 4, 4, 3, 3, 2, 3, 3, 3, 1, 4, 3, 4, 1, 5, 3, 4, 3, 4, 0, 4, 2, 4, 3, 4, 1, 5, 4, 0, 4, 4, 4, 4, 5, 4, 1, 3, 5, 4, 2, 1, 4, 1, 1, 3, 2, 0, 3, 1, 0, 3, 2, 1, 4, 3, 3, 3, 4, 0, 4, 0, 3, 0, 4, 4, 4, 3, 3, 3, 0, 4, 2, 0, 3, 4),
|
||||
(1, 4, 0, 4, 0, 3, 0, 1, 0, 3, 3, 3, 1, 1, 3, 3, 2, 2, 3, 3, 1, 0, 3, 2, 2, 1, 2, 0, 3, 1, 2, 1, 2, 0, 3, 2, 0, 2, 2, 3, 3, 4, 3, 0, 3, 3, 1, 2, 0, 1, 1, 3, 1, 2, 0, 0, 3, 0, 1, 1, 0, 3, 2, 2, 3, 3, 0, 3, 0, 0, 0, 2, 3, 3, 4, 3, 3, 0, 1, 0, 0, 1, 4),
|
||||
(0, 4, 0, 4, 0, 4, 0, 0, 0, 3, 4, 4, 3, 1, 4, 2, 3, 2, 3, 3, 3, 1, 4, 3, 4, 0, 3, 0, 4, 2, 3, 3, 2, 2, 5, 4, 2, 1, 3, 4, 3, 4, 3, 1, 3, 3, 4, 2, 0, 2, 1, 0, 3, 3, 0, 0, 2, 0, 3, 1, 0, 4, 4, 3, 4, 3, 0, 4, 0, 1, 0, 2, 4, 4, 4, 4, 4, 0, 3, 2, 0, 3, 3),
|
||||
(0, 0, 0, 1, 0, 4, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2),
|
||||
(0, 2, 0, 3, 0, 4, 0, 4, 0, 1, 3, 3, 3, 0, 4, 0, 2, 1, 2, 1, 1, 1, 2, 0, 3, 1, 1, 0, 1, 0, 3, 1, 0, 0, 3, 3, 2, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 2, 2, 0, 3, 1, 0, 0, 1, 0, 1, 1, 0, 1, 2, 0, 3, 0, 0, 0, 0, 1, 0, 0, 3, 3, 4, 3, 1, 0, 1, 0, 3, 0, 2),
|
||||
(0, 0, 0, 3, 0, 5, 0, 0, 0, 0, 1, 0, 2, 0, 3, 1, 0, 1, 3, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 4, 0, 0, 0, 2, 3, 0, 1, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 3),
|
||||
(0, 2, 0, 5, 0, 5, 0, 1, 0, 2, 4, 3, 3, 2, 5, 1, 3, 2, 3, 3, 3, 0, 4, 1, 2, 0, 3, 0, 4, 0, 2, 2, 1, 1, 5, 3, 0, 0, 1, 4, 2, 3, 2, 0, 3, 3, 3, 2, 0, 2, 4, 1, 1, 2, 0, 1, 1, 0, 3, 1, 0, 1, 3, 1, 2, 3, 0, 2, 0, 0, 0, 1, 3, 5, 4, 4, 4, 0, 3, 0, 0, 1, 3),
|
||||
(0, 4, 0, 5, 0, 4, 0, 4, 0, 4, 5, 4, 3, 3, 4, 3, 3, 3, 4, 3, 4, 4, 5, 3, 4, 5, 4, 2, 4, 2, 3, 4, 3, 1, 4, 4, 1, 3, 5, 4, 4, 5, 5, 4, 4, 5, 5, 5, 2, 3, 3, 1, 4, 3, 1, 3, 3, 0, 3, 3, 1, 4, 3, 4, 4, 4, 0, 3, 0, 4, 0, 3, 3, 4, 4, 5, 0, 0, 4, 3, 0, 4, 5),
|
||||
(0, 4, 0, 4, 0, 3, 0, 3, 0, 3, 4, 4, 4, 3, 3, 2, 4, 3, 4, 3, 4, 3, 5, 3, 4, 3, 2, 1, 4, 2, 4, 4, 3, 1, 3, 4, 2, 4, 5, 5, 3, 4, 5, 4, 1, 5, 4, 3, 0, 3, 2, 2, 3, 2, 1, 3, 1, 0, 3, 3, 3, 5, 3, 3, 3, 5, 4, 4, 2, 3, 3, 4, 3, 3, 3, 2, 1, 0, 3, 2, 1, 4, 3),
|
||||
(0, 4, 0, 5, 0, 4, 0, 3, 0, 3, 5, 5, 3, 2, 4, 3, 4, 0, 5, 4, 4, 1, 4, 4, 4, 3, 3, 3, 4, 3, 5, 5, 2, 3, 3, 4, 1, 2, 5, 5, 3, 5, 5, 2, 3, 5, 5, 4, 0, 3, 2, 0, 3, 3, 1, 1, 5, 1, 4, 1, 0, 4, 3, 2, 3, 5, 0, 4, 0, 3, 0, 5, 4, 3, 4, 3, 0, 0, 4, 1, 0, 4, 4),
|
||||
(1, 3, 0, 4, 0, 2, 0, 2, 0, 2, 5, 5, 3, 3, 3, 3, 3, 0, 4, 2, 3, 4, 4, 4, 3, 4, 0, 0, 3, 4, 5, 4, 3, 3, 3, 3, 2, 5, 5, 4, 5, 5, 5, 4, 3, 5, 5, 5, 1, 3, 1, 0, 1, 0, 0, 3, 2, 0, 4, 2, 0, 5, 2, 3, 2, 4, 1, 3, 0, 3, 0, 4, 5, 4, 5, 4, 3, 0, 4, 2, 0, 5, 4),
|
||||
(0, 3, 0, 4, 0, 5, 0, 3, 0, 3, 4, 4, 3, 2, 3, 2, 3, 3, 3, 3, 3, 2, 4, 3, 3, 2, 2, 0, 3, 3, 3, 3, 3, 1, 3, 3, 3, 0, 4, 4, 3, 4, 4, 1, 1, 4, 4, 2, 0, 3, 1, 0, 1, 1, 0, 4, 1, 0, 2, 3, 1, 3, 3, 1, 3, 4, 0, 3, 0, 1, 0, 3, 1, 3, 0, 0, 1, 0, 2, 0, 0, 4, 4),
|
||||
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
||||
(0, 3, 0, 3, 0, 2, 0, 3, 0, 1, 5, 4, 3, 3, 3, 1, 4, 2, 1, 2, 3, 4, 4, 2, 4, 4, 5, 0, 3, 1, 4, 3, 4, 0, 4, 3, 3, 3, 2, 3, 2, 5, 3, 4, 3, 2, 2, 3, 0, 0, 3, 0, 2, 1, 0, 1, 2, 0, 0, 0, 0, 2, 1, 1, 3, 1, 0, 2, 0, 4, 0, 3, 4, 4, 4, 5, 2, 0, 2, 0, 0, 1, 3),
|
||||
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 4, 2, 1, 1, 0, 1, 0, 3, 2, 0, 0, 3, 1, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1, 4, 0, 4, 2, 1, 0, 0, 0, 0, 0, 1),
|
||||
(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 0, 2, 1, 0, 0, 1, 2, 1, 0, 1, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2),
|
||||
(0, 4, 0, 4, 0, 4, 0, 3, 0, 4, 4, 3, 4, 2, 4, 3, 2, 0, 4, 4, 4, 3, 5, 3, 5, 3, 3, 2, 4, 2, 4, 3, 4, 3, 1, 4, 0, 2, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4, 3, 4, 1, 3, 4, 3, 2, 1, 2, 1, 3, 3, 3, 4, 4, 3, 3, 5, 0, 4, 0, 3, 0, 4, 3, 3, 3, 2, 1, 0, 3, 0, 0, 3, 3),
|
||||
(0, 4, 0, 3, 0, 3, 0, 3, 0, 3, 5, 5, 3, 3, 3, 3, 4, 3, 4, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 4, 3, 5, 3, 3, 1, 3, 2, 4, 5, 5, 5, 5, 4, 3, 4, 5, 5, 3, 2, 2, 3, 3, 3, 3, 2, 3, 3, 1, 2, 3, 2, 4, 3, 3, 3, 4, 0, 4, 0, 2, 0, 4, 3, 2, 2, 1, 2, 0, 3, 0, 0, 4, 1),
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
class JapaneseContextAnalysis(object):
|
||||
|
||||
class JapaneseContextAnalysis:
|
||||
NUM_OF_CATEGORY = 6
|
||||
DONT_KNOW = -1
|
||||
ENOUGH_REL_THRESHOLD = 100
|
||||
MAX_REL_THRESHOLD = 1000
|
||||
MINIMUM_DATA_THRESHOLD = 4
|
||||
|
||||
def __init__(self):
|
||||
self._total_rel = None
|
||||
self._rel_sample = None
|
||||
self._need_to_skip_char_num = None
|
||||
self._last_char_order = None
|
||||
self._done = None
|
||||
def __init__(self) -> None:
|
||||
self._total_rel = 0
|
||||
self._rel_sample: List[int] = []
|
||||
self._need_to_skip_char_num = 0
|
||||
self._last_char_order = -1
|
||||
self._done = False
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
self._total_rel = 0 # total sequence received
|
||||
# category counters, each integer counts sequence in its category
|
||||
self._rel_sample = [0] * self.NUM_OF_CATEGORY
|
||||
@@ -140,7 +144,7 @@ class JapaneseContextAnalysis(object):
|
||||
# been made
|
||||
self._done = False
|
||||
|
||||
def feed(self, byte_str, num_bytes):
|
||||
def feed(self, byte_str: Union[bytes, bytearray], num_bytes: int) -> None:
|
||||
if self._done:
|
||||
return
|
||||
|
||||
@@ -153,7 +157,7 @@ class JapaneseContextAnalysis(object):
|
||||
# this character will simply our logic and improve performance.
|
||||
i = self._need_to_skip_char_num
|
||||
while i < num_bytes:
|
||||
order, char_len = self.get_order(byte_str[i:i + 2])
|
||||
order, char_len = self.get_order(byte_str[i : i + 2])
|
||||
i += char_len
|
||||
if i > num_bytes:
|
||||
self._need_to_skip_char_num = i - num_bytes
|
||||
@@ -164,32 +168,34 @@ class JapaneseContextAnalysis(object):
|
||||
if self._total_rel > self.MAX_REL_THRESHOLD:
|
||||
self._done = True
|
||||
break
|
||||
self._rel_sample[jp2CharContext[self._last_char_order][order]] += 1
|
||||
self._rel_sample[
|
||||
jp2_char_context[self._last_char_order][order]
|
||||
] += 1
|
||||
self._last_char_order = order
|
||||
|
||||
def got_enough_data(self):
|
||||
def got_enough_data(self) -> bool:
|
||||
return self._total_rel > self.ENOUGH_REL_THRESHOLD
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
# This is just one way to calculate confidence. It works well for me.
|
||||
if self._total_rel > self.MINIMUM_DATA_THRESHOLD:
|
||||
return (self._total_rel - self._rel_sample[0]) / self._total_rel
|
||||
else:
|
||||
return self.DONT_KNOW
|
||||
return self.DONT_KNOW
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, _: Union[bytes, bytearray]) -> Tuple[int, int]:
|
||||
return -1, 1
|
||||
|
||||
|
||||
class SJISContextAnalysis(JapaneseContextAnalysis):
|
||||
def __init__(self):
|
||||
super(SJISContextAnalysis, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._charset_name = "SHIFT_JIS"
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return self._charset_name
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
|
||||
if not byte_str:
|
||||
return -1, 1
|
||||
# find out current char's byte length
|
||||
@@ -209,8 +215,9 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
|
||||
|
||||
return -1, char_len
|
||||
|
||||
|
||||
class EUCJPContextAnalysis(JapaneseContextAnalysis):
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
|
||||
if not byte_str:
|
||||
return -1, 1
|
||||
# find out current char's byte length
|
||||
@@ -229,5 +236,3 @@ class EUCJPContextAnalysis(JapaneseContextAnalysis):
|
||||
return second_char - 0xA1, char_len
|
||||
|
||||
return -1, char_len
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,9 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from pip._vendor.chardet.sbcharsetprober import SingleByteCharSetModel
|
||||
|
||||
|
||||
# 3: Positive
|
||||
# 2: Likely
|
||||
# 1: Unlikely
|
||||
@@ -4115,269 +4111,270 @@ HEBREW_LANG_MODEL = {
|
||||
|
||||
# Character Mapping Table(s):
|
||||
WINDOWS_1255_HEBREW_CHAR_TO_ORDER = {
|
||||
0: 255, # '\x00'
|
||||
1: 255, # '\x01'
|
||||
2: 255, # '\x02'
|
||||
3: 255, # '\x03'
|
||||
4: 255, # '\x04'
|
||||
5: 255, # '\x05'
|
||||
6: 255, # '\x06'
|
||||
7: 255, # '\x07'
|
||||
8: 255, # '\x08'
|
||||
9: 255, # '\t'
|
||||
10: 254, # '\n'
|
||||
11: 255, # '\x0b'
|
||||
12: 255, # '\x0c'
|
||||
13: 254, # '\r'
|
||||
14: 255, # '\x0e'
|
||||
15: 255, # '\x0f'
|
||||
16: 255, # '\x10'
|
||||
17: 255, # '\x11'
|
||||
18: 255, # '\x12'
|
||||
19: 255, # '\x13'
|
||||
20: 255, # '\x14'
|
||||
21: 255, # '\x15'
|
||||
22: 255, # '\x16'
|
||||
23: 255, # '\x17'
|
||||
24: 255, # '\x18'
|
||||
25: 255, # '\x19'
|
||||
26: 255, # '\x1a'
|
||||
27: 255, # '\x1b'
|
||||
28: 255, # '\x1c'
|
||||
29: 255, # '\x1d'
|
||||
30: 255, # '\x1e'
|
||||
31: 255, # '\x1f'
|
||||
32: 253, # ' '
|
||||
33: 253, # '!'
|
||||
34: 253, # '"'
|
||||
35: 253, # '#'
|
||||
36: 253, # '$'
|
||||
37: 253, # '%'
|
||||
38: 253, # '&'
|
||||
39: 253, # "'"
|
||||
40: 253, # '('
|
||||
41: 253, # ')'
|
||||
42: 253, # '*'
|
||||
43: 253, # '+'
|
||||
44: 253, # ','
|
||||
45: 253, # '-'
|
||||
46: 253, # '.'
|
||||
47: 253, # '/'
|
||||
48: 252, # '0'
|
||||
49: 252, # '1'
|
||||
50: 252, # '2'
|
||||
51: 252, # '3'
|
||||
52: 252, # '4'
|
||||
53: 252, # '5'
|
||||
54: 252, # '6'
|
||||
55: 252, # '7'
|
||||
56: 252, # '8'
|
||||
57: 252, # '9'
|
||||
58: 253, # ':'
|
||||
59: 253, # ';'
|
||||
60: 253, # '<'
|
||||
61: 253, # '='
|
||||
62: 253, # '>'
|
||||
63: 253, # '?'
|
||||
64: 253, # '@'
|
||||
65: 69, # 'A'
|
||||
66: 91, # 'B'
|
||||
67: 79, # 'C'
|
||||
68: 80, # 'D'
|
||||
69: 92, # 'E'
|
||||
70: 89, # 'F'
|
||||
71: 97, # 'G'
|
||||
72: 90, # 'H'
|
||||
73: 68, # 'I'
|
||||
74: 111, # 'J'
|
||||
75: 112, # 'K'
|
||||
76: 82, # 'L'
|
||||
77: 73, # 'M'
|
||||
78: 95, # 'N'
|
||||
79: 85, # 'O'
|
||||
80: 78, # 'P'
|
||||
81: 121, # 'Q'
|
||||
82: 86, # 'R'
|
||||
83: 71, # 'S'
|
||||
84: 67, # 'T'
|
||||
85: 102, # 'U'
|
||||
86: 107, # 'V'
|
||||
87: 84, # 'W'
|
||||
88: 114, # 'X'
|
||||
89: 103, # 'Y'
|
||||
90: 115, # 'Z'
|
||||
91: 253, # '['
|
||||
92: 253, # '\\'
|
||||
93: 253, # ']'
|
||||
94: 253, # '^'
|
||||
95: 253, # '_'
|
||||
96: 253, # '`'
|
||||
97: 50, # 'a'
|
||||
98: 74, # 'b'
|
||||
99: 60, # 'c'
|
||||
100: 61, # 'd'
|
||||
101: 42, # 'e'
|
||||
102: 76, # 'f'
|
||||
103: 70, # 'g'
|
||||
104: 64, # 'h'
|
||||
105: 53, # 'i'
|
||||
106: 105, # 'j'
|
||||
107: 93, # 'k'
|
||||
108: 56, # 'l'
|
||||
109: 65, # 'm'
|
||||
110: 54, # 'n'
|
||||
111: 49, # 'o'
|
||||
112: 66, # 'p'
|
||||
113: 110, # 'q'
|
||||
114: 51, # 'r'
|
||||
115: 43, # 's'
|
||||
116: 44, # 't'
|
||||
117: 63, # 'u'
|
||||
118: 81, # 'v'
|
||||
119: 77, # 'w'
|
||||
120: 98, # 'x'
|
||||
121: 75, # 'y'
|
||||
122: 108, # 'z'
|
||||
123: 253, # '{'
|
||||
124: 253, # '|'
|
||||
125: 253, # '}'
|
||||
126: 253, # '~'
|
||||
127: 253, # '\x7f'
|
||||
128: 124, # '€'
|
||||
129: 202, # None
|
||||
130: 203, # '‚'
|
||||
131: 204, # 'ƒ'
|
||||
132: 205, # '„'
|
||||
133: 40, # '…'
|
||||
134: 58, # '†'
|
||||
135: 206, # '‡'
|
||||
136: 207, # 'ˆ'
|
||||
137: 208, # '‰'
|
||||
138: 209, # None
|
||||
139: 210, # '‹'
|
||||
140: 211, # None
|
||||
141: 212, # None
|
||||
142: 213, # None
|
||||
143: 214, # None
|
||||
144: 215, # None
|
||||
145: 83, # '‘'
|
||||
146: 52, # '’'
|
||||
147: 47, # '“'
|
||||
148: 46, # '”'
|
||||
149: 72, # '•'
|
||||
150: 32, # '–'
|
||||
151: 94, # '—'
|
||||
152: 216, # '˜'
|
||||
153: 113, # '™'
|
||||
154: 217, # None
|
||||
155: 109, # '›'
|
||||
156: 218, # None
|
||||
157: 219, # None
|
||||
158: 220, # None
|
||||
159: 221, # None
|
||||
160: 34, # '\xa0'
|
||||
161: 116, # '¡'
|
||||
162: 222, # '¢'
|
||||
163: 118, # '£'
|
||||
164: 100, # '₪'
|
||||
165: 223, # '¥'
|
||||
166: 224, # '¦'
|
||||
167: 117, # '§'
|
||||
168: 119, # '¨'
|
||||
169: 104, # '©'
|
||||
170: 125, # '×'
|
||||
171: 225, # '«'
|
||||
172: 226, # '¬'
|
||||
173: 87, # '\xad'
|
||||
174: 99, # '®'
|
||||
175: 227, # '¯'
|
||||
176: 106, # '°'
|
||||
177: 122, # '±'
|
||||
178: 123, # '²'
|
||||
179: 228, # '³'
|
||||
180: 55, # '´'
|
||||
181: 229, # 'µ'
|
||||
182: 230, # '¶'
|
||||
183: 101, # '·'
|
||||
184: 231, # '¸'
|
||||
185: 232, # '¹'
|
||||
186: 120, # '÷'
|
||||
187: 233, # '»'
|
||||
188: 48, # '¼'
|
||||
189: 39, # '½'
|
||||
190: 57, # '¾'
|
||||
191: 234, # '¿'
|
||||
192: 30, # 'ְ'
|
||||
193: 59, # 'ֱ'
|
||||
194: 41, # 'ֲ'
|
||||
195: 88, # 'ֳ'
|
||||
196: 33, # 'ִ'
|
||||
197: 37, # 'ֵ'
|
||||
198: 36, # 'ֶ'
|
||||
199: 31, # 'ַ'
|
||||
200: 29, # 'ָ'
|
||||
201: 35, # 'ֹ'
|
||||
202: 235, # None
|
||||
203: 62, # 'ֻ'
|
||||
204: 28, # 'ּ'
|
||||
205: 236, # 'ֽ'
|
||||
206: 126, # '־'
|
||||
207: 237, # 'ֿ'
|
||||
208: 238, # '׀'
|
||||
209: 38, # 'ׁ'
|
||||
210: 45, # 'ׂ'
|
||||
211: 239, # '׃'
|
||||
212: 240, # 'װ'
|
||||
213: 241, # 'ױ'
|
||||
214: 242, # 'ײ'
|
||||
215: 243, # '׳'
|
||||
216: 127, # '״'
|
||||
217: 244, # None
|
||||
218: 245, # None
|
||||
219: 246, # None
|
||||
220: 247, # None
|
||||
221: 248, # None
|
||||
222: 249, # None
|
||||
223: 250, # None
|
||||
224: 9, # 'א'
|
||||
225: 8, # 'ב'
|
||||
226: 20, # 'ג'
|
||||
227: 16, # 'ד'
|
||||
228: 3, # 'ה'
|
||||
229: 2, # 'ו'
|
||||
230: 24, # 'ז'
|
||||
231: 14, # 'ח'
|
||||
232: 22, # 'ט'
|
||||
233: 1, # 'י'
|
||||
234: 25, # 'ך'
|
||||
235: 15, # 'כ'
|
||||
236: 4, # 'ל'
|
||||
237: 11, # 'ם'
|
||||
238: 6, # 'מ'
|
||||
239: 23, # 'ן'
|
||||
240: 12, # 'נ'
|
||||
241: 19, # 'ס'
|
||||
242: 13, # 'ע'
|
||||
243: 26, # 'ף'
|
||||
244: 18, # 'פ'
|
||||
245: 27, # 'ץ'
|
||||
246: 21, # 'צ'
|
||||
247: 17, # 'ק'
|
||||
248: 7, # 'ר'
|
||||
249: 10, # 'ש'
|
||||
250: 5, # 'ת'
|
||||
251: 251, # None
|
||||
252: 252, # None
|
||||
253: 128, # '\u200e'
|
||||
254: 96, # '\u200f'
|
||||
255: 253, # None
|
||||
0: 255, # '\x00'
|
||||
1: 255, # '\x01'
|
||||
2: 255, # '\x02'
|
||||
3: 255, # '\x03'
|
||||
4: 255, # '\x04'
|
||||
5: 255, # '\x05'
|
||||
6: 255, # '\x06'
|
||||
7: 255, # '\x07'
|
||||
8: 255, # '\x08'
|
||||
9: 255, # '\t'
|
||||
10: 254, # '\n'
|
||||
11: 255, # '\x0b'
|
||||
12: 255, # '\x0c'
|
||||
13: 254, # '\r'
|
||||
14: 255, # '\x0e'
|
||||
15: 255, # '\x0f'
|
||||
16: 255, # '\x10'
|
||||
17: 255, # '\x11'
|
||||
18: 255, # '\x12'
|
||||
19: 255, # '\x13'
|
||||
20: 255, # '\x14'
|
||||
21: 255, # '\x15'
|
||||
22: 255, # '\x16'
|
||||
23: 255, # '\x17'
|
||||
24: 255, # '\x18'
|
||||
25: 255, # '\x19'
|
||||
26: 255, # '\x1a'
|
||||
27: 255, # '\x1b'
|
||||
28: 255, # '\x1c'
|
||||
29: 255, # '\x1d'
|
||||
30: 255, # '\x1e'
|
||||
31: 255, # '\x1f'
|
||||
32: 253, # ' '
|
||||
33: 253, # '!'
|
||||
34: 253, # '"'
|
||||
35: 253, # '#'
|
||||
36: 253, # '$'
|
||||
37: 253, # '%'
|
||||
38: 253, # '&'
|
||||
39: 253, # "'"
|
||||
40: 253, # '('
|
||||
41: 253, # ')'
|
||||
42: 253, # '*'
|
||||
43: 253, # '+'
|
||||
44: 253, # ','
|
||||
45: 253, # '-'
|
||||
46: 253, # '.'
|
||||
47: 253, # '/'
|
||||
48: 252, # '0'
|
||||
49: 252, # '1'
|
||||
50: 252, # '2'
|
||||
51: 252, # '3'
|
||||
52: 252, # '4'
|
||||
53: 252, # '5'
|
||||
54: 252, # '6'
|
||||
55: 252, # '7'
|
||||
56: 252, # '8'
|
||||
57: 252, # '9'
|
||||
58: 253, # ':'
|
||||
59: 253, # ';'
|
||||
60: 253, # '<'
|
||||
61: 253, # '='
|
||||
62: 253, # '>'
|
||||
63: 253, # '?'
|
||||
64: 253, # '@'
|
||||
65: 69, # 'A'
|
||||
66: 91, # 'B'
|
||||
67: 79, # 'C'
|
||||
68: 80, # 'D'
|
||||
69: 92, # 'E'
|
||||
70: 89, # 'F'
|
||||
71: 97, # 'G'
|
||||
72: 90, # 'H'
|
||||
73: 68, # 'I'
|
||||
74: 111, # 'J'
|
||||
75: 112, # 'K'
|
||||
76: 82, # 'L'
|
||||
77: 73, # 'M'
|
||||
78: 95, # 'N'
|
||||
79: 85, # 'O'
|
||||
80: 78, # 'P'
|
||||
81: 121, # 'Q'
|
||||
82: 86, # 'R'
|
||||
83: 71, # 'S'
|
||||
84: 67, # 'T'
|
||||
85: 102, # 'U'
|
||||
86: 107, # 'V'
|
||||
87: 84, # 'W'
|
||||
88: 114, # 'X'
|
||||
89: 103, # 'Y'
|
||||
90: 115, # 'Z'
|
||||
91: 253, # '['
|
||||
92: 253, # '\\'
|
||||
93: 253, # ']'
|
||||
94: 253, # '^'
|
||||
95: 253, # '_'
|
||||
96: 253, # '`'
|
||||
97: 50, # 'a'
|
||||
98: 74, # 'b'
|
||||
99: 60, # 'c'
|
||||
100: 61, # 'd'
|
||||
101: 42, # 'e'
|
||||
102: 76, # 'f'
|
||||
103: 70, # 'g'
|
||||
104: 64, # 'h'
|
||||
105: 53, # 'i'
|
||||
106: 105, # 'j'
|
||||
107: 93, # 'k'
|
||||
108: 56, # 'l'
|
||||
109: 65, # 'm'
|
||||
110: 54, # 'n'
|
||||
111: 49, # 'o'
|
||||
112: 66, # 'p'
|
||||
113: 110, # 'q'
|
||||
114: 51, # 'r'
|
||||
115: 43, # 's'
|
||||
116: 44, # 't'
|
||||
117: 63, # 'u'
|
||||
118: 81, # 'v'
|
||||
119: 77, # 'w'
|
||||
120: 98, # 'x'
|
||||
121: 75, # 'y'
|
||||
122: 108, # 'z'
|
||||
123: 253, # '{'
|
||||
124: 253, # '|'
|
||||
125: 253, # '}'
|
||||
126: 253, # '~'
|
||||
127: 253, # '\x7f'
|
||||
128: 124, # '€'
|
||||
129: 202, # None
|
||||
130: 203, # '‚'
|
||||
131: 204, # 'ƒ'
|
||||
132: 205, # '„'
|
||||
133: 40, # '…'
|
||||
134: 58, # '†'
|
||||
135: 206, # '‡'
|
||||
136: 207, # 'ˆ'
|
||||
137: 208, # '‰'
|
||||
138: 209, # None
|
||||
139: 210, # '‹'
|
||||
140: 211, # None
|
||||
141: 212, # None
|
||||
142: 213, # None
|
||||
143: 214, # None
|
||||
144: 215, # None
|
||||
145: 83, # '‘'
|
||||
146: 52, # '’'
|
||||
147: 47, # '“'
|
||||
148: 46, # '”'
|
||||
149: 72, # '•'
|
||||
150: 32, # '–'
|
||||
151: 94, # '—'
|
||||
152: 216, # '˜'
|
||||
153: 113, # '™'
|
||||
154: 217, # None
|
||||
155: 109, # '›'
|
||||
156: 218, # None
|
||||
157: 219, # None
|
||||
158: 220, # None
|
||||
159: 221, # None
|
||||
160: 34, # '\xa0'
|
||||
161: 116, # '¡'
|
||||
162: 222, # '¢'
|
||||
163: 118, # '£'
|
||||
164: 100, # '₪'
|
||||
165: 223, # '¥'
|
||||
166: 224, # '¦'
|
||||
167: 117, # '§'
|
||||
168: 119, # '¨'
|
||||
169: 104, # '©'
|
||||
170: 125, # '×'
|
||||
171: 225, # '«'
|
||||
172: 226, # '¬'
|
||||
173: 87, # '\xad'
|
||||
174: 99, # '®'
|
||||
175: 227, # '¯'
|
||||
176: 106, # '°'
|
||||
177: 122, # '±'
|
||||
178: 123, # '²'
|
||||
179: 228, # '³'
|
||||
180: 55, # '´'
|
||||
181: 229, # 'µ'
|
||||
182: 230, # '¶'
|
||||
183: 101, # '·'
|
||||
184: 231, # '¸'
|
||||
185: 232, # '¹'
|
||||
186: 120, # '÷'
|
||||
187: 233, # '»'
|
||||
188: 48, # '¼'
|
||||
189: 39, # '½'
|
||||
190: 57, # '¾'
|
||||
191: 234, # '¿'
|
||||
192: 30, # 'ְ'
|
||||
193: 59, # 'ֱ'
|
||||
194: 41, # 'ֲ'
|
||||
195: 88, # 'ֳ'
|
||||
196: 33, # 'ִ'
|
||||
197: 37, # 'ֵ'
|
||||
198: 36, # 'ֶ'
|
||||
199: 31, # 'ַ'
|
||||
200: 29, # 'ָ'
|
||||
201: 35, # 'ֹ'
|
||||
202: 235, # None
|
||||
203: 62, # 'ֻ'
|
||||
204: 28, # 'ּ'
|
||||
205: 236, # 'ֽ'
|
||||
206: 126, # '־'
|
||||
207: 237, # 'ֿ'
|
||||
208: 238, # '׀'
|
||||
209: 38, # 'ׁ'
|
||||
210: 45, # 'ׂ'
|
||||
211: 239, # '׃'
|
||||
212: 240, # 'װ'
|
||||
213: 241, # 'ױ'
|
||||
214: 242, # 'ײ'
|
||||
215: 243, # '׳'
|
||||
216: 127, # '״'
|
||||
217: 244, # None
|
||||
218: 245, # None
|
||||
219: 246, # None
|
||||
220: 247, # None
|
||||
221: 248, # None
|
||||
222: 249, # None
|
||||
223: 250, # None
|
||||
224: 9, # 'א'
|
||||
225: 8, # 'ב'
|
||||
226: 20, # 'ג'
|
||||
227: 16, # 'ד'
|
||||
228: 3, # 'ה'
|
||||
229: 2, # 'ו'
|
||||
230: 24, # 'ז'
|
||||
231: 14, # 'ח'
|
||||
232: 22, # 'ט'
|
||||
233: 1, # 'י'
|
||||
234: 25, # 'ך'
|
||||
235: 15, # 'כ'
|
||||
236: 4, # 'ל'
|
||||
237: 11, # 'ם'
|
||||
238: 6, # 'מ'
|
||||
239: 23, # 'ן'
|
||||
240: 12, # 'נ'
|
||||
241: 19, # 'ס'
|
||||
242: 13, # 'ע'
|
||||
243: 26, # 'ף'
|
||||
244: 18, # 'פ'
|
||||
245: 27, # 'ץ'
|
||||
246: 21, # 'צ'
|
||||
247: 17, # 'ק'
|
||||
248: 7, # 'ר'
|
||||
249: 10, # 'ש'
|
||||
250: 5, # 'ת'
|
||||
251: 251, # None
|
||||
252: 252, # None
|
||||
253: 128, # '\u200e'
|
||||
254: 96, # '\u200f'
|
||||
255: 253, # None
|
||||
}
|
||||
|
||||
WINDOWS_1255_HEBREW_MODEL = SingleByteCharSetModel(charset_name='windows-1255',
|
||||
language='Hebrew',
|
||||
char_to_order_map=WINDOWS_1255_HEBREW_CHAR_TO_ORDER,
|
||||
language_model=HEBREW_LANG_MODEL,
|
||||
typical_positive_ratio=0.984004,
|
||||
keep_ascii_letters=False,
|
||||
alphabet='אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ')
|
||||
|
||||
WINDOWS_1255_HEBREW_MODEL = SingleByteCharSetModel(
|
||||
charset_name="windows-1255",
|
||||
language="Hebrew",
|
||||
char_to_order_map=WINDOWS_1255_HEBREW_CHAR_TO_ORDER,
|
||||
language_model=HEBREW_LANG_MODEL,
|
||||
typical_positive_ratio=0.984004,
|
||||
keep_ascii_letters=False,
|
||||
alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
|
||||
)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,9 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from pip._vendor.chardet.sbcharsetprober import SingleByteCharSetModel
|
||||
|
||||
|
||||
# 3: Positive
|
||||
# 2: Likely
|
||||
# 1: Unlikely
|
||||
@@ -4115,269 +4111,270 @@ THAI_LANG_MODEL = {
|
||||
|
||||
# Character Mapping Table(s):
|
||||
TIS_620_THAI_CHAR_TO_ORDER = {
|
||||
0: 255, # '\x00'
|
||||
1: 255, # '\x01'
|
||||
2: 255, # '\x02'
|
||||
3: 255, # '\x03'
|
||||
4: 255, # '\x04'
|
||||
5: 255, # '\x05'
|
||||
6: 255, # '\x06'
|
||||
7: 255, # '\x07'
|
||||
8: 255, # '\x08'
|
||||
9: 255, # '\t'
|
||||
10: 254, # '\n'
|
||||
11: 255, # '\x0b'
|
||||
12: 255, # '\x0c'
|
||||
13: 254, # '\r'
|
||||
14: 255, # '\x0e'
|
||||
15: 255, # '\x0f'
|
||||
16: 255, # '\x10'
|
||||
17: 255, # '\x11'
|
||||
18: 255, # '\x12'
|
||||
19: 255, # '\x13'
|
||||
20: 255, # '\x14'
|
||||
21: 255, # '\x15'
|
||||
22: 255, # '\x16'
|
||||
23: 255, # '\x17'
|
||||
24: 255, # '\x18'
|
||||
25: 255, # '\x19'
|
||||
26: 255, # '\x1a'
|
||||
27: 255, # '\x1b'
|
||||
28: 255, # '\x1c'
|
||||
29: 255, # '\x1d'
|
||||
30: 255, # '\x1e'
|
||||
31: 255, # '\x1f'
|
||||
32: 253, # ' '
|
||||
33: 253, # '!'
|
||||
34: 253, # '"'
|
||||
35: 253, # '#'
|
||||
36: 253, # '$'
|
||||
37: 253, # '%'
|
||||
38: 253, # '&'
|
||||
39: 253, # "'"
|
||||
40: 253, # '('
|
||||
41: 253, # ')'
|
||||
42: 253, # '*'
|
||||
43: 253, # '+'
|
||||
44: 253, # ','
|
||||
45: 253, # '-'
|
||||
46: 253, # '.'
|
||||
47: 253, # '/'
|
||||
48: 252, # '0'
|
||||
49: 252, # '1'
|
||||
50: 252, # '2'
|
||||
51: 252, # '3'
|
||||
52: 252, # '4'
|
||||
53: 252, # '5'
|
||||
54: 252, # '6'
|
||||
55: 252, # '7'
|
||||
56: 252, # '8'
|
||||
57: 252, # '9'
|
||||
58: 253, # ':'
|
||||
59: 253, # ';'
|
||||
60: 253, # '<'
|
||||
61: 253, # '='
|
||||
62: 253, # '>'
|
||||
63: 253, # '?'
|
||||
64: 253, # '@'
|
||||
65: 182, # 'A'
|
||||
66: 106, # 'B'
|
||||
67: 107, # 'C'
|
||||
68: 100, # 'D'
|
||||
69: 183, # 'E'
|
||||
70: 184, # 'F'
|
||||
71: 185, # 'G'
|
||||
72: 101, # 'H'
|
||||
73: 94, # 'I'
|
||||
74: 186, # 'J'
|
||||
75: 187, # 'K'
|
||||
76: 108, # 'L'
|
||||
77: 109, # 'M'
|
||||
78: 110, # 'N'
|
||||
79: 111, # 'O'
|
||||
80: 188, # 'P'
|
||||
81: 189, # 'Q'
|
||||
82: 190, # 'R'
|
||||
83: 89, # 'S'
|
||||
84: 95, # 'T'
|
||||
85: 112, # 'U'
|
||||
86: 113, # 'V'
|
||||
87: 191, # 'W'
|
||||
88: 192, # 'X'
|
||||
89: 193, # 'Y'
|
||||
90: 194, # 'Z'
|
||||
91: 253, # '['
|
||||
92: 253, # '\\'
|
||||
93: 253, # ']'
|
||||
94: 253, # '^'
|
||||
95: 253, # '_'
|
||||
96: 253, # '`'
|
||||
97: 64, # 'a'
|
||||
98: 72, # 'b'
|
||||
99: 73, # 'c'
|
||||
100: 114, # 'd'
|
||||
101: 74, # 'e'
|
||||
102: 115, # 'f'
|
||||
103: 116, # 'g'
|
||||
104: 102, # 'h'
|
||||
105: 81, # 'i'
|
||||
106: 201, # 'j'
|
||||
107: 117, # 'k'
|
||||
108: 90, # 'l'
|
||||
109: 103, # 'm'
|
||||
110: 78, # 'n'
|
||||
111: 82, # 'o'
|
||||
112: 96, # 'p'
|
||||
113: 202, # 'q'
|
||||
114: 91, # 'r'
|
||||
115: 79, # 's'
|
||||
116: 84, # 't'
|
||||
117: 104, # 'u'
|
||||
118: 105, # 'v'
|
||||
119: 97, # 'w'
|
||||
120: 98, # 'x'
|
||||
121: 92, # 'y'
|
||||
122: 203, # 'z'
|
||||
123: 253, # '{'
|
||||
124: 253, # '|'
|
||||
125: 253, # '}'
|
||||
126: 253, # '~'
|
||||
127: 253, # '\x7f'
|
||||
128: 209, # '\x80'
|
||||
129: 210, # '\x81'
|
||||
130: 211, # '\x82'
|
||||
131: 212, # '\x83'
|
||||
132: 213, # '\x84'
|
||||
133: 88, # '\x85'
|
||||
134: 214, # '\x86'
|
||||
135: 215, # '\x87'
|
||||
136: 216, # '\x88'
|
||||
137: 217, # '\x89'
|
||||
138: 218, # '\x8a'
|
||||
139: 219, # '\x8b'
|
||||
140: 220, # '\x8c'
|
||||
141: 118, # '\x8d'
|
||||
142: 221, # '\x8e'
|
||||
143: 222, # '\x8f'
|
||||
144: 223, # '\x90'
|
||||
145: 224, # '\x91'
|
||||
146: 99, # '\x92'
|
||||
147: 85, # '\x93'
|
||||
148: 83, # '\x94'
|
||||
149: 225, # '\x95'
|
||||
150: 226, # '\x96'
|
||||
151: 227, # '\x97'
|
||||
152: 228, # '\x98'
|
||||
153: 229, # '\x99'
|
||||
154: 230, # '\x9a'
|
||||
155: 231, # '\x9b'
|
||||
156: 232, # '\x9c'
|
||||
157: 233, # '\x9d'
|
||||
158: 234, # '\x9e'
|
||||
159: 235, # '\x9f'
|
||||
160: 236, # None
|
||||
161: 5, # 'ก'
|
||||
162: 30, # 'ข'
|
||||
163: 237, # 'ฃ'
|
||||
164: 24, # 'ค'
|
||||
165: 238, # 'ฅ'
|
||||
166: 75, # 'ฆ'
|
||||
167: 8, # 'ง'
|
||||
168: 26, # 'จ'
|
||||
169: 52, # 'ฉ'
|
||||
170: 34, # 'ช'
|
||||
171: 51, # 'ซ'
|
||||
172: 119, # 'ฌ'
|
||||
173: 47, # 'ญ'
|
||||
174: 58, # 'ฎ'
|
||||
175: 57, # 'ฏ'
|
||||
176: 49, # 'ฐ'
|
||||
177: 53, # 'ฑ'
|
||||
178: 55, # 'ฒ'
|
||||
179: 43, # 'ณ'
|
||||
180: 20, # 'ด'
|
||||
181: 19, # 'ต'
|
||||
182: 44, # 'ถ'
|
||||
183: 14, # 'ท'
|
||||
184: 48, # 'ธ'
|
||||
185: 3, # 'น'
|
||||
186: 17, # 'บ'
|
||||
187: 25, # 'ป'
|
||||
188: 39, # 'ผ'
|
||||
189: 62, # 'ฝ'
|
||||
190: 31, # 'พ'
|
||||
191: 54, # 'ฟ'
|
||||
192: 45, # 'ภ'
|
||||
193: 9, # 'ม'
|
||||
194: 16, # 'ย'
|
||||
195: 2, # 'ร'
|
||||
196: 61, # 'ฤ'
|
||||
197: 15, # 'ล'
|
||||
198: 239, # 'ฦ'
|
||||
199: 12, # 'ว'
|
||||
200: 42, # 'ศ'
|
||||
201: 46, # 'ษ'
|
||||
202: 18, # 'ส'
|
||||
203: 21, # 'ห'
|
||||
204: 76, # 'ฬ'
|
||||
205: 4, # 'อ'
|
||||
206: 66, # 'ฮ'
|
||||
207: 63, # 'ฯ'
|
||||
208: 22, # 'ะ'
|
||||
209: 10, # 'ั'
|
||||
210: 1, # 'า'
|
||||
211: 36, # 'ำ'
|
||||
212: 23, # 'ิ'
|
||||
213: 13, # 'ี'
|
||||
214: 40, # 'ึ'
|
||||
215: 27, # 'ื'
|
||||
216: 32, # 'ุ'
|
||||
217: 35, # 'ู'
|
||||
218: 86, # 'ฺ'
|
||||
219: 240, # None
|
||||
220: 241, # None
|
||||
221: 242, # None
|
||||
222: 243, # None
|
||||
223: 244, # '฿'
|
||||
224: 11, # 'เ'
|
||||
225: 28, # 'แ'
|
||||
226: 41, # 'โ'
|
||||
227: 29, # 'ใ'
|
||||
228: 33, # 'ไ'
|
||||
229: 245, # 'ๅ'
|
||||
230: 50, # 'ๆ'
|
||||
231: 37, # '็'
|
||||
232: 6, # '่'
|
||||
233: 7, # '้'
|
||||
234: 67, # '๊'
|
||||
235: 77, # '๋'
|
||||
236: 38, # '์'
|
||||
237: 93, # 'ํ'
|
||||
238: 246, # '๎'
|
||||
239: 247, # '๏'
|
||||
240: 68, # '๐'
|
||||
241: 56, # '๑'
|
||||
242: 59, # '๒'
|
||||
243: 65, # '๓'
|
||||
244: 69, # '๔'
|
||||
245: 60, # '๕'
|
||||
246: 70, # '๖'
|
||||
247: 80, # '๗'
|
||||
248: 71, # '๘'
|
||||
249: 87, # '๙'
|
||||
250: 248, # '๚'
|
||||
251: 249, # '๛'
|
||||
252: 250, # None
|
||||
253: 251, # None
|
||||
254: 252, # None
|
||||
255: 253, # None
|
||||
0: 255, # '\x00'
|
||||
1: 255, # '\x01'
|
||||
2: 255, # '\x02'
|
||||
3: 255, # '\x03'
|
||||
4: 255, # '\x04'
|
||||
5: 255, # '\x05'
|
||||
6: 255, # '\x06'
|
||||
7: 255, # '\x07'
|
||||
8: 255, # '\x08'
|
||||
9: 255, # '\t'
|
||||
10: 254, # '\n'
|
||||
11: 255, # '\x0b'
|
||||
12: 255, # '\x0c'
|
||||
13: 254, # '\r'
|
||||
14: 255, # '\x0e'
|
||||
15: 255, # '\x0f'
|
||||
16: 255, # '\x10'
|
||||
17: 255, # '\x11'
|
||||
18: 255, # '\x12'
|
||||
19: 255, # '\x13'
|
||||
20: 255, # '\x14'
|
||||
21: 255, # '\x15'
|
||||
22: 255, # '\x16'
|
||||
23: 255, # '\x17'
|
||||
24: 255, # '\x18'
|
||||
25: 255, # '\x19'
|
||||
26: 255, # '\x1a'
|
||||
27: 255, # '\x1b'
|
||||
28: 255, # '\x1c'
|
||||
29: 255, # '\x1d'
|
||||
30: 255, # '\x1e'
|
||||
31: 255, # '\x1f'
|
||||
32: 253, # ' '
|
||||
33: 253, # '!'
|
||||
34: 253, # '"'
|
||||
35: 253, # '#'
|
||||
36: 253, # '$'
|
||||
37: 253, # '%'
|
||||
38: 253, # '&'
|
||||
39: 253, # "'"
|
||||
40: 253, # '('
|
||||
41: 253, # ')'
|
||||
42: 253, # '*'
|
||||
43: 253, # '+'
|
||||
44: 253, # ','
|
||||
45: 253, # '-'
|
||||
46: 253, # '.'
|
||||
47: 253, # '/'
|
||||
48: 252, # '0'
|
||||
49: 252, # '1'
|
||||
50: 252, # '2'
|
||||
51: 252, # '3'
|
||||
52: 252, # '4'
|
||||
53: 252, # '5'
|
||||
54: 252, # '6'
|
||||
55: 252, # '7'
|
||||
56: 252, # '8'
|
||||
57: 252, # '9'
|
||||
58: 253, # ':'
|
||||
59: 253, # ';'
|
||||
60: 253, # '<'
|
||||
61: 253, # '='
|
||||
62: 253, # '>'
|
||||
63: 253, # '?'
|
||||
64: 253, # '@'
|
||||
65: 182, # 'A'
|
||||
66: 106, # 'B'
|
||||
67: 107, # 'C'
|
||||
68: 100, # 'D'
|
||||
69: 183, # 'E'
|
||||
70: 184, # 'F'
|
||||
71: 185, # 'G'
|
||||
72: 101, # 'H'
|
||||
73: 94, # 'I'
|
||||
74: 186, # 'J'
|
||||
75: 187, # 'K'
|
||||
76: 108, # 'L'
|
||||
77: 109, # 'M'
|
||||
78: 110, # 'N'
|
||||
79: 111, # 'O'
|
||||
80: 188, # 'P'
|
||||
81: 189, # 'Q'
|
||||
82: 190, # 'R'
|
||||
83: 89, # 'S'
|
||||
84: 95, # 'T'
|
||||
85: 112, # 'U'
|
||||
86: 113, # 'V'
|
||||
87: 191, # 'W'
|
||||
88: 192, # 'X'
|
||||
89: 193, # 'Y'
|
||||
90: 194, # 'Z'
|
||||
91: 253, # '['
|
||||
92: 253, # '\\'
|
||||
93: 253, # ']'
|
||||
94: 253, # '^'
|
||||
95: 253, # '_'
|
||||
96: 253, # '`'
|
||||
97: 64, # 'a'
|
||||
98: 72, # 'b'
|
||||
99: 73, # 'c'
|
||||
100: 114, # 'd'
|
||||
101: 74, # 'e'
|
||||
102: 115, # 'f'
|
||||
103: 116, # 'g'
|
||||
104: 102, # 'h'
|
||||
105: 81, # 'i'
|
||||
106: 201, # 'j'
|
||||
107: 117, # 'k'
|
||||
108: 90, # 'l'
|
||||
109: 103, # 'm'
|
||||
110: 78, # 'n'
|
||||
111: 82, # 'o'
|
||||
112: 96, # 'p'
|
||||
113: 202, # 'q'
|
||||
114: 91, # 'r'
|
||||
115: 79, # 's'
|
||||
116: 84, # 't'
|
||||
117: 104, # 'u'
|
||||
118: 105, # 'v'
|
||||
119: 97, # 'w'
|
||||
120: 98, # 'x'
|
||||
121: 92, # 'y'
|
||||
122: 203, # 'z'
|
||||
123: 253, # '{'
|
||||
124: 253, # '|'
|
||||
125: 253, # '}'
|
||||
126: 253, # '~'
|
||||
127: 253, # '\x7f'
|
||||
128: 209, # '\x80'
|
||||
129: 210, # '\x81'
|
||||
130: 211, # '\x82'
|
||||
131: 212, # '\x83'
|
||||
132: 213, # '\x84'
|
||||
133: 88, # '\x85'
|
||||
134: 214, # '\x86'
|
||||
135: 215, # '\x87'
|
||||
136: 216, # '\x88'
|
||||
137: 217, # '\x89'
|
||||
138: 218, # '\x8a'
|
||||
139: 219, # '\x8b'
|
||||
140: 220, # '\x8c'
|
||||
141: 118, # '\x8d'
|
||||
142: 221, # '\x8e'
|
||||
143: 222, # '\x8f'
|
||||
144: 223, # '\x90'
|
||||
145: 224, # '\x91'
|
||||
146: 99, # '\x92'
|
||||
147: 85, # '\x93'
|
||||
148: 83, # '\x94'
|
||||
149: 225, # '\x95'
|
||||
150: 226, # '\x96'
|
||||
151: 227, # '\x97'
|
||||
152: 228, # '\x98'
|
||||
153: 229, # '\x99'
|
||||
154: 230, # '\x9a'
|
||||
155: 231, # '\x9b'
|
||||
156: 232, # '\x9c'
|
||||
157: 233, # '\x9d'
|
||||
158: 234, # '\x9e'
|
||||
159: 235, # '\x9f'
|
||||
160: 236, # None
|
||||
161: 5, # 'ก'
|
||||
162: 30, # 'ข'
|
||||
163: 237, # 'ฃ'
|
||||
164: 24, # 'ค'
|
||||
165: 238, # 'ฅ'
|
||||
166: 75, # 'ฆ'
|
||||
167: 8, # 'ง'
|
||||
168: 26, # 'จ'
|
||||
169: 52, # 'ฉ'
|
||||
170: 34, # 'ช'
|
||||
171: 51, # 'ซ'
|
||||
172: 119, # 'ฌ'
|
||||
173: 47, # 'ญ'
|
||||
174: 58, # 'ฎ'
|
||||
175: 57, # 'ฏ'
|
||||
176: 49, # 'ฐ'
|
||||
177: 53, # 'ฑ'
|
||||
178: 55, # 'ฒ'
|
||||
179: 43, # 'ณ'
|
||||
180: 20, # 'ด'
|
||||
181: 19, # 'ต'
|
||||
182: 44, # 'ถ'
|
||||
183: 14, # 'ท'
|
||||
184: 48, # 'ธ'
|
||||
185: 3, # 'น'
|
||||
186: 17, # 'บ'
|
||||
187: 25, # 'ป'
|
||||
188: 39, # 'ผ'
|
||||
189: 62, # 'ฝ'
|
||||
190: 31, # 'พ'
|
||||
191: 54, # 'ฟ'
|
||||
192: 45, # 'ภ'
|
||||
193: 9, # 'ม'
|
||||
194: 16, # 'ย'
|
||||
195: 2, # 'ร'
|
||||
196: 61, # 'ฤ'
|
||||
197: 15, # 'ล'
|
||||
198: 239, # 'ฦ'
|
||||
199: 12, # 'ว'
|
||||
200: 42, # 'ศ'
|
||||
201: 46, # 'ษ'
|
||||
202: 18, # 'ส'
|
||||
203: 21, # 'ห'
|
||||
204: 76, # 'ฬ'
|
||||
205: 4, # 'อ'
|
||||
206: 66, # 'ฮ'
|
||||
207: 63, # 'ฯ'
|
||||
208: 22, # 'ะ'
|
||||
209: 10, # 'ั'
|
||||
210: 1, # 'า'
|
||||
211: 36, # 'ำ'
|
||||
212: 23, # 'ิ'
|
||||
213: 13, # 'ี'
|
||||
214: 40, # 'ึ'
|
||||
215: 27, # 'ื'
|
||||
216: 32, # 'ุ'
|
||||
217: 35, # 'ู'
|
||||
218: 86, # 'ฺ'
|
||||
219: 240, # None
|
||||
220: 241, # None
|
||||
221: 242, # None
|
||||
222: 243, # None
|
||||
223: 244, # '฿'
|
||||
224: 11, # 'เ'
|
||||
225: 28, # 'แ'
|
||||
226: 41, # 'โ'
|
||||
227: 29, # 'ใ'
|
||||
228: 33, # 'ไ'
|
||||
229: 245, # 'ๅ'
|
||||
230: 50, # 'ๆ'
|
||||
231: 37, # '็'
|
||||
232: 6, # '่'
|
||||
233: 7, # '้'
|
||||
234: 67, # '๊'
|
||||
235: 77, # '๋'
|
||||
236: 38, # '์'
|
||||
237: 93, # 'ํ'
|
||||
238: 246, # '๎'
|
||||
239: 247, # '๏'
|
||||
240: 68, # '๐'
|
||||
241: 56, # '๑'
|
||||
242: 59, # '๒'
|
||||
243: 65, # '๓'
|
||||
244: 69, # '๔'
|
||||
245: 60, # '๕'
|
||||
246: 70, # '๖'
|
||||
247: 80, # '๗'
|
||||
248: 71, # '๘'
|
||||
249: 87, # '๙'
|
||||
250: 248, # '๚'
|
||||
251: 249, # '๛'
|
||||
252: 250, # None
|
||||
253: 251, # None
|
||||
254: 252, # None
|
||||
255: 253, # None
|
||||
}
|
||||
|
||||
TIS_620_THAI_MODEL = SingleByteCharSetModel(charset_name='TIS-620',
|
||||
language='Thai',
|
||||
char_to_order_map=TIS_620_THAI_CHAR_TO_ORDER,
|
||||
language_model=THAI_LANG_MODEL,
|
||||
typical_positive_ratio=0.926386,
|
||||
keep_ascii_letters=False,
|
||||
alphabet='กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛')
|
||||
|
||||
TIS_620_THAI_MODEL = SingleByteCharSetModel(
|
||||
charset_name="TIS-620",
|
||||
language="Thai",
|
||||
char_to_order_map=TIS_620_THAI_CHAR_TO_ORDER,
|
||||
language_model=THAI_LANG_MODEL,
|
||||
typical_positive_ratio=0.926386,
|
||||
keep_ascii_letters=False,
|
||||
alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
|
||||
)
|
||||
|
||||
@@ -1,9 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from pip._vendor.chardet.sbcharsetprober import SingleByteCharSetModel
|
||||
|
||||
|
||||
# 3: Positive
|
||||
# 2: Likely
|
||||
# 1: Unlikely
|
||||
@@ -4115,269 +4111,270 @@ TURKISH_LANG_MODEL = {
|
||||
|
||||
# Character Mapping Table(s):
|
||||
ISO_8859_9_TURKISH_CHAR_TO_ORDER = {
|
||||
0: 255, # '\x00'
|
||||
1: 255, # '\x01'
|
||||
2: 255, # '\x02'
|
||||
3: 255, # '\x03'
|
||||
4: 255, # '\x04'
|
||||
5: 255, # '\x05'
|
||||
6: 255, # '\x06'
|
||||
7: 255, # '\x07'
|
||||
8: 255, # '\x08'
|
||||
9: 255, # '\t'
|
||||
10: 255, # '\n'
|
||||
11: 255, # '\x0b'
|
||||
12: 255, # '\x0c'
|
||||
13: 255, # '\r'
|
||||
14: 255, # '\x0e'
|
||||
15: 255, # '\x0f'
|
||||
16: 255, # '\x10'
|
||||
17: 255, # '\x11'
|
||||
18: 255, # '\x12'
|
||||
19: 255, # '\x13'
|
||||
20: 255, # '\x14'
|
||||
21: 255, # '\x15'
|
||||
22: 255, # '\x16'
|
||||
23: 255, # '\x17'
|
||||
24: 255, # '\x18'
|
||||
25: 255, # '\x19'
|
||||
26: 255, # '\x1a'
|
||||
27: 255, # '\x1b'
|
||||
28: 255, # '\x1c'
|
||||
29: 255, # '\x1d'
|
||||
30: 255, # '\x1e'
|
||||
31: 255, # '\x1f'
|
||||
32: 255, # ' '
|
||||
33: 255, # '!'
|
||||
34: 255, # '"'
|
||||
35: 255, # '#'
|
||||
36: 255, # '$'
|
||||
37: 255, # '%'
|
||||
38: 255, # '&'
|
||||
39: 255, # "'"
|
||||
40: 255, # '('
|
||||
41: 255, # ')'
|
||||
42: 255, # '*'
|
||||
43: 255, # '+'
|
||||
44: 255, # ','
|
||||
45: 255, # '-'
|
||||
46: 255, # '.'
|
||||
47: 255, # '/'
|
||||
48: 255, # '0'
|
||||
49: 255, # '1'
|
||||
50: 255, # '2'
|
||||
51: 255, # '3'
|
||||
52: 255, # '4'
|
||||
53: 255, # '5'
|
||||
54: 255, # '6'
|
||||
55: 255, # '7'
|
||||
56: 255, # '8'
|
||||
57: 255, # '9'
|
||||
58: 255, # ':'
|
||||
59: 255, # ';'
|
||||
60: 255, # '<'
|
||||
61: 255, # '='
|
||||
62: 255, # '>'
|
||||
63: 255, # '?'
|
||||
64: 255, # '@'
|
||||
65: 23, # 'A'
|
||||
66: 37, # 'B'
|
||||
67: 47, # 'C'
|
||||
68: 39, # 'D'
|
||||
69: 29, # 'E'
|
||||
70: 52, # 'F'
|
||||
71: 36, # 'G'
|
||||
72: 45, # 'H'
|
||||
73: 53, # 'I'
|
||||
74: 60, # 'J'
|
||||
75: 16, # 'K'
|
||||
76: 49, # 'L'
|
||||
77: 20, # 'M'
|
||||
78: 46, # 'N'
|
||||
79: 42, # 'O'
|
||||
80: 48, # 'P'
|
||||
81: 69, # 'Q'
|
||||
82: 44, # 'R'
|
||||
83: 35, # 'S'
|
||||
84: 31, # 'T'
|
||||
85: 51, # 'U'
|
||||
86: 38, # 'V'
|
||||
87: 62, # 'W'
|
||||
88: 65, # 'X'
|
||||
89: 43, # 'Y'
|
||||
90: 56, # 'Z'
|
||||
91: 255, # '['
|
||||
92: 255, # '\\'
|
||||
93: 255, # ']'
|
||||
94: 255, # '^'
|
||||
95: 255, # '_'
|
||||
96: 255, # '`'
|
||||
97: 1, # 'a'
|
||||
98: 21, # 'b'
|
||||
99: 28, # 'c'
|
||||
100: 12, # 'd'
|
||||
101: 2, # 'e'
|
||||
102: 18, # 'f'
|
||||
103: 27, # 'g'
|
||||
104: 25, # 'h'
|
||||
105: 3, # 'i'
|
||||
106: 24, # 'j'
|
||||
107: 10, # 'k'
|
||||
108: 5, # 'l'
|
||||
109: 13, # 'm'
|
||||
110: 4, # 'n'
|
||||
111: 15, # 'o'
|
||||
112: 26, # 'p'
|
||||
113: 64, # 'q'
|
||||
114: 7, # 'r'
|
||||
115: 8, # 's'
|
||||
116: 9, # 't'
|
||||
117: 14, # 'u'
|
||||
118: 32, # 'v'
|
||||
119: 57, # 'w'
|
||||
120: 58, # 'x'
|
||||
121: 11, # 'y'
|
||||
122: 22, # 'z'
|
||||
123: 255, # '{'
|
||||
124: 255, # '|'
|
||||
125: 255, # '}'
|
||||
126: 255, # '~'
|
||||
127: 255, # '\x7f'
|
||||
128: 180, # '\x80'
|
||||
129: 179, # '\x81'
|
||||
130: 178, # '\x82'
|
||||
131: 177, # '\x83'
|
||||
132: 176, # '\x84'
|
||||
133: 175, # '\x85'
|
||||
134: 174, # '\x86'
|
||||
135: 173, # '\x87'
|
||||
136: 172, # '\x88'
|
||||
137: 171, # '\x89'
|
||||
138: 170, # '\x8a'
|
||||
139: 169, # '\x8b'
|
||||
140: 168, # '\x8c'
|
||||
141: 167, # '\x8d'
|
||||
142: 166, # '\x8e'
|
||||
143: 165, # '\x8f'
|
||||
144: 164, # '\x90'
|
||||
145: 163, # '\x91'
|
||||
146: 162, # '\x92'
|
||||
147: 161, # '\x93'
|
||||
148: 160, # '\x94'
|
||||
149: 159, # '\x95'
|
||||
150: 101, # '\x96'
|
||||
151: 158, # '\x97'
|
||||
152: 157, # '\x98'
|
||||
153: 156, # '\x99'
|
||||
154: 155, # '\x9a'
|
||||
155: 154, # '\x9b'
|
||||
156: 153, # '\x9c'
|
||||
157: 152, # '\x9d'
|
||||
158: 151, # '\x9e'
|
||||
159: 106, # '\x9f'
|
||||
160: 150, # '\xa0'
|
||||
161: 149, # '¡'
|
||||
162: 148, # '¢'
|
||||
163: 147, # '£'
|
||||
164: 146, # '¤'
|
||||
165: 145, # '¥'
|
||||
166: 144, # '¦'
|
||||
167: 100, # '§'
|
||||
168: 143, # '¨'
|
||||
169: 142, # '©'
|
||||
170: 141, # 'ª'
|
||||
171: 140, # '«'
|
||||
172: 139, # '¬'
|
||||
173: 138, # '\xad'
|
||||
174: 137, # '®'
|
||||
175: 136, # '¯'
|
||||
176: 94, # '°'
|
||||
177: 80, # '±'
|
||||
178: 93, # '²'
|
||||
179: 135, # '³'
|
||||
180: 105, # '´'
|
||||
181: 134, # 'µ'
|
||||
182: 133, # '¶'
|
||||
183: 63, # '·'
|
||||
184: 132, # '¸'
|
||||
185: 131, # '¹'
|
||||
186: 130, # 'º'
|
||||
187: 129, # '»'
|
||||
188: 128, # '¼'
|
||||
189: 127, # '½'
|
||||
190: 126, # '¾'
|
||||
191: 125, # '¿'
|
||||
192: 124, # 'À'
|
||||
193: 104, # 'Á'
|
||||
194: 73, # 'Â'
|
||||
195: 99, # 'Ã'
|
||||
196: 79, # 'Ä'
|
||||
197: 85, # 'Å'
|
||||
198: 123, # 'Æ'
|
||||
199: 54, # 'Ç'
|
||||
200: 122, # 'È'
|
||||
201: 98, # 'É'
|
||||
202: 92, # 'Ê'
|
||||
203: 121, # 'Ë'
|
||||
204: 120, # 'Ì'
|
||||
205: 91, # 'Í'
|
||||
206: 103, # 'Î'
|
||||
207: 119, # 'Ï'
|
||||
208: 68, # 'Ğ'
|
||||
209: 118, # 'Ñ'
|
||||
210: 117, # 'Ò'
|
||||
211: 97, # 'Ó'
|
||||
212: 116, # 'Ô'
|
||||
213: 115, # 'Õ'
|
||||
214: 50, # 'Ö'
|
||||
215: 90, # '×'
|
||||
216: 114, # 'Ø'
|
||||
217: 113, # 'Ù'
|
||||
218: 112, # 'Ú'
|
||||
219: 111, # 'Û'
|
||||
220: 55, # 'Ü'
|
||||
221: 41, # 'İ'
|
||||
222: 40, # 'Ş'
|
||||
223: 86, # 'ß'
|
||||
224: 89, # 'à'
|
||||
225: 70, # 'á'
|
||||
226: 59, # 'â'
|
||||
227: 78, # 'ã'
|
||||
228: 71, # 'ä'
|
||||
229: 82, # 'å'
|
||||
230: 88, # 'æ'
|
||||
231: 33, # 'ç'
|
||||
232: 77, # 'è'
|
||||
233: 66, # 'é'
|
||||
234: 84, # 'ê'
|
||||
235: 83, # 'ë'
|
||||
236: 110, # 'ì'
|
||||
237: 75, # 'í'
|
||||
238: 61, # 'î'
|
||||
239: 96, # 'ï'
|
||||
240: 30, # 'ğ'
|
||||
241: 67, # 'ñ'
|
||||
242: 109, # 'ò'
|
||||
243: 74, # 'ó'
|
||||
244: 87, # 'ô'
|
||||
245: 102, # 'õ'
|
||||
246: 34, # 'ö'
|
||||
247: 95, # '÷'
|
||||
248: 81, # 'ø'
|
||||
249: 108, # 'ù'
|
||||
250: 76, # 'ú'
|
||||
251: 72, # 'û'
|
||||
252: 17, # 'ü'
|
||||
253: 6, # 'ı'
|
||||
254: 19, # 'ş'
|
||||
255: 107, # 'ÿ'
|
||||
0: 255, # '\x00'
|
||||
1: 255, # '\x01'
|
||||
2: 255, # '\x02'
|
||||
3: 255, # '\x03'
|
||||
4: 255, # '\x04'
|
||||
5: 255, # '\x05'
|
||||
6: 255, # '\x06'
|
||||
7: 255, # '\x07'
|
||||
8: 255, # '\x08'
|
||||
9: 255, # '\t'
|
||||
10: 255, # '\n'
|
||||
11: 255, # '\x0b'
|
||||
12: 255, # '\x0c'
|
||||
13: 255, # '\r'
|
||||
14: 255, # '\x0e'
|
||||
15: 255, # '\x0f'
|
||||
16: 255, # '\x10'
|
||||
17: 255, # '\x11'
|
||||
18: 255, # '\x12'
|
||||
19: 255, # '\x13'
|
||||
20: 255, # '\x14'
|
||||
21: 255, # '\x15'
|
||||
22: 255, # '\x16'
|
||||
23: 255, # '\x17'
|
||||
24: 255, # '\x18'
|
||||
25: 255, # '\x19'
|
||||
26: 255, # '\x1a'
|
||||
27: 255, # '\x1b'
|
||||
28: 255, # '\x1c'
|
||||
29: 255, # '\x1d'
|
||||
30: 255, # '\x1e'
|
||||
31: 255, # '\x1f'
|
||||
32: 255, # ' '
|
||||
33: 255, # '!'
|
||||
34: 255, # '"'
|
||||
35: 255, # '#'
|
||||
36: 255, # '$'
|
||||
37: 255, # '%'
|
||||
38: 255, # '&'
|
||||
39: 255, # "'"
|
||||
40: 255, # '('
|
||||
41: 255, # ')'
|
||||
42: 255, # '*'
|
||||
43: 255, # '+'
|
||||
44: 255, # ','
|
||||
45: 255, # '-'
|
||||
46: 255, # '.'
|
||||
47: 255, # '/'
|
||||
48: 255, # '0'
|
||||
49: 255, # '1'
|
||||
50: 255, # '2'
|
||||
51: 255, # '3'
|
||||
52: 255, # '4'
|
||||
53: 255, # '5'
|
||||
54: 255, # '6'
|
||||
55: 255, # '7'
|
||||
56: 255, # '8'
|
||||
57: 255, # '9'
|
||||
58: 255, # ':'
|
||||
59: 255, # ';'
|
||||
60: 255, # '<'
|
||||
61: 255, # '='
|
||||
62: 255, # '>'
|
||||
63: 255, # '?'
|
||||
64: 255, # '@'
|
||||
65: 23, # 'A'
|
||||
66: 37, # 'B'
|
||||
67: 47, # 'C'
|
||||
68: 39, # 'D'
|
||||
69: 29, # 'E'
|
||||
70: 52, # 'F'
|
||||
71: 36, # 'G'
|
||||
72: 45, # 'H'
|
||||
73: 53, # 'I'
|
||||
74: 60, # 'J'
|
||||
75: 16, # 'K'
|
||||
76: 49, # 'L'
|
||||
77: 20, # 'M'
|
||||
78: 46, # 'N'
|
||||
79: 42, # 'O'
|
||||
80: 48, # 'P'
|
||||
81: 69, # 'Q'
|
||||
82: 44, # 'R'
|
||||
83: 35, # 'S'
|
||||
84: 31, # 'T'
|
||||
85: 51, # 'U'
|
||||
86: 38, # 'V'
|
||||
87: 62, # 'W'
|
||||
88: 65, # 'X'
|
||||
89: 43, # 'Y'
|
||||
90: 56, # 'Z'
|
||||
91: 255, # '['
|
||||
92: 255, # '\\'
|
||||
93: 255, # ']'
|
||||
94: 255, # '^'
|
||||
95: 255, # '_'
|
||||
96: 255, # '`'
|
||||
97: 1, # 'a'
|
||||
98: 21, # 'b'
|
||||
99: 28, # 'c'
|
||||
100: 12, # 'd'
|
||||
101: 2, # 'e'
|
||||
102: 18, # 'f'
|
||||
103: 27, # 'g'
|
||||
104: 25, # 'h'
|
||||
105: 3, # 'i'
|
||||
106: 24, # 'j'
|
||||
107: 10, # 'k'
|
||||
108: 5, # 'l'
|
||||
109: 13, # 'm'
|
||||
110: 4, # 'n'
|
||||
111: 15, # 'o'
|
||||
112: 26, # 'p'
|
||||
113: 64, # 'q'
|
||||
114: 7, # 'r'
|
||||
115: 8, # 's'
|
||||
116: 9, # 't'
|
||||
117: 14, # 'u'
|
||||
118: 32, # 'v'
|
||||
119: 57, # 'w'
|
||||
120: 58, # 'x'
|
||||
121: 11, # 'y'
|
||||
122: 22, # 'z'
|
||||
123: 255, # '{'
|
||||
124: 255, # '|'
|
||||
125: 255, # '}'
|
||||
126: 255, # '~'
|
||||
127: 255, # '\x7f'
|
||||
128: 180, # '\x80'
|
||||
129: 179, # '\x81'
|
||||
130: 178, # '\x82'
|
||||
131: 177, # '\x83'
|
||||
132: 176, # '\x84'
|
||||
133: 175, # '\x85'
|
||||
134: 174, # '\x86'
|
||||
135: 173, # '\x87'
|
||||
136: 172, # '\x88'
|
||||
137: 171, # '\x89'
|
||||
138: 170, # '\x8a'
|
||||
139: 169, # '\x8b'
|
||||
140: 168, # '\x8c'
|
||||
141: 167, # '\x8d'
|
||||
142: 166, # '\x8e'
|
||||
143: 165, # '\x8f'
|
||||
144: 164, # '\x90'
|
||||
145: 163, # '\x91'
|
||||
146: 162, # '\x92'
|
||||
147: 161, # '\x93'
|
||||
148: 160, # '\x94'
|
||||
149: 159, # '\x95'
|
||||
150: 101, # '\x96'
|
||||
151: 158, # '\x97'
|
||||
152: 157, # '\x98'
|
||||
153: 156, # '\x99'
|
||||
154: 155, # '\x9a'
|
||||
155: 154, # '\x9b'
|
||||
156: 153, # '\x9c'
|
||||
157: 152, # '\x9d'
|
||||
158: 151, # '\x9e'
|
||||
159: 106, # '\x9f'
|
||||
160: 150, # '\xa0'
|
||||
161: 149, # '¡'
|
||||
162: 148, # '¢'
|
||||
163: 147, # '£'
|
||||
164: 146, # '¤'
|
||||
165: 145, # '¥'
|
||||
166: 144, # '¦'
|
||||
167: 100, # '§'
|
||||
168: 143, # '¨'
|
||||
169: 142, # '©'
|
||||
170: 141, # 'ª'
|
||||
171: 140, # '«'
|
||||
172: 139, # '¬'
|
||||
173: 138, # '\xad'
|
||||
174: 137, # '®'
|
||||
175: 136, # '¯'
|
||||
176: 94, # '°'
|
||||
177: 80, # '±'
|
||||
178: 93, # '²'
|
||||
179: 135, # '³'
|
||||
180: 105, # '´'
|
||||
181: 134, # 'µ'
|
||||
182: 133, # '¶'
|
||||
183: 63, # '·'
|
||||
184: 132, # '¸'
|
||||
185: 131, # '¹'
|
||||
186: 130, # 'º'
|
||||
187: 129, # '»'
|
||||
188: 128, # '¼'
|
||||
189: 127, # '½'
|
||||
190: 126, # '¾'
|
||||
191: 125, # '¿'
|
||||
192: 124, # 'À'
|
||||
193: 104, # 'Á'
|
||||
194: 73, # 'Â'
|
||||
195: 99, # 'Ã'
|
||||
196: 79, # 'Ä'
|
||||
197: 85, # 'Å'
|
||||
198: 123, # 'Æ'
|
||||
199: 54, # 'Ç'
|
||||
200: 122, # 'È'
|
||||
201: 98, # 'É'
|
||||
202: 92, # 'Ê'
|
||||
203: 121, # 'Ë'
|
||||
204: 120, # 'Ì'
|
||||
205: 91, # 'Í'
|
||||
206: 103, # 'Î'
|
||||
207: 119, # 'Ï'
|
||||
208: 68, # 'Ğ'
|
||||
209: 118, # 'Ñ'
|
||||
210: 117, # 'Ò'
|
||||
211: 97, # 'Ó'
|
||||
212: 116, # 'Ô'
|
||||
213: 115, # 'Õ'
|
||||
214: 50, # 'Ö'
|
||||
215: 90, # '×'
|
||||
216: 114, # 'Ø'
|
||||
217: 113, # 'Ù'
|
||||
218: 112, # 'Ú'
|
||||
219: 111, # 'Û'
|
||||
220: 55, # 'Ü'
|
||||
221: 41, # 'İ'
|
||||
222: 40, # 'Ş'
|
||||
223: 86, # 'ß'
|
||||
224: 89, # 'à'
|
||||
225: 70, # 'á'
|
||||
226: 59, # 'â'
|
||||
227: 78, # 'ã'
|
||||
228: 71, # 'ä'
|
||||
229: 82, # 'å'
|
||||
230: 88, # 'æ'
|
||||
231: 33, # 'ç'
|
||||
232: 77, # 'è'
|
||||
233: 66, # 'é'
|
||||
234: 84, # 'ê'
|
||||
235: 83, # 'ë'
|
||||
236: 110, # 'ì'
|
||||
237: 75, # 'í'
|
||||
238: 61, # 'î'
|
||||
239: 96, # 'ï'
|
||||
240: 30, # 'ğ'
|
||||
241: 67, # 'ñ'
|
||||
242: 109, # 'ò'
|
||||
243: 74, # 'ó'
|
||||
244: 87, # 'ô'
|
||||
245: 102, # 'õ'
|
||||
246: 34, # 'ö'
|
||||
247: 95, # '÷'
|
||||
248: 81, # 'ø'
|
||||
249: 108, # 'ù'
|
||||
250: 76, # 'ú'
|
||||
251: 72, # 'û'
|
||||
252: 17, # 'ü'
|
||||
253: 6, # 'ı'
|
||||
254: 19, # 'ş'
|
||||
255: 107, # 'ÿ'
|
||||
}
|
||||
|
||||
ISO_8859_9_TURKISH_MODEL = SingleByteCharSetModel(charset_name='ISO-8859-9',
|
||||
language='Turkish',
|
||||
char_to_order_map=ISO_8859_9_TURKISH_CHAR_TO_ORDER,
|
||||
language_model=TURKISH_LANG_MODEL,
|
||||
typical_positive_ratio=0.97029,
|
||||
keep_ascii_letters=True,
|
||||
alphabet='ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş')
|
||||
|
||||
ISO_8859_9_TURKISH_MODEL = SingleByteCharSetModel(
|
||||
charset_name="ISO-8859-9",
|
||||
language="Turkish",
|
||||
char_to_order_map=ISO_8859_9_TURKISH_CHAR_TO_ORDER,
|
||||
language_model=TURKISH_LANG_MODEL,
|
||||
typical_positive_ratio=0.97029,
|
||||
keep_ascii_letters=True,
|
||||
alphabet="ABCDEFGHIJKLMNOPRSTUVYZabcdefghijklmnoprstuvyzÂÇÎÖÛÜâçîöûüĞğİıŞş",
|
||||
)
|
||||
|
||||
@@ -26,6 +26,8 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import List, Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import ProbingState
|
||||
|
||||
@@ -41,6 +43,7 @@ ASV = 6 # accent small vowel
|
||||
ASO = 7 # accent small other
|
||||
CLASS_NUM = 8 # total classes
|
||||
|
||||
# fmt: off
|
||||
Latin1_CharToClass = (
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
|
||||
@@ -91,34 +94,34 @@ Latin1ClassModel = (
|
||||
0, 3, 1, 3, 1, 1, 1, 3, # ASV
|
||||
0, 3, 1, 3, 1, 1, 3, 3, # ASO
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
|
||||
class Latin1Prober(CharSetProber):
|
||||
def __init__(self):
|
||||
super(Latin1Prober, self).__init__()
|
||||
self._last_char_class = None
|
||||
self._freq_counter = None
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._last_char_class = OTH
|
||||
self._freq_counter: List[int] = []
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
self._last_char_class = OTH
|
||||
self._freq_counter = [0] * FREQ_CAT_NUM
|
||||
CharSetProber.reset(self)
|
||||
super().reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "ISO-8859-1"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return ""
|
||||
|
||||
def feed(self, byte_str):
|
||||
byte_str = self.filter_with_english_letters(byte_str)
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
byte_str = self.remove_xml_tags(byte_str)
|
||||
for c in byte_str:
|
||||
char_class = Latin1_CharToClass[c]
|
||||
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM)
|
||||
+ char_class]
|
||||
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM) + char_class]
|
||||
if freq == 0:
|
||||
self._state = ProbingState.NOT_ME
|
||||
break
|
||||
@@ -127,19 +130,18 @@ class Latin1Prober(CharSetProber):
|
||||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
if self.state == ProbingState.NOT_ME:
|
||||
return 0.01
|
||||
|
||||
total = sum(self._freq_counter)
|
||||
if total < 0.01:
|
||||
confidence = 0.0
|
||||
else:
|
||||
confidence = ((self._freq_counter[3] - self._freq_counter[1] * 20.0)
|
||||
/ total)
|
||||
if confidence < 0.0:
|
||||
confidence = 0.0
|
||||
confidence = (
|
||||
0.0
|
||||
if total < 0.01
|
||||
else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total
|
||||
)
|
||||
confidence = max(confidence, 0.0)
|
||||
# lower the confidence of latin1 so that other more accurate
|
||||
# detector can take priority.
|
||||
confidence = confidence * 0.73
|
||||
confidence *= 0.73
|
||||
return confidence
|
||||
|
||||
@@ -27,8 +27,12 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
from .chardistribution import CharDistributionAnalysis
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import ProbingState, MachineState
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .enums import LanguageFilter, MachineState, ProbingState
|
||||
|
||||
|
||||
class MultiByteCharSetProber(CharSetProber):
|
||||
@@ -36,56 +40,56 @@ class MultiByteCharSetProber(CharSetProber):
|
||||
MultiByteCharSetProber
|
||||
"""
|
||||
|
||||
def __init__(self, lang_filter=None):
|
||||
super(MultiByteCharSetProber, self).__init__(lang_filter=lang_filter)
|
||||
self.distribution_analyzer = None
|
||||
self.coding_sm = None
|
||||
self._last_char = [0, 0]
|
||||
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||
super().__init__(lang_filter=lang_filter)
|
||||
self.distribution_analyzer: Optional[CharDistributionAnalysis] = None
|
||||
self.coding_sm: Optional[CodingStateMachine] = None
|
||||
self._last_char = bytearray(b"\0\0")
|
||||
|
||||
def reset(self):
|
||||
super(MultiByteCharSetProber, self).reset()
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
if self.coding_sm:
|
||||
self.coding_sm.reset()
|
||||
if self.distribution_analyzer:
|
||||
self.distribution_analyzer.reset()
|
||||
self._last_char = [0, 0]
|
||||
self._last_char = bytearray(b"\0\0")
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
raise NotImplementedError
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
assert self.coding_sm is not None
|
||||
assert self.distribution_analyzer is not None
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def feed(self, byte_str):
|
||||
for i in range(len(byte_str)):
|
||||
coding_state = self.coding_sm.next_state(byte_str[i])
|
||||
for i, byte in enumerate(byte_str):
|
||||
coding_state = self.coding_sm.next_state(byte)
|
||||
if coding_state == MachineState.ERROR:
|
||||
self.logger.debug('%s %s prober hit error at byte %s',
|
||||
self.charset_name, self.language, i)
|
||||
self.logger.debug(
|
||||
"%s %s prober hit error at byte %s",
|
||||
self.charset_name,
|
||||
self.language,
|
||||
i,
|
||||
)
|
||||
self._state = ProbingState.NOT_ME
|
||||
break
|
||||
elif coding_state == MachineState.ITS_ME:
|
||||
if coding_state == MachineState.ITS_ME:
|
||||
self._state = ProbingState.FOUND_IT
|
||||
break
|
||||
elif coding_state == MachineState.START:
|
||||
if coding_state == MachineState.START:
|
||||
char_len = self.coding_sm.get_current_charlen()
|
||||
if i == 0:
|
||||
self._last_char[1] = byte_str[0]
|
||||
self._last_char[1] = byte
|
||||
self.distribution_analyzer.feed(self._last_char, char_len)
|
||||
else:
|
||||
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
|
||||
char_len)
|
||||
self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
|
||||
|
||||
self._last_char[0] = byte_str[-1]
|
||||
|
||||
if self.state == ProbingState.DETECTING:
|
||||
if (self.distribution_analyzer.got_enough_data() and
|
||||
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
||||
if self.distribution_analyzer.got_enough_data() and (
|
||||
self.get_confidence() > self.SHORTCUT_THRESHOLD
|
||||
):
|
||||
self._state = ProbingState.FOUND_IT
|
||||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
assert self.distribution_analyzer is not None
|
||||
return self.distribution_analyzer.get_confidence()
|
||||
|
||||
@@ -27,20 +27,22 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .charsetgroupprober import CharSetGroupProber
|
||||
from .utf8prober import UTF8Prober
|
||||
from .sjisprober import SJISProber
|
||||
from .eucjpprober import EUCJPProber
|
||||
from .gb2312prober import GB2312Prober
|
||||
from .euckrprober import EUCKRProber
|
||||
from .cp949prober import CP949Prober
|
||||
from .big5prober import Big5Prober
|
||||
from .charsetgroupprober import CharSetGroupProber
|
||||
from .cp949prober import CP949Prober
|
||||
from .enums import LanguageFilter
|
||||
from .eucjpprober import EUCJPProber
|
||||
from .euckrprober import EUCKRProber
|
||||
from .euctwprober import EUCTWProber
|
||||
from .gb2312prober import GB2312Prober
|
||||
from .johabprober import JOHABProber
|
||||
from .sjisprober import SJISProber
|
||||
from .utf8prober import UTF8Prober
|
||||
|
||||
|
||||
class MBCSGroupProber(CharSetGroupProber):
|
||||
def __init__(self, lang_filter=None):
|
||||
super(MBCSGroupProber, self).__init__(lang_filter=lang_filter)
|
||||
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||
super().__init__(lang_filter=lang_filter)
|
||||
self.probers = [
|
||||
UTF8Prober(),
|
||||
SJISProber(),
|
||||
@@ -49,6 +51,7 @@ class MBCSGroupProber(CharSetGroupProber):
|
||||
EUCKRProber(),
|
||||
CP949Prober(),
|
||||
Big5Prober(),
|
||||
EUCTWProber()
|
||||
EUCTWProber(),
|
||||
JOHABProber(),
|
||||
]
|
||||
self.reset()
|
||||
|
||||
@@ -25,43 +25,45 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .codingstatemachinedict import CodingStateMachineDict
|
||||
from .enums import MachineState
|
||||
|
||||
# BIG5
|
||||
|
||||
# fmt: off
|
||||
BIG5_CLS = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as legal value
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,1, # 78 - 7f
|
||||
4,4,4,4,4,4,4,4, # 80 - 87
|
||||
4,4,4,4,4,4,4,4, # 88 - 8f
|
||||
4,4,4,4,4,4,4,4, # 90 - 97
|
||||
4,4,4,4,4,4,4,4, # 98 - 9f
|
||||
4,3,3,3,3,3,3,3, # a0 - a7
|
||||
3,3,3,3,3,3,3,3, # a8 - af
|
||||
3,3,3,3,3,3,3,3, # b0 - b7
|
||||
3,3,3,3,3,3,3,3, # b8 - bf
|
||||
3,3,3,3,3,3,3,3, # c0 - c7
|
||||
3,3,3,3,3,3,3,3, # c8 - cf
|
||||
3,3,3,3,3,3,3,3, # d0 - d7
|
||||
3,3,3,3,3,3,3,3, # d8 - df
|
||||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,3,3,3, # e8 - ef
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,3,3,0 # f8 - ff
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 #allow 0x00 as legal value
|
||||
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
|
||||
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77
|
||||
2, 2, 2, 2, 2, 2, 2, 1, # 78 - 7f
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 80 - 87
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 88 - 8f
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 90 - 97
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 98 - 9f
|
||||
4, 3, 3, 3, 3, 3, 3, 3, # a0 - a7
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # a8 - af
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # b0 - b7
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # b8 - bf
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # c0 - c7
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # c8 - cf
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # d0 - d7
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # d8 - df
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # e0 - e7
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # e8 - ef
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # f0 - f7
|
||||
3, 3, 3, 3, 3, 3, 3, 0 # f8 - ff
|
||||
)
|
||||
|
||||
BIG5_ST = (
|
||||
@@ -69,34 +71,37 @@ BIG5_ST = (
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,#08-0f
|
||||
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START#10-17
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
|
||||
|
||||
BIG5_SM_MODEL = {'class_table': BIG5_CLS,
|
||||
'class_factor': 5,
|
||||
'state_table': BIG5_ST,
|
||||
'char_len_table': BIG5_CHAR_LEN_TABLE,
|
||||
'name': 'Big5'}
|
||||
BIG5_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": BIG5_CLS,
|
||||
"class_factor": 5,
|
||||
"state_table": BIG5_ST,
|
||||
"char_len_table": BIG5_CHAR_LEN_TABLE,
|
||||
"name": "Big5",
|
||||
}
|
||||
|
||||
# CP949
|
||||
|
||||
# fmt: off
|
||||
CP949_CLS = (
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
|
||||
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
|
||||
1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f
|
||||
4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f
|
||||
1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f
|
||||
5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f
|
||||
0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f
|
||||
6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f
|
||||
6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af
|
||||
7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf
|
||||
7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef
|
||||
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, # 00 - 0f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, # 10 - 1f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 20 - 2f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 30 - 3f
|
||||
1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, # 40 - 4f
|
||||
4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, # 50 - 5f
|
||||
1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, # 60 - 6f
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, # 70 - 7f
|
||||
0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, # 80 - 8f
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, # 90 - 9f
|
||||
6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, # a0 - af
|
||||
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, # b0 - bf
|
||||
7, 7, 7, 7, 7, 7, 9, 2, 2, 3, 2, 2, 2, 2, 2, 2, # c0 - cf
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # d0 - df
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # e0 - ef
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, # f0 - ff
|
||||
)
|
||||
|
||||
CP949_ST = (
|
||||
@@ -109,50 +114,53 @@ CP949_ST = (
|
||||
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START, # 5
|
||||
MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START, # 6
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
|
||||
|
||||
CP949_SM_MODEL = {'class_table': CP949_CLS,
|
||||
'class_factor': 10,
|
||||
'state_table': CP949_ST,
|
||||
'char_len_table': CP949_CHAR_LEN_TABLE,
|
||||
'name': 'CP949'}
|
||||
CP949_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": CP949_CLS,
|
||||
"class_factor": 10,
|
||||
"state_table": CP949_ST,
|
||||
"char_len_table": CP949_CHAR_LEN_TABLE,
|
||||
"name": "CP949",
|
||||
}
|
||||
|
||||
# EUC-JP
|
||||
|
||||
# fmt: off
|
||||
EUCJP_CLS = (
|
||||
4,4,4,4,4,4,4,4, # 00 - 07
|
||||
4,4,4,4,4,4,5,5, # 08 - 0f
|
||||
4,4,4,4,4,4,4,4, # 10 - 17
|
||||
4,4,4,5,4,4,4,4, # 18 - 1f
|
||||
4,4,4,4,4,4,4,4, # 20 - 27
|
||||
4,4,4,4,4,4,4,4, # 28 - 2f
|
||||
4,4,4,4,4,4,4,4, # 30 - 37
|
||||
4,4,4,4,4,4,4,4, # 38 - 3f
|
||||
4,4,4,4,4,4,4,4, # 40 - 47
|
||||
4,4,4,4,4,4,4,4, # 48 - 4f
|
||||
4,4,4,4,4,4,4,4, # 50 - 57
|
||||
4,4,4,4,4,4,4,4, # 58 - 5f
|
||||
4,4,4,4,4,4,4,4, # 60 - 67
|
||||
4,4,4,4,4,4,4,4, # 68 - 6f
|
||||
4,4,4,4,4,4,4,4, # 70 - 77
|
||||
4,4,4,4,4,4,4,4, # 78 - 7f
|
||||
5,5,5,5,5,5,5,5, # 80 - 87
|
||||
5,5,5,5,5,5,1,3, # 88 - 8f
|
||||
5,5,5,5,5,5,5,5, # 90 - 97
|
||||
5,5,5,5,5,5,5,5, # 98 - 9f
|
||||
5,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,0,5 # f8 - ff
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 00 - 07
|
||||
4, 4, 4, 4, 4, 4, 5, 5, # 08 - 0f
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 10 - 17
|
||||
4, 4, 4, 5, 4, 4, 4, 4, # 18 - 1f
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 20 - 27
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 28 - 2f
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 30 - 37
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 38 - 3f
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 40 - 47
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 48 - 4f
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 50 - 57
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 58 - 5f
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 60 - 67
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 68 - 6f
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 70 - 77
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 78 - 7f
|
||||
5, 5, 5, 5, 5, 5, 5, 5, # 80 - 87
|
||||
5, 5, 5, 5, 5, 5, 1, 3, # 88 - 8f
|
||||
5, 5, 5, 5, 5, 5, 5, 5, # 90 - 97
|
||||
5, 5, 5, 5, 5, 5, 5, 5, # 98 - 9f
|
||||
5, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7
|
||||
0, 0, 0, 0, 0, 0, 0, 5 # f8 - ff
|
||||
)
|
||||
|
||||
EUCJP_ST = (
|
||||
@@ -162,100 +170,163 @@ EUCJP_ST = (
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,#18-1f
|
||||
3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START#20-27
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
|
||||
|
||||
EUCJP_SM_MODEL = {'class_table': EUCJP_CLS,
|
||||
'class_factor': 6,
|
||||
'state_table': EUCJP_ST,
|
||||
'char_len_table': EUCJP_CHAR_LEN_TABLE,
|
||||
'name': 'EUC-JP'}
|
||||
EUCJP_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": EUCJP_CLS,
|
||||
"class_factor": 6,
|
||||
"state_table": EUCJP_ST,
|
||||
"char_len_table": EUCJP_CHAR_LEN_TABLE,
|
||||
"name": "EUC-JP",
|
||||
}
|
||||
|
||||
# EUC-KR
|
||||
|
||||
# fmt: off
|
||||
EUCKR_CLS = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
1,1,1,1,1,1,1,1, # 40 - 47
|
||||
1,1,1,1,1,1,1,1, # 48 - 4f
|
||||
1,1,1,1,1,1,1,1, # 50 - 57
|
||||
1,1,1,1,1,1,1,1, # 58 - 5f
|
||||
1,1,1,1,1,1,1,1, # 60 - 67
|
||||
1,1,1,1,1,1,1,1, # 68 - 6f
|
||||
1,1,1,1,1,1,1,1, # 70 - 77
|
||||
1,1,1,1,1,1,1,1, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,3,3,3, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,3,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
2,2,2,2,2,2,2,2, # e0 - e7
|
||||
2,2,2,2,2,2,2,2, # e8 - ef
|
||||
2,2,2,2,2,2,2,2, # f0 - f7
|
||||
2,2,2,2,2,2,2,0 # f8 - ff
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07
|
||||
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
|
||||
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 40 - 47
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 48 - 4f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 50 - 57
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 58 - 5f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 60 - 67
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 68 - 6f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 70 - 77
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 78 - 7f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f
|
||||
0, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
|
||||
2, 2, 2, 2, 2, 3, 3, 3, # a8 - af
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
|
||||
2, 3, 2, 2, 2, 2, 2, 2, # c8 - cf
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7
|
||||
2, 2, 2, 2, 2, 2, 2, 0 # f8 - ff
|
||||
)
|
||||
|
||||
EUCKR_ST = (
|
||||
MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #08-0f
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
|
||||
|
||||
EUCKR_SM_MODEL = {'class_table': EUCKR_CLS,
|
||||
'class_factor': 4,
|
||||
'state_table': EUCKR_ST,
|
||||
'char_len_table': EUCKR_CHAR_LEN_TABLE,
|
||||
'name': 'EUC-KR'}
|
||||
EUCKR_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": EUCKR_CLS,
|
||||
"class_factor": 4,
|
||||
"state_table": EUCKR_ST,
|
||||
"char_len_table": EUCKR_CHAR_LEN_TABLE,
|
||||
"name": "EUC-KR",
|
||||
}
|
||||
|
||||
# JOHAB
|
||||
# fmt: off
|
||||
JOHAB_CLS = (
|
||||
4,4,4,4,4,4,4,4, # 00 - 07
|
||||
4,4,4,4,4,4,0,0, # 08 - 0f
|
||||
4,4,4,4,4,4,4,4, # 10 - 17
|
||||
4,4,4,0,4,4,4,4, # 18 - 1f
|
||||
4,4,4,4,4,4,4,4, # 20 - 27
|
||||
4,4,4,4,4,4,4,4, # 28 - 2f
|
||||
4,3,3,3,3,3,3,3, # 30 - 37
|
||||
3,3,3,3,3,3,3,3, # 38 - 3f
|
||||
3,1,1,1,1,1,1,1, # 40 - 47
|
||||
1,1,1,1,1,1,1,1, # 48 - 4f
|
||||
1,1,1,1,1,1,1,1, # 50 - 57
|
||||
1,1,1,1,1,1,1,1, # 58 - 5f
|
||||
1,1,1,1,1,1,1,1, # 60 - 67
|
||||
1,1,1,1,1,1,1,1, # 68 - 6f
|
||||
1,1,1,1,1,1,1,1, # 70 - 77
|
||||
1,1,1,1,1,1,1,2, # 78 - 7f
|
||||
6,6,6,6,8,8,8,8, # 80 - 87
|
||||
8,8,8,8,8,8,8,8, # 88 - 8f
|
||||
8,7,7,7,7,7,7,7, # 90 - 97
|
||||
7,7,7,7,7,7,7,7, # 98 - 9f
|
||||
7,7,7,7,7,7,7,7, # a0 - a7
|
||||
7,7,7,7,7,7,7,7, # a8 - af
|
||||
7,7,7,7,7,7,7,7, # b0 - b7
|
||||
7,7,7,7,7,7,7,7, # b8 - bf
|
||||
7,7,7,7,7,7,7,7, # c0 - c7
|
||||
7,7,7,7,7,7,7,7, # c8 - cf
|
||||
7,7,7,7,5,5,5,5, # d0 - d7
|
||||
5,9,9,9,9,9,9,5, # d8 - df
|
||||
9,9,9,9,9,9,9,9, # e0 - e7
|
||||
9,9,9,9,9,9,9,9, # e8 - ef
|
||||
9,9,9,9,9,9,9,9, # f0 - f7
|
||||
9,9,5,5,5,5,5,0 # f8 - ff
|
||||
)
|
||||
|
||||
JOHAB_ST = (
|
||||
# cls = 0 1 2 3 4 5 6 7 8 9
|
||||
MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.ERROR ,MachineState.ERROR ,3 ,3 ,4 , # MachineState.START
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # MachineState.ITS_ME
|
||||
MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR , # MachineState.ERROR
|
||||
MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.ERROR ,MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START , # 3
|
||||
MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START , # 4
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2)
|
||||
|
||||
JOHAB_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": JOHAB_CLS,
|
||||
"class_factor": 10,
|
||||
"state_table": JOHAB_ST,
|
||||
"char_len_table": JOHAB_CHAR_LEN_TABLE,
|
||||
"name": "Johab",
|
||||
}
|
||||
|
||||
# EUC-TW
|
||||
|
||||
# fmt: off
|
||||
EUCTW_CLS = (
|
||||
2,2,2,2,2,2,2,2, # 00 - 07
|
||||
2,2,2,2,2,2,0,0, # 08 - 0f
|
||||
2,2,2,2,2,2,2,2, # 10 - 17
|
||||
2,2,2,0,2,2,2,2, # 18 - 1f
|
||||
2,2,2,2,2,2,2,2, # 20 - 27
|
||||
2,2,2,2,2,2,2,2, # 28 - 2f
|
||||
2,2,2,2,2,2,2,2, # 30 - 37
|
||||
2,2,2,2,2,2,2,2, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,2, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,6,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,3,4,4,4,4,4,4, # a0 - a7
|
||||
5,5,1,1,1,1,1,1, # a8 - af
|
||||
1,1,1,1,1,1,1,1, # b0 - b7
|
||||
1,1,1,1,1,1,1,1, # b8 - bf
|
||||
1,1,3,1,3,3,3,3, # c0 - c7
|
||||
3,3,3,3,3,3,3,3, # c8 - cf
|
||||
3,3,3,3,3,3,3,3, # d0 - d7
|
||||
3,3,3,3,3,3,3,3, # d8 - df
|
||||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,3,3,3, # e8 - ef
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,3,3,0 # f8 - ff
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 00 - 07
|
||||
2, 2, 2, 2, 2, 2, 0, 0, # 08 - 0f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 10 - 17
|
||||
2, 2, 2, 0, 2, 2, 2, 2, # 18 - 1f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 20 - 27
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 28 - 2f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 30 - 37
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 38 - 3f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 78 - 7f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87
|
||||
0, 0, 0, 0, 0, 0, 6, 0, # 88 - 8f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f
|
||||
0, 3, 4, 4, 4, 4, 4, 4, # a0 - a7
|
||||
5, 5, 1, 1, 1, 1, 1, 1, # a8 - af
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # b0 - b7
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # b8 - bf
|
||||
1, 1, 3, 1, 3, 3, 3, 3, # c0 - c7
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # c8 - cf
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # d0 - d7
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # d8 - df
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # e0 - e7
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # e8 - ef
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # f0 - f7
|
||||
3, 3, 3, 3, 3, 3, 3, 0 # f8 - ff
|
||||
)
|
||||
|
||||
EUCTW_ST = (
|
||||
@@ -266,50 +337,53 @@ EUCTW_ST = (
|
||||
5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,#20-27
|
||||
MachineState.START,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
|
||||
|
||||
EUCTW_SM_MODEL = {'class_table': EUCTW_CLS,
|
||||
'class_factor': 7,
|
||||
'state_table': EUCTW_ST,
|
||||
'char_len_table': EUCTW_CHAR_LEN_TABLE,
|
||||
'name': 'x-euc-tw'}
|
||||
EUCTW_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": EUCTW_CLS,
|
||||
"class_factor": 7,
|
||||
"state_table": EUCTW_ST,
|
||||
"char_len_table": EUCTW_CHAR_LEN_TABLE,
|
||||
"name": "x-euc-tw",
|
||||
}
|
||||
|
||||
# GB2312
|
||||
|
||||
# fmt: off
|
||||
GB2312_CLS = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
3,3,3,3,3,3,3,3, # 30 - 37
|
||||
3,3,1,1,1,1,1,1, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,4, # 78 - 7f
|
||||
5,6,6,6,6,6,6,6, # 80 - 87
|
||||
6,6,6,6,6,6,6,6, # 88 - 8f
|
||||
6,6,6,6,6,6,6,6, # 90 - 97
|
||||
6,6,6,6,6,6,6,6, # 98 - 9f
|
||||
6,6,6,6,6,6,6,6, # a0 - a7
|
||||
6,6,6,6,6,6,6,6, # a8 - af
|
||||
6,6,6,6,6,6,6,6, # b0 - b7
|
||||
6,6,6,6,6,6,6,6, # b8 - bf
|
||||
6,6,6,6,6,6,6,6, # c0 - c7
|
||||
6,6,6,6,6,6,6,6, # c8 - cf
|
||||
6,6,6,6,6,6,6,6, # d0 - d7
|
||||
6,6,6,6,6,6,6,6, # d8 - df
|
||||
6,6,6,6,6,6,6,6, # e0 - e7
|
||||
6,6,6,6,6,6,6,6, # e8 - ef
|
||||
6,6,6,6,6,6,6,6, # f0 - f7
|
||||
6,6,6,6,6,6,6,0 # f8 - ff
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07
|
||||
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
|
||||
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # 30 - 37
|
||||
3, 3, 1, 1, 1, 1, 1, 1, # 38 - 3f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77
|
||||
2, 2, 2, 2, 2, 2, 2, 4, # 78 - 7f
|
||||
5, 6, 6, 6, 6, 6, 6, 6, # 80 - 87
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # 88 - 8f
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # 90 - 97
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # 98 - 9f
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # a0 - a7
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # a8 - af
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # b0 - b7
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # b8 - bf
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # c0 - c7
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # c8 - cf
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # d0 - d7
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # d8 - df
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # e0 - e7
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # e8 - ef
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # f0 - f7
|
||||
6, 6, 6, 6, 6, 6, 6, 0 # f8 - ff
|
||||
)
|
||||
|
||||
GB2312_ST = (
|
||||
@@ -320,6 +394,7 @@ GB2312_ST = (
|
||||
MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#20-27
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START #28-2f
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
# To be accurate, the length of class 6 can be either 2 or 4.
|
||||
# But it is not necessary to discriminate between the two since
|
||||
@@ -328,100 +403,105 @@ GB2312_ST = (
|
||||
# 2 here.
|
||||
GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2)
|
||||
|
||||
GB2312_SM_MODEL = {'class_table': GB2312_CLS,
|
||||
'class_factor': 7,
|
||||
'state_table': GB2312_ST,
|
||||
'char_len_table': GB2312_CHAR_LEN_TABLE,
|
||||
'name': 'GB2312'}
|
||||
GB2312_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": GB2312_CLS,
|
||||
"class_factor": 7,
|
||||
"state_table": GB2312_ST,
|
||||
"char_len_table": GB2312_CHAR_LEN_TABLE,
|
||||
"name": "GB2312",
|
||||
}
|
||||
|
||||
# Shift_JIS
|
||||
|
||||
# fmt: off
|
||||
SJIS_CLS = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
2,2,2,2,2,2,2,2, # 40 - 47
|
||||
2,2,2,2,2,2,2,2, # 48 - 4f
|
||||
2,2,2,2,2,2,2,2, # 50 - 57
|
||||
2,2,2,2,2,2,2,2, # 58 - 5f
|
||||
2,2,2,2,2,2,2,2, # 60 - 67
|
||||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,1, # 78 - 7f
|
||||
3,3,3,3,3,2,2,3, # 80 - 87
|
||||
3,3,3,3,3,3,3,3, # 88 - 8f
|
||||
3,3,3,3,3,3,3,3, # 90 - 97
|
||||
3,3,3,3,3,3,3,3, # 98 - 9f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07
|
||||
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
|
||||
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77
|
||||
2, 2, 2, 2, 2, 2, 2, 1, # 78 - 7f
|
||||
3, 3, 3, 3, 3, 2, 2, 3, # 80 - 87
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # 88 - 8f
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # 90 - 97
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # 98 - 9f
|
||||
#0xa0 is illegal in sjis encoding, but some pages does
|
||||
#contain such byte. We need to be more error forgiven.
|
||||
2,2,2,2,2,2,2,2, # a0 - a7
|
||||
2,2,2,2,2,2,2,2, # a8 - af
|
||||
2,2,2,2,2,2,2,2, # b0 - b7
|
||||
2,2,2,2,2,2,2,2, # b8 - bf
|
||||
2,2,2,2,2,2,2,2, # c0 - c7
|
||||
2,2,2,2,2,2,2,2, # c8 - cf
|
||||
2,2,2,2,2,2,2,2, # d0 - d7
|
||||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,4,4,4, # e8 - ef
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,0,0,0) # f8 - ff
|
||||
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # a8 - af
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7
|
||||
2, 2, 2, 2, 2, 2, 2, 2, # d8 - df
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # e0 - e7
|
||||
3, 3, 3, 3, 3, 4, 4, 4, # e8 - ef
|
||||
3, 3, 3, 3, 3, 3, 3, 3, # f0 - f7
|
||||
3, 3, 3, 3, 3, 0, 0, 0, # f8 - ff
|
||||
)
|
||||
|
||||
SJIS_ST = (
|
||||
MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f
|
||||
MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START #10-17
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
|
||||
|
||||
SJIS_SM_MODEL = {'class_table': SJIS_CLS,
|
||||
'class_factor': 6,
|
||||
'state_table': SJIS_ST,
|
||||
'char_len_table': SJIS_CHAR_LEN_TABLE,
|
||||
'name': 'Shift_JIS'}
|
||||
SJIS_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": SJIS_CLS,
|
||||
"class_factor": 6,
|
||||
"state_table": SJIS_ST,
|
||||
"char_len_table": SJIS_CHAR_LEN_TABLE,
|
||||
"name": "Shift_JIS",
|
||||
}
|
||||
|
||||
# UCS2-BE
|
||||
|
||||
# fmt: off
|
||||
UCS2BE_CLS = (
|
||||
0,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,1,0,0,2,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,3,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,3,3,3,3,3,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,0,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,0,0,0,0,0,0,0, # a0 - a7
|
||||
0,0,0,0,0,0,0,0, # a8 - af
|
||||
0,0,0,0,0,0,0,0, # b0 - b7
|
||||
0,0,0,0,0,0,0,0, # b8 - bf
|
||||
0,0,0,0,0,0,0,0, # c0 - c7
|
||||
0,0,0,0,0,0,0,0, # c8 - cf
|
||||
0,0,0,0,0,0,0,0, # d0 - d7
|
||||
0,0,0,0,0,0,0,0, # d8 - df
|
||||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,4,5 # f8 - ff
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
|
||||
0, 0, 1, 0, 0, 2, 0, 0, # 08 - 0f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
|
||||
0, 0, 0, 3, 0, 0, 0, 0, # 18 - 1f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27
|
||||
0, 3, 3, 3, 3, 3, 0, 0, # 28 - 2f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 40 - 47
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # a0 - a7
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # a8 - af
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # b0 - b7
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # b8 - bf
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # c0 - c7
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # c8 - cf
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # d0 - d7
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # d8 - df
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7
|
||||
0, 0, 0, 0, 0, 0, 4, 5 # f8 - ff
|
||||
)
|
||||
|
||||
UCS2BE_ST = (
|
||||
@@ -433,50 +513,53 @@ UCS2BE_ST = (
|
||||
5, 8, 6, 6,MachineState.ERROR, 6, 6, 6,#28-2f
|
||||
6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #30-37
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
|
||||
|
||||
UCS2BE_SM_MODEL = {'class_table': UCS2BE_CLS,
|
||||
'class_factor': 6,
|
||||
'state_table': UCS2BE_ST,
|
||||
'char_len_table': UCS2BE_CHAR_LEN_TABLE,
|
||||
'name': 'UTF-16BE'}
|
||||
UCS2BE_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": UCS2BE_CLS,
|
||||
"class_factor": 6,
|
||||
"state_table": UCS2BE_ST,
|
||||
"char_len_table": UCS2BE_CHAR_LEN_TABLE,
|
||||
"name": "UTF-16BE",
|
||||
}
|
||||
|
||||
# UCS2-LE
|
||||
|
||||
# fmt: off
|
||||
UCS2LE_CLS = (
|
||||
0,0,0,0,0,0,0,0, # 00 - 07
|
||||
0,0,1,0,0,2,0,0, # 08 - 0f
|
||||
0,0,0,0,0,0,0,0, # 10 - 17
|
||||
0,0,0,3,0,0,0,0, # 18 - 1f
|
||||
0,0,0,0,0,0,0,0, # 20 - 27
|
||||
0,3,3,3,3,3,0,0, # 28 - 2f
|
||||
0,0,0,0,0,0,0,0, # 30 - 37
|
||||
0,0,0,0,0,0,0,0, # 38 - 3f
|
||||
0,0,0,0,0,0,0,0, # 40 - 47
|
||||
0,0,0,0,0,0,0,0, # 48 - 4f
|
||||
0,0,0,0,0,0,0,0, # 50 - 57
|
||||
0,0,0,0,0,0,0,0, # 58 - 5f
|
||||
0,0,0,0,0,0,0,0, # 60 - 67
|
||||
0,0,0,0,0,0,0,0, # 68 - 6f
|
||||
0,0,0,0,0,0,0,0, # 70 - 77
|
||||
0,0,0,0,0,0,0,0, # 78 - 7f
|
||||
0,0,0,0,0,0,0,0, # 80 - 87
|
||||
0,0,0,0,0,0,0,0, # 88 - 8f
|
||||
0,0,0,0,0,0,0,0, # 90 - 97
|
||||
0,0,0,0,0,0,0,0, # 98 - 9f
|
||||
0,0,0,0,0,0,0,0, # a0 - a7
|
||||
0,0,0,0,0,0,0,0, # a8 - af
|
||||
0,0,0,0,0,0,0,0, # b0 - b7
|
||||
0,0,0,0,0,0,0,0, # b8 - bf
|
||||
0,0,0,0,0,0,0,0, # c0 - c7
|
||||
0,0,0,0,0,0,0,0, # c8 - cf
|
||||
0,0,0,0,0,0,0,0, # d0 - d7
|
||||
0,0,0,0,0,0,0,0, # d8 - df
|
||||
0,0,0,0,0,0,0,0, # e0 - e7
|
||||
0,0,0,0,0,0,0,0, # e8 - ef
|
||||
0,0,0,0,0,0,0,0, # f0 - f7
|
||||
0,0,0,0,0,0,4,5 # f8 - ff
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 00 - 07
|
||||
0, 0, 1, 0, 0, 2, 0, 0, # 08 - 0f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17
|
||||
0, 0, 0, 3, 0, 0, 0, 0, # 18 - 1f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27
|
||||
0, 3, 3, 3, 3, 3, 0, 0, # 28 - 2f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 40 - 47
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # a0 - a7
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # a8 - af
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # b0 - b7
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # b8 - bf
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # c0 - c7
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # c8 - cf
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # d0 - d7
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # d8 - df
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef
|
||||
0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7
|
||||
0, 0, 0, 0, 0, 0, 4, 5 # f8 - ff
|
||||
)
|
||||
|
||||
UCS2LE_ST = (
|
||||
@@ -488,50 +571,53 @@ UCS2LE_ST = (
|
||||
5, 5, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5,#28-2f
|
||||
5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR,MachineState.START,MachineState.START #30-37
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
|
||||
|
||||
UCS2LE_SM_MODEL = {'class_table': UCS2LE_CLS,
|
||||
'class_factor': 6,
|
||||
'state_table': UCS2LE_ST,
|
||||
'char_len_table': UCS2LE_CHAR_LEN_TABLE,
|
||||
'name': 'UTF-16LE'}
|
||||
UCS2LE_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": UCS2LE_CLS,
|
||||
"class_factor": 6,
|
||||
"state_table": UCS2LE_ST,
|
||||
"char_len_table": UCS2LE_CHAR_LEN_TABLE,
|
||||
"name": "UTF-16LE",
|
||||
}
|
||||
|
||||
# UTF-8
|
||||
|
||||
# fmt: off
|
||||
UTF8_CLS = (
|
||||
1,1,1,1,1,1,1,1, # 00 - 07 #allow 0x00 as a legal value
|
||||
1,1,1,1,1,1,0,0, # 08 - 0f
|
||||
1,1,1,1,1,1,1,1, # 10 - 17
|
||||
1,1,1,0,1,1,1,1, # 18 - 1f
|
||||
1,1,1,1,1,1,1,1, # 20 - 27
|
||||
1,1,1,1,1,1,1,1, # 28 - 2f
|
||||
1,1,1,1,1,1,1,1, # 30 - 37
|
||||
1,1,1,1,1,1,1,1, # 38 - 3f
|
||||
1,1,1,1,1,1,1,1, # 40 - 47
|
||||
1,1,1,1,1,1,1,1, # 48 - 4f
|
||||
1,1,1,1,1,1,1,1, # 50 - 57
|
||||
1,1,1,1,1,1,1,1, # 58 - 5f
|
||||
1,1,1,1,1,1,1,1, # 60 - 67
|
||||
1,1,1,1,1,1,1,1, # 68 - 6f
|
||||
1,1,1,1,1,1,1,1, # 70 - 77
|
||||
1,1,1,1,1,1,1,1, # 78 - 7f
|
||||
2,2,2,2,3,3,3,3, # 80 - 87
|
||||
4,4,4,4,4,4,4,4, # 88 - 8f
|
||||
4,4,4,4,4,4,4,4, # 90 - 97
|
||||
4,4,4,4,4,4,4,4, # 98 - 9f
|
||||
5,5,5,5,5,5,5,5, # a0 - a7
|
||||
5,5,5,5,5,5,5,5, # a8 - af
|
||||
5,5,5,5,5,5,5,5, # b0 - b7
|
||||
5,5,5,5,5,5,5,5, # b8 - bf
|
||||
0,0,6,6,6,6,6,6, # c0 - c7
|
||||
6,6,6,6,6,6,6,6, # c8 - cf
|
||||
6,6,6,6,6,6,6,6, # d0 - d7
|
||||
6,6,6,6,6,6,6,6, # d8 - df
|
||||
7,8,8,8,8,8,8,8, # e0 - e7
|
||||
8,8,8,8,8,9,8,8, # e8 - ef
|
||||
10,11,11,11,11,11,11,11, # f0 - f7
|
||||
12,13,13,13,14,15,0,0 # f8 - ff
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 #allow 0x00 as a legal value
|
||||
1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17
|
||||
1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 40 - 47
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 48 - 4f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 50 - 57
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 58 - 5f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 60 - 67
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 68 - 6f
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 70 - 77
|
||||
1, 1, 1, 1, 1, 1, 1, 1, # 78 - 7f
|
||||
2, 2, 2, 2, 3, 3, 3, 3, # 80 - 87
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 88 - 8f
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 90 - 97
|
||||
4, 4, 4, 4, 4, 4, 4, 4, # 98 - 9f
|
||||
5, 5, 5, 5, 5, 5, 5, 5, # a0 - a7
|
||||
5, 5, 5, 5, 5, 5, 5, 5, # a8 - af
|
||||
5, 5, 5, 5, 5, 5, 5, 5, # b0 - b7
|
||||
5, 5, 5, 5, 5, 5, 5, 5, # b8 - bf
|
||||
0, 0, 6, 6, 6, 6, 6, 6, # c0 - c7
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # c8 - cf
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # d0 - d7
|
||||
6, 6, 6, 6, 6, 6, 6, 6, # d8 - df
|
||||
7, 8, 8, 8, 8, 8, 8, 8, # e0 - e7
|
||||
8, 8, 8, 8, 8, 9, 8, 8, # e8 - ef
|
||||
10, 11, 11, 11, 11, 11, 11, 11, # f0 - f7
|
||||
12, 13, 13, 13, 14, 15, 0, 0 # f8 - ff
|
||||
)
|
||||
|
||||
UTF8_ST = (
|
||||
@@ -562,11 +648,14 @@ UTF8_ST = (
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,#c0-c7
|
||||
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR #c8-cf
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
|
||||
|
||||
UTF8_SM_MODEL = {'class_table': UTF8_CLS,
|
||||
'class_factor': 16,
|
||||
'state_table': UTF8_ST,
|
||||
'char_len_table': UTF8_CHAR_LEN_TABLE,
|
||||
'name': 'UTF-8'}
|
||||
UTF8_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": UTF8_CLS,
|
||||
"class_factor": 16,
|
||||
"state_table": UTF8_ST,
|
||||
"char_len_table": UTF8_CHAR_LEN_TABLE,
|
||||
"name": "UTF-8",
|
||||
}
|
||||
|
||||
@@ -1,19 +1,17 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Metadata about languages used by our model training code for our
|
||||
SingleByteCharSetProbers. Could be used for other things in the future.
|
||||
|
||||
This code is based on the language metadata from the uchardet project.
|
||||
"""
|
||||
from __future__ import absolute_import, print_function
|
||||
|
||||
from string import ascii_letters
|
||||
from typing import List, Optional
|
||||
|
||||
# TODO: Add Ukrainian (KOI8-U)
|
||||
|
||||
|
||||
# TODO: Add Ukranian (KOI8-U)
|
||||
|
||||
class Language(object):
|
||||
class Language:
|
||||
"""Metadata about a language useful for training models
|
||||
|
||||
:ivar name: The human name for the language, in English.
|
||||
@@ -33,9 +31,17 @@ class Language(object):
|
||||
Wikipedia for training data.
|
||||
:type wiki_start_pages: list of str
|
||||
"""
|
||||
def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
|
||||
alphabet=None, wiki_start_pages=None):
|
||||
super(Language, self).__init__()
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
name: Optional[str] = None,
|
||||
iso_code: Optional[str] = None,
|
||||
use_ascii: bool = True,
|
||||
charsets: Optional[List[str]] = None,
|
||||
alphabet: Optional[str] = None,
|
||||
wiki_start_pages: Optional[List[str]] = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.name = name
|
||||
self.iso_code = iso_code
|
||||
self.use_ascii = use_ascii
|
||||
@@ -46,265 +52,301 @@ class Language(object):
|
||||
else:
|
||||
alphabet = ascii_letters
|
||||
elif not alphabet:
|
||||
raise ValueError('Must supply alphabet if use_ascii is False')
|
||||
self.alphabet = ''.join(sorted(set(alphabet))) if alphabet else None
|
||||
raise ValueError("Must supply alphabet if use_ascii is False")
|
||||
self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None
|
||||
self.wiki_start_pages = wiki_start_pages
|
||||
|
||||
def __repr__(self):
|
||||
return '{}({})'.format(self.__class__.__name__,
|
||||
', '.join('{}={!r}'.format(k, v)
|
||||
for k, v in self.__dict__.items()
|
||||
if not k.startswith('_')))
|
||||
def __repr__(self) -> str:
|
||||
param_str = ", ".join(
|
||||
f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
|
||||
)
|
||||
return f"{self.__class__.__name__}({param_str})"
|
||||
|
||||
|
||||
LANGUAGES = {'Arabic': Language(name='Arabic',
|
||||
iso_code='ar',
|
||||
use_ascii=False,
|
||||
# We only support encodings that use isolated
|
||||
# forms, because the current recommendation is
|
||||
# that the rendering system handles presentation
|
||||
# forms. This means we purposefully skip IBM864.
|
||||
charsets=['ISO-8859-6', 'WINDOWS-1256',
|
||||
'CP720', 'CP864'],
|
||||
alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
|
||||
wiki_start_pages=[u'الصفحة_الرئيسية']),
|
||||
'Belarusian': Language(name='Belarusian',
|
||||
iso_code='be',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'IBM866', 'MacCyrillic'],
|
||||
alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
|
||||
u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
|
||||
wiki_start_pages=[u'Галоўная_старонка']),
|
||||
'Bulgarian': Language(name='Bulgarian',
|
||||
iso_code='bg',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'IBM855'],
|
||||
alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
|
||||
u'абвгдежзийклмнопрстуфхцчшщъьюя'),
|
||||
wiki_start_pages=[u'Начална_страница']),
|
||||
'Czech': Language(name='Czech',
|
||||
iso_code='cz',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
|
||||
wiki_start_pages=[u'Hlavní_strana']),
|
||||
'Danish': Language(name='Danish',
|
||||
iso_code='da',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'æøåÆØÅ',
|
||||
wiki_start_pages=[u'Forside']),
|
||||
'German': Language(name='German',
|
||||
iso_code='de',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
||||
alphabet=u'äöüßÄÖÜ',
|
||||
wiki_start_pages=[u'Wikipedia:Hauptseite']),
|
||||
'Greek': Language(name='Greek',
|
||||
iso_code='el',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-7', 'WINDOWS-1253'],
|
||||
alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
|
||||
u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
|
||||
wiki_start_pages=[u'Πύλη:Κύρια']),
|
||||
'English': Language(name='English',
|
||||
iso_code='en',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
||||
wiki_start_pages=[u'Main_Page']),
|
||||
'Esperanto': Language(name='Esperanto',
|
||||
iso_code='eo',
|
||||
# Q, W, X, and Y not used at all
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-3'],
|
||||
alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
|
||||
u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
|
||||
wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']),
|
||||
'Spanish': Language(name='Spanish',
|
||||
iso_code='es',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
|
||||
wiki_start_pages=[u'Wikipedia:Portada']),
|
||||
'Estonian': Language(name='Estonian',
|
||||
iso_code='et',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-4', 'ISO-8859-13',
|
||||
'WINDOWS-1257'],
|
||||
# C, F, Š, Q, W, X, Y, Z, Ž are only for
|
||||
# loanwords
|
||||
alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
|
||||
u'abdeghijklmnoprstuvõäöü'),
|
||||
wiki_start_pages=[u'Esileht']),
|
||||
'Finnish': Language(name='Finnish',
|
||||
iso_code='fi',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'ÅÄÖŠŽåäöšž',
|
||||
wiki_start_pages=[u'Wikipedia:Etusivu']),
|
||||
'French': Language(name='French',
|
||||
iso_code='fr',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
|
||||
wiki_start_pages=[u'Wikipédia:Accueil_principal',
|
||||
u'Bœuf (animal)']),
|
||||
'Hebrew': Language(name='Hebrew',
|
||||
iso_code='he',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-8', 'WINDOWS-1255'],
|
||||
alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
|
||||
wiki_start_pages=[u'עמוד_ראשי']),
|
||||
'Croatian': Language(name='Croatian',
|
||||
iso_code='hr',
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
|
||||
u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
|
||||
wiki_start_pages=[u'Glavna_stranica']),
|
||||
'Hungarian': Language(name='Hungarian',
|
||||
iso_code='hu',
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
|
||||
u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
|
||||
wiki_start_pages=[u'Kezdőlap']),
|
||||
'Italian': Language(name='Italian',
|
||||
iso_code='it',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
|
||||
wiki_start_pages=[u'Pagina_principale']),
|
||||
'Lithuanian': Language(name='Lithuanian',
|
||||
iso_code='lt',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-13', 'WINDOWS-1257',
|
||||
'ISO-8859-4'],
|
||||
# Q, W, and X not used at all
|
||||
alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
|
||||
u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
|
||||
wiki_start_pages=[u'Pagrindinis_puslapis']),
|
||||
'Latvian': Language(name='Latvian',
|
||||
iso_code='lv',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-13', 'WINDOWS-1257',
|
||||
'ISO-8859-4'],
|
||||
# Q, W, X, Y are only for loanwords
|
||||
alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
|
||||
u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
|
||||
wiki_start_pages=[u'Sākumlapa']),
|
||||
'Macedonian': Language(name='Macedonian',
|
||||
iso_code='mk',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'MacCyrillic', 'IBM855'],
|
||||
alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
|
||||
u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
|
||||
wiki_start_pages=[u'Главна_страница']),
|
||||
'Dutch': Language(name='Dutch',
|
||||
iso_code='nl',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
||||
wiki_start_pages=[u'Hoofdpagina']),
|
||||
'Polish': Language(name='Polish',
|
||||
iso_code='pl',
|
||||
# Q and X are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
|
||||
u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
|
||||
wiki_start_pages=[u'Wikipedia:Strona_główna']),
|
||||
'Portuguese': Language(name='Portuguese',
|
||||
iso_code='pt',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
|
||||
wiki_start_pages=[u'Wikipédia:Página_principal']),
|
||||
'Romanian': Language(name='Romanian',
|
||||
iso_code='ro',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=u'ăâîșțĂÂÎȘȚ',
|
||||
wiki_start_pages=[u'Pagina_principală']),
|
||||
'Russian': Language(name='Russian',
|
||||
iso_code='ru',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'KOI8-R', 'MacCyrillic', 'IBM866',
|
||||
'IBM855'],
|
||||
alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
|
||||
u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
|
||||
wiki_start_pages=[u'Заглавная_страница']),
|
||||
'Slovak': Language(name='Slovak',
|
||||
iso_code='sk',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
|
||||
wiki_start_pages=[u'Hlavná_stránka']),
|
||||
'Slovene': Language(name='Slovene',
|
||||
iso_code='sl',
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=(u'abcčdefghijklmnoprsštuvzž'
|
||||
u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
|
||||
wiki_start_pages=[u'Glavna_stran']),
|
||||
# Serbian can be written in both Latin and Cyrillic, but there's no
|
||||
# simple way to get the Latin alphabet pages from Wikipedia through
|
||||
# the API, so for now we just support Cyrillic.
|
||||
'Serbian': Language(name='Serbian',
|
||||
iso_code='sr',
|
||||
alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
|
||||
u'абвгдђежзијклљмнњопрстћуфхцчџш'),
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'MacCyrillic', 'IBM855'],
|
||||
wiki_start_pages=[u'Главна_страна']),
|
||||
'Thai': Language(name='Thai',
|
||||
iso_code='th',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
|
||||
alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
|
||||
wiki_start_pages=[u'หน้าหลัก']),
|
||||
'Turkish': Language(name='Turkish',
|
||||
iso_code='tr',
|
||||
# Q, W, and X are not used by Turkish
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-3', 'ISO-8859-9',
|
||||
'WINDOWS-1254'],
|
||||
alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
|
||||
u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
|
||||
wiki_start_pages=[u'Ana_Sayfa']),
|
||||
'Vietnamese': Language(name='Vietnamese',
|
||||
iso_code='vi',
|
||||
use_ascii=False,
|
||||
# Windows-1258 is the only common 8-bit
|
||||
# Vietnamese encoding supported by Python.
|
||||
# From Wikipedia:
|
||||
# For systems that lack support for Unicode,
|
||||
# dozens of 8-bit Vietnamese code pages are
|
||||
# available.[1] The most common are VISCII
|
||||
# (TCVN 5712:1993), VPS, and Windows-1258.[3]
|
||||
# Where ASCII is required, such as when
|
||||
# ensuring readability in plain text e-mail,
|
||||
# Vietnamese letters are often encoded
|
||||
# according to Vietnamese Quoted-Readable
|
||||
# (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
|
||||
# though usage of either variable-width
|
||||
# scheme has declined dramatically following
|
||||
# the adoption of Unicode on the World Wide
|
||||
# Web.
|
||||
charsets=['WINDOWS-1258'],
|
||||
alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
|
||||
u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
|
||||
wiki_start_pages=[u'Chữ_Quốc_ngữ']),
|
||||
}
|
||||
LANGUAGES = {
|
||||
"Arabic": Language(
|
||||
name="Arabic",
|
||||
iso_code="ar",
|
||||
use_ascii=False,
|
||||
# We only support encodings that use isolated
|
||||
# forms, because the current recommendation is
|
||||
# that the rendering system handles presentation
|
||||
# forms. This means we purposefully skip IBM864.
|
||||
charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
|
||||
alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
|
||||
wiki_start_pages=["الصفحة_الرئيسية"],
|
||||
),
|
||||
"Belarusian": Language(
|
||||
name="Belarusian",
|
||||
iso_code="be",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
|
||||
alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
|
||||
wiki_start_pages=["Галоўная_старонка"],
|
||||
),
|
||||
"Bulgarian": Language(
|
||||
name="Bulgarian",
|
||||
iso_code="bg",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
|
||||
alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
|
||||
wiki_start_pages=["Начална_страница"],
|
||||
),
|
||||
"Czech": Language(
|
||||
name="Czech",
|
||||
iso_code="cz",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||
alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
|
||||
wiki_start_pages=["Hlavní_strana"],
|
||||
),
|
||||
"Danish": Language(
|
||||
name="Danish",
|
||||
iso_code="da",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="æøåÆØÅ",
|
||||
wiki_start_pages=["Forside"],
|
||||
),
|
||||
"German": Language(
|
||||
name="German",
|
||||
iso_code="de",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="äöüßẞÄÖÜ",
|
||||
wiki_start_pages=["Wikipedia:Hauptseite"],
|
||||
),
|
||||
"Greek": Language(
|
||||
name="Greek",
|
||||
iso_code="el",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-7", "WINDOWS-1253"],
|
||||
alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
|
||||
wiki_start_pages=["Πύλη:Κύρια"],
|
||||
),
|
||||
"English": Language(
|
||||
name="English",
|
||||
iso_code="en",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
|
||||
wiki_start_pages=["Main_Page"],
|
||||
),
|
||||
"Esperanto": Language(
|
||||
name="Esperanto",
|
||||
iso_code="eo",
|
||||
# Q, W, X, and Y not used at all
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-3"],
|
||||
alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
|
||||
wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
|
||||
),
|
||||
"Spanish": Language(
|
||||
name="Spanish",
|
||||
iso_code="es",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
|
||||
wiki_start_pages=["Wikipedia:Portada"],
|
||||
),
|
||||
"Estonian": Language(
|
||||
name="Estonian",
|
||||
iso_code="et",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
|
||||
# C, F, Š, Q, W, X, Y, Z, Ž are only for
|
||||
# loanwords
|
||||
alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
|
||||
wiki_start_pages=["Esileht"],
|
||||
),
|
||||
"Finnish": Language(
|
||||
name="Finnish",
|
||||
iso_code="fi",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="ÅÄÖŠŽåäöšž",
|
||||
wiki_start_pages=["Wikipedia:Etusivu"],
|
||||
),
|
||||
"French": Language(
|
||||
name="French",
|
||||
iso_code="fr",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
|
||||
wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
|
||||
),
|
||||
"Hebrew": Language(
|
||||
name="Hebrew",
|
||||
iso_code="he",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-8", "WINDOWS-1255"],
|
||||
alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
|
||||
wiki_start_pages=["עמוד_ראשי"],
|
||||
),
|
||||
"Croatian": Language(
|
||||
name="Croatian",
|
||||
iso_code="hr",
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||
alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
|
||||
wiki_start_pages=["Glavna_stranica"],
|
||||
),
|
||||
"Hungarian": Language(
|
||||
name="Hungarian",
|
||||
iso_code="hu",
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||
alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
|
||||
wiki_start_pages=["Kezdőlap"],
|
||||
),
|
||||
"Italian": Language(
|
||||
name="Italian",
|
||||
iso_code="it",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="ÀÈÉÌÒÓÙàèéìòóù",
|
||||
wiki_start_pages=["Pagina_principale"],
|
||||
),
|
||||
"Lithuanian": Language(
|
||||
name="Lithuanian",
|
||||
iso_code="lt",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
|
||||
# Q, W, and X not used at all
|
||||
alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
|
||||
wiki_start_pages=["Pagrindinis_puslapis"],
|
||||
),
|
||||
"Latvian": Language(
|
||||
name="Latvian",
|
||||
iso_code="lv",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
|
||||
# Q, W, X, Y are only for loanwords
|
||||
alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
|
||||
wiki_start_pages=["Sākumlapa"],
|
||||
),
|
||||
"Macedonian": Language(
|
||||
name="Macedonian",
|
||||
iso_code="mk",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
|
||||
alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
|
||||
wiki_start_pages=["Главна_страница"],
|
||||
),
|
||||
"Dutch": Language(
|
||||
name="Dutch",
|
||||
iso_code="nl",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
|
||||
wiki_start_pages=["Hoofdpagina"],
|
||||
),
|
||||
"Polish": Language(
|
||||
name="Polish",
|
||||
iso_code="pl",
|
||||
# Q and X are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||
alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
|
||||
wiki_start_pages=["Wikipedia:Strona_główna"],
|
||||
),
|
||||
"Portuguese": Language(
|
||||
name="Portuguese",
|
||||
iso_code="pt",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
|
||||
wiki_start_pages=["Wikipédia:Página_principal"],
|
||||
),
|
||||
"Romanian": Language(
|
||||
name="Romanian",
|
||||
iso_code="ro",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||
alphabet="ăâîșțĂÂÎȘȚ",
|
||||
wiki_start_pages=["Pagina_principală"],
|
||||
),
|
||||
"Russian": Language(
|
||||
name="Russian",
|
||||
iso_code="ru",
|
||||
use_ascii=False,
|
||||
charsets=[
|
||||
"ISO-8859-5",
|
||||
"WINDOWS-1251",
|
||||
"KOI8-R",
|
||||
"MacCyrillic",
|
||||
"IBM866",
|
||||
"IBM855",
|
||||
],
|
||||
alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
|
||||
wiki_start_pages=["Заглавная_страница"],
|
||||
),
|
||||
"Slovak": Language(
|
||||
name="Slovak",
|
||||
iso_code="sk",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||
alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
|
||||
wiki_start_pages=["Hlavná_stránka"],
|
||||
),
|
||||
"Slovene": Language(
|
||||
name="Slovene",
|
||||
iso_code="sl",
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||
alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
|
||||
wiki_start_pages=["Glavna_stran"],
|
||||
),
|
||||
# Serbian can be written in both Latin and Cyrillic, but there's no
|
||||
# simple way to get the Latin alphabet pages from Wikipedia through
|
||||
# the API, so for now we just support Cyrillic.
|
||||
"Serbian": Language(
|
||||
name="Serbian",
|
||||
iso_code="sr",
|
||||
alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
|
||||
charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
|
||||
wiki_start_pages=["Главна_страна"],
|
||||
),
|
||||
"Thai": Language(
|
||||
name="Thai",
|
||||
iso_code="th",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-11", "TIS-620", "CP874"],
|
||||
alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
|
||||
wiki_start_pages=["หน้าหลัก"],
|
||||
),
|
||||
"Turkish": Language(
|
||||
name="Turkish",
|
||||
iso_code="tr",
|
||||
# Q, W, and X are not used by Turkish
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
|
||||
alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
|
||||
wiki_start_pages=["Ana_Sayfa"],
|
||||
),
|
||||
"Vietnamese": Language(
|
||||
name="Vietnamese",
|
||||
iso_code="vi",
|
||||
use_ascii=False,
|
||||
# Windows-1258 is the only common 8-bit
|
||||
# Vietnamese encoding supported by Python.
|
||||
# From Wikipedia:
|
||||
# For systems that lack support for Unicode,
|
||||
# dozens of 8-bit Vietnamese code pages are
|
||||
# available.[1] The most common are VISCII
|
||||
# (TCVN 5712:1993), VPS, and Windows-1258.[3]
|
||||
# Where ASCII is required, such as when
|
||||
# ensuring readability in plain text e-mail,
|
||||
# Vietnamese letters are often encoded
|
||||
# according to Vietnamese Quoted-Readable
|
||||
# (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
|
||||
# though usage of either variable-width
|
||||
# scheme has declined dramatically following
|
||||
# the adoption of Unicode on the World Wide
|
||||
# Web.
|
||||
charsets=["WINDOWS-1258"],
|
||||
alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
|
||||
wiki_start_pages=["Chữ_Quốc_ngữ"],
|
||||
),
|
||||
}
|
||||
|
||||
@@ -26,70 +26,77 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from collections import namedtuple
|
||||
from typing import Dict, List, NamedTuple, Optional, Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
|
||||
|
||||
|
||||
SingleByteCharSetModel = namedtuple('SingleByteCharSetModel',
|
||||
['charset_name',
|
||||
'language',
|
||||
'char_to_order_map',
|
||||
'language_model',
|
||||
'typical_positive_ratio',
|
||||
'keep_ascii_letters',
|
||||
'alphabet'])
|
||||
class SingleByteCharSetModel(NamedTuple):
|
||||
charset_name: str
|
||||
language: str
|
||||
char_to_order_map: Dict[int, int]
|
||||
language_model: Dict[int, Dict[int, int]]
|
||||
typical_positive_ratio: float
|
||||
keep_ascii_letters: bool
|
||||
alphabet: str
|
||||
|
||||
|
||||
class SingleByteCharSetProber(CharSetProber):
|
||||
SAMPLE_SIZE = 64
|
||||
SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2
|
||||
SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2
|
||||
POSITIVE_SHORTCUT_THRESHOLD = 0.95
|
||||
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
|
||||
|
||||
def __init__(self, model, reversed=False, name_prober=None):
|
||||
super(SingleByteCharSetProber, self).__init__()
|
||||
def __init__(
|
||||
self,
|
||||
model: SingleByteCharSetModel,
|
||||
is_reversed: bool = False,
|
||||
name_prober: Optional[CharSetProber] = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self._model = model
|
||||
# TRUE if we need to reverse every pair in the model lookup
|
||||
self._reversed = reversed
|
||||
self._reversed = is_reversed
|
||||
# Optional auxiliary prober for name decision
|
||||
self._name_prober = name_prober
|
||||
self._last_order = None
|
||||
self._seq_counters = None
|
||||
self._total_seqs = None
|
||||
self._total_char = None
|
||||
self._freq_char = None
|
||||
self._last_order = 255
|
||||
self._seq_counters: List[int] = []
|
||||
self._total_seqs = 0
|
||||
self._total_char = 0
|
||||
self._control_char = 0
|
||||
self._freq_char = 0
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
super(SingleByteCharSetProber, self).reset()
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
# char order of last character
|
||||
self._last_order = 255
|
||||
self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
|
||||
self._total_seqs = 0
|
||||
self._total_char = 0
|
||||
self._control_char = 0
|
||||
# characters that fall in our sampling range
|
||||
self._freq_char = 0
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> Optional[str]:
|
||||
if self._name_prober:
|
||||
return self._name_prober.charset_name
|
||||
else:
|
||||
return self._model.charset_name
|
||||
return self._model.charset_name
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> Optional[str]:
|
||||
if self._name_prober:
|
||||
return self._name_prober.language
|
||||
else:
|
||||
return self._model.language
|
||||
return self._model.language
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
# TODO: Make filter_international_words keep things in self.alphabet
|
||||
if not self._model.keep_ascii_letters:
|
||||
byte_str = self.filter_international_words(byte_str)
|
||||
else:
|
||||
byte_str = self.remove_xml_tags(byte_str)
|
||||
if not byte_str:
|
||||
return self.state
|
||||
char_to_order_map = self._model.char_to_order_map
|
||||
@@ -103,9 +110,6 @@ class SingleByteCharSetProber(CharSetProber):
|
||||
# _total_char purposes.
|
||||
if order < CharacterCategory.CONTROL:
|
||||
self._total_char += 1
|
||||
# TODO: Follow uchardet's lead and discount confidence for frequent
|
||||
# control characters.
|
||||
# See https://github.com/BYVoid/uchardet/commit/55b4f23971db61
|
||||
if order < self.SAMPLE_SIZE:
|
||||
self._freq_char += 1
|
||||
if self._last_order < self.SAMPLE_SIZE:
|
||||
@@ -122,23 +126,36 @@ class SingleByteCharSetProber(CharSetProber):
|
||||
if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
|
||||
confidence = self.get_confidence()
|
||||
if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
|
||||
self.logger.debug('%s confidence = %s, we have a winner',
|
||||
charset_name, confidence)
|
||||
self.logger.debug(
|
||||
"%s confidence = %s, we have a winner", charset_name, confidence
|
||||
)
|
||||
self._state = ProbingState.FOUND_IT
|
||||
elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
|
||||
self.logger.debug('%s confidence = %s, below negative '
|
||||
'shortcut threshhold %s', charset_name,
|
||||
confidence,
|
||||
self.NEGATIVE_SHORTCUT_THRESHOLD)
|
||||
self.logger.debug(
|
||||
"%s confidence = %s, below negative shortcut threshold %s",
|
||||
charset_name,
|
||||
confidence,
|
||||
self.NEGATIVE_SHORTCUT_THRESHOLD,
|
||||
)
|
||||
self._state = ProbingState.NOT_ME
|
||||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
r = 0.01
|
||||
if self._total_seqs > 0:
|
||||
r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
|
||||
self._total_seqs / self._model.typical_positive_ratio)
|
||||
r = (
|
||||
(
|
||||
self._seq_counters[SequenceLikelihood.POSITIVE]
|
||||
+ 0.25 * self._seq_counters[SequenceLikelihood.LIKELY]
|
||||
)
|
||||
/ self._total_seqs
|
||||
/ self._model.typical_positive_ratio
|
||||
)
|
||||
# The more control characters (proportionnaly to the size
|
||||
# of the text), the less confident we become in the current
|
||||
# charset.
|
||||
r = r * (self._total_char - self._control_char) / self._total_char
|
||||
r = r * self._freq_char / self._total_char
|
||||
if r >= 1.0:
|
||||
r = 0.99
|
||||
|
||||
@@ -28,33 +28,38 @@
|
||||
|
||||
from .charsetgroupprober import CharSetGroupProber
|
||||
from .hebrewprober import HebrewProber
|
||||
from .langbulgarianmodel import (ISO_8859_5_BULGARIAN_MODEL,
|
||||
WINDOWS_1251_BULGARIAN_MODEL)
|
||||
from .langbulgarianmodel import ISO_8859_5_BULGARIAN_MODEL, WINDOWS_1251_BULGARIAN_MODEL
|
||||
from .langgreekmodel import ISO_8859_7_GREEK_MODEL, WINDOWS_1253_GREEK_MODEL
|
||||
from .langhebrewmodel import WINDOWS_1255_HEBREW_MODEL
|
||||
|
||||
# from .langhungarianmodel import (ISO_8859_2_HUNGARIAN_MODEL,
|
||||
# WINDOWS_1250_HUNGARIAN_MODEL)
|
||||
from .langrussianmodel import (IBM855_RUSSIAN_MODEL, IBM866_RUSSIAN_MODEL,
|
||||
ISO_8859_5_RUSSIAN_MODEL, KOI8_R_RUSSIAN_MODEL,
|
||||
MACCYRILLIC_RUSSIAN_MODEL,
|
||||
WINDOWS_1251_RUSSIAN_MODEL)
|
||||
from .langrussianmodel import (
|
||||
IBM855_RUSSIAN_MODEL,
|
||||
IBM866_RUSSIAN_MODEL,
|
||||
ISO_8859_5_RUSSIAN_MODEL,
|
||||
KOI8_R_RUSSIAN_MODEL,
|
||||
MACCYRILLIC_RUSSIAN_MODEL,
|
||||
WINDOWS_1251_RUSSIAN_MODEL,
|
||||
)
|
||||
from .langthaimodel import TIS_620_THAI_MODEL
|
||||
from .langturkishmodel import ISO_8859_9_TURKISH_MODEL
|
||||
from .sbcharsetprober import SingleByteCharSetProber
|
||||
|
||||
|
||||
class SBCSGroupProber(CharSetGroupProber):
|
||||
def __init__(self):
|
||||
super(SBCSGroupProber, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
hebrew_prober = HebrewProber()
|
||||
logical_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
|
||||
False, hebrew_prober)
|
||||
logical_hebrew_prober = SingleByteCharSetProber(
|
||||
WINDOWS_1255_HEBREW_MODEL, is_reversed=False, name_prober=hebrew_prober
|
||||
)
|
||||
# TODO: See if using ISO-8859-8 Hebrew model works better here, since
|
||||
# it's actually the visual one
|
||||
visual_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
|
||||
True, hebrew_prober)
|
||||
hebrew_prober.set_model_probers(logical_hebrew_prober,
|
||||
visual_hebrew_prober)
|
||||
visual_hebrew_prober = SingleByteCharSetProber(
|
||||
WINDOWS_1255_HEBREW_MODEL, is_reversed=True, name_prober=hebrew_prober
|
||||
)
|
||||
hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober)
|
||||
# TODO: ORDER MATTERS HERE. I changed the order vs what was in master
|
||||
# and several tests failed that did not before. Some thought
|
||||
# should be put into the ordering, and we should consider making
|
||||
|
||||
@@ -25,68 +25,81 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from typing import Union
|
||||
|
||||
from .chardistribution import SJISDistributionAnalysis
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .enums import MachineState, ProbingState
|
||||
from .jpcntx import SJISContextAnalysis
|
||||
from .mbcharsetprober import MultiByteCharSetProber
|
||||
from .mbcssm import SJIS_SM_MODEL
|
||||
from .enums import ProbingState, MachineState
|
||||
|
||||
|
||||
class SJISProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
super(SJISProber, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
|
||||
self.distribution_analyzer = SJISDistributionAnalysis()
|
||||
self.context_analyzer = SJISContextAnalysis()
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
super(SJISProber, self).reset()
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
self.context_analyzer.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return self.context_analyzer.charset_name
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return "Japanese"
|
||||
|
||||
def feed(self, byte_str):
|
||||
for i in range(len(byte_str)):
|
||||
coding_state = self.coding_sm.next_state(byte_str[i])
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
assert self.coding_sm is not None
|
||||
assert self.distribution_analyzer is not None
|
||||
|
||||
for i, byte in enumerate(byte_str):
|
||||
coding_state = self.coding_sm.next_state(byte)
|
||||
if coding_state == MachineState.ERROR:
|
||||
self.logger.debug('%s %s prober hit error at byte %s',
|
||||
self.charset_name, self.language, i)
|
||||
self.logger.debug(
|
||||
"%s %s prober hit error at byte %s",
|
||||
self.charset_name,
|
||||
self.language,
|
||||
i,
|
||||
)
|
||||
self._state = ProbingState.NOT_ME
|
||||
break
|
||||
elif coding_state == MachineState.ITS_ME:
|
||||
if coding_state == MachineState.ITS_ME:
|
||||
self._state = ProbingState.FOUND_IT
|
||||
break
|
||||
elif coding_state == MachineState.START:
|
||||
if coding_state == MachineState.START:
|
||||
char_len = self.coding_sm.get_current_charlen()
|
||||
if i == 0:
|
||||
self._last_char[1] = byte_str[0]
|
||||
self.context_analyzer.feed(self._last_char[2 - char_len:],
|
||||
char_len)
|
||||
self._last_char[1] = byte
|
||||
self.context_analyzer.feed(
|
||||
self._last_char[2 - char_len :], char_len
|
||||
)
|
||||
self.distribution_analyzer.feed(self._last_char, char_len)
|
||||
else:
|
||||
self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3
|
||||
- char_len], char_len)
|
||||
self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
|
||||
char_len)
|
||||
self.context_analyzer.feed(
|
||||
byte_str[i + 1 - char_len : i + 3 - char_len], char_len
|
||||
)
|
||||
self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
|
||||
|
||||
self._last_char[0] = byte_str[-1]
|
||||
|
||||
if self.state == ProbingState.DETECTING:
|
||||
if (self.context_analyzer.got_enough_data() and
|
||||
(self.get_confidence() > self.SHORTCUT_THRESHOLD)):
|
||||
if self.context_analyzer.got_enough_data() and (
|
||||
self.get_confidence() > self.SHORTCUT_THRESHOLD
|
||||
):
|
||||
self._state = ProbingState.FOUND_IT
|
||||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
assert self.distribution_analyzer is not None
|
||||
|
||||
context_conf = self.context_analyzer.get_confidence()
|
||||
distrib_conf = self.distribution_analyzer.get_confidence()
|
||||
return max(context_conf, distrib_conf)
|
||||
|
||||
@@ -39,16 +39,21 @@ class a user of ``chardet`` should use.
|
||||
import codecs
|
||||
import logging
|
||||
import re
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from .charsetgroupprober import CharSetGroupProber
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import InputState, LanguageFilter, ProbingState
|
||||
from .escprober import EscCharSetProber
|
||||
from .latin1prober import Latin1Prober
|
||||
from .macromanprober import MacRomanProber
|
||||
from .mbcsgroupprober import MBCSGroupProber
|
||||
from .resultdict import ResultDict
|
||||
from .sbcsgroupprober import SBCSGroupProber
|
||||
from .utf1632prober import UTF1632Prober
|
||||
|
||||
|
||||
class UniversalDetector(object):
|
||||
class UniversalDetector:
|
||||
"""
|
||||
The ``UniversalDetector`` class underlies the ``chardet.detect`` function
|
||||
and coordinates all of the different charset probers.
|
||||
@@ -66,49 +71,87 @@ class UniversalDetector(object):
|
||||
"""
|
||||
|
||||
MINIMUM_THRESHOLD = 0.20
|
||||
HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
|
||||
ESC_DETECTOR = re.compile(b'(\033|~{)')
|
||||
WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
|
||||
ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
|
||||
'iso-8859-2': 'Windows-1250',
|
||||
'iso-8859-5': 'Windows-1251',
|
||||
'iso-8859-6': 'Windows-1256',
|
||||
'iso-8859-7': 'Windows-1253',
|
||||
'iso-8859-8': 'Windows-1255',
|
||||
'iso-8859-9': 'Windows-1254',
|
||||
'iso-8859-13': 'Windows-1257'}
|
||||
HIGH_BYTE_DETECTOR = re.compile(b"[\x80-\xFF]")
|
||||
ESC_DETECTOR = re.compile(b"(\033|~{)")
|
||||
WIN_BYTE_DETECTOR = re.compile(b"[\x80-\x9F]")
|
||||
ISO_WIN_MAP = {
|
||||
"iso-8859-1": "Windows-1252",
|
||||
"iso-8859-2": "Windows-1250",
|
||||
"iso-8859-5": "Windows-1251",
|
||||
"iso-8859-6": "Windows-1256",
|
||||
"iso-8859-7": "Windows-1253",
|
||||
"iso-8859-8": "Windows-1255",
|
||||
"iso-8859-9": "Windows-1254",
|
||||
"iso-8859-13": "Windows-1257",
|
||||
}
|
||||
# Based on https://encoding.spec.whatwg.org/#names-and-labels
|
||||
# but altered to match Python names for encodings and remove mappings
|
||||
# that break tests.
|
||||
LEGACY_MAP = {
|
||||
"ascii": "Windows-1252",
|
||||
"iso-8859-1": "Windows-1252",
|
||||
"tis-620": "ISO-8859-11",
|
||||
"iso-8859-9": "Windows-1254",
|
||||
"gb2312": "GB18030",
|
||||
"euc-kr": "CP949",
|
||||
"utf-16le": "UTF-16",
|
||||
}
|
||||
|
||||
def __init__(self, lang_filter=LanguageFilter.ALL):
|
||||
self._esc_charset_prober = None
|
||||
self._charset_probers = []
|
||||
self.result = None
|
||||
self.done = None
|
||||
self._got_data = None
|
||||
self._input_state = None
|
||||
self._last_char = None
|
||||
def __init__(
|
||||
self,
|
||||
lang_filter: LanguageFilter = LanguageFilter.ALL,
|
||||
should_rename_legacy: bool = False,
|
||||
) -> None:
|
||||
self._esc_charset_prober: Optional[EscCharSetProber] = None
|
||||
self._utf1632_prober: Optional[UTF1632Prober] = None
|
||||
self._charset_probers: List[CharSetProber] = []
|
||||
self.result: ResultDict = {
|
||||
"encoding": None,
|
||||
"confidence": 0.0,
|
||||
"language": None,
|
||||
}
|
||||
self.done = False
|
||||
self._got_data = False
|
||||
self._input_state = InputState.PURE_ASCII
|
||||
self._last_char = b""
|
||||
self.lang_filter = lang_filter
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self._has_win_bytes = None
|
||||
self._has_win_bytes = False
|
||||
self.should_rename_legacy = should_rename_legacy
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
@property
|
||||
def input_state(self) -> int:
|
||||
return self._input_state
|
||||
|
||||
@property
|
||||
def has_win_bytes(self) -> bool:
|
||||
return self._has_win_bytes
|
||||
|
||||
@property
|
||||
def charset_probers(self) -> List[CharSetProber]:
|
||||
return self._charset_probers
|
||||
|
||||
def reset(self) -> None:
|
||||
"""
|
||||
Reset the UniversalDetector and all of its probers back to their
|
||||
initial states. This is called by ``__init__``, so you only need to
|
||||
call this directly in between analyses of different documents.
|
||||
"""
|
||||
self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
|
||||
self.result = {"encoding": None, "confidence": 0.0, "language": None}
|
||||
self.done = False
|
||||
self._got_data = False
|
||||
self._has_win_bytes = False
|
||||
self._input_state = InputState.PURE_ASCII
|
||||
self._last_char = b''
|
||||
self._last_char = b""
|
||||
if self._esc_charset_prober:
|
||||
self._esc_charset_prober.reset()
|
||||
if self._utf1632_prober:
|
||||
self._utf1632_prober.reset()
|
||||
for prober in self._charset_probers:
|
||||
prober.reset()
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> None:
|
||||
"""
|
||||
Takes a chunk of a document and feeds it through all of the relevant
|
||||
charset probers.
|
||||
@@ -125,7 +168,7 @@ class UniversalDetector(object):
|
||||
if self.done:
|
||||
return
|
||||
|
||||
if not len(byte_str):
|
||||
if not byte_str:
|
||||
return
|
||||
|
||||
if not isinstance(byte_str, bytearray):
|
||||
@@ -136,35 +179,38 @@ class UniversalDetector(object):
|
||||
# If the data starts with BOM, we know it is UTF
|
||||
if byte_str.startswith(codecs.BOM_UTF8):
|
||||
# EF BB BF UTF-8 with BOM
|
||||
self.result = {'encoding': "UTF-8-SIG",
|
||||
'confidence': 1.0,
|
||||
'language': ''}
|
||||
elif byte_str.startswith((codecs.BOM_UTF32_LE,
|
||||
codecs.BOM_UTF32_BE)):
|
||||
self.result = {
|
||||
"encoding": "UTF-8-SIG",
|
||||
"confidence": 1.0,
|
||||
"language": "",
|
||||
}
|
||||
elif byte_str.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
|
||||
# FF FE 00 00 UTF-32, little-endian BOM
|
||||
# 00 00 FE FF UTF-32, big-endian BOM
|
||||
self.result = {'encoding': "UTF-32",
|
||||
'confidence': 1.0,
|
||||
'language': ''}
|
||||
elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
|
||||
self.result = {"encoding": "UTF-32", "confidence": 1.0, "language": ""}
|
||||
elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
|
||||
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
||||
self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
|
||||
'confidence': 1.0,
|
||||
'language': ''}
|
||||
elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
|
||||
self.result = {
|
||||
# TODO: This encoding is not supported by Python. Should remove?
|
||||
"encoding": "X-ISO-10646-UCS-4-3412",
|
||||
"confidence": 1.0,
|
||||
"language": "",
|
||||
}
|
||||
elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
|
||||
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||
self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
|
||||
'confidence': 1.0,
|
||||
'language': ''}
|
||||
self.result = {
|
||||
# TODO: This encoding is not supported by Python. Should remove?
|
||||
"encoding": "X-ISO-10646-UCS-4-2143",
|
||||
"confidence": 1.0,
|
||||
"language": "",
|
||||
}
|
||||
elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
|
||||
# FF FE UTF-16, little endian BOM
|
||||
# FE FF UTF-16, big endian BOM
|
||||
self.result = {'encoding': "UTF-16",
|
||||
'confidence': 1.0,
|
||||
'language': ''}
|
||||
self.result = {"encoding": "UTF-16", "confidence": 1.0, "language": ""}
|
||||
|
||||
self._got_data = True
|
||||
if self.result['encoding'] is not None:
|
||||
if self.result["encoding"] is not None:
|
||||
self.done = True
|
||||
return
|
||||
|
||||
@@ -173,12 +219,29 @@ class UniversalDetector(object):
|
||||
if self._input_state == InputState.PURE_ASCII:
|
||||
if self.HIGH_BYTE_DETECTOR.search(byte_str):
|
||||
self._input_state = InputState.HIGH_BYTE
|
||||
elif self._input_state == InputState.PURE_ASCII and \
|
||||
self.ESC_DETECTOR.search(self._last_char + byte_str):
|
||||
elif (
|
||||
self._input_state == InputState.PURE_ASCII
|
||||
and self.ESC_DETECTOR.search(self._last_char + byte_str)
|
||||
):
|
||||
self._input_state = InputState.ESC_ASCII
|
||||
|
||||
self._last_char = byte_str[-1:]
|
||||
|
||||
# next we will look to see if it is appears to be either a UTF-16 or
|
||||
# UTF-32 encoding
|
||||
if not self._utf1632_prober:
|
||||
self._utf1632_prober = UTF1632Prober()
|
||||
|
||||
if self._utf1632_prober.state == ProbingState.DETECTING:
|
||||
if self._utf1632_prober.feed(byte_str) == ProbingState.FOUND_IT:
|
||||
self.result = {
|
||||
"encoding": self._utf1632_prober.charset_name,
|
||||
"confidence": self._utf1632_prober.get_confidence(),
|
||||
"language": "",
|
||||
}
|
||||
self.done = True
|
||||
return
|
||||
|
||||
# If we've seen escape sequences, use the EscCharSetProber, which
|
||||
# uses a simple state machine to check for known escape sequences in
|
||||
# HZ and ISO-2022 encodings, since those are the only encodings that
|
||||
@@ -187,12 +250,11 @@ class UniversalDetector(object):
|
||||
if not self._esc_charset_prober:
|
||||
self._esc_charset_prober = EscCharSetProber(self.lang_filter)
|
||||
if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
|
||||
self.result = {'encoding':
|
||||
self._esc_charset_prober.charset_name,
|
||||
'confidence':
|
||||
self._esc_charset_prober.get_confidence(),
|
||||
'language':
|
||||
self._esc_charset_prober.language}
|
||||
self.result = {
|
||||
"encoding": self._esc_charset_prober.charset_name,
|
||||
"confidence": self._esc_charset_prober.get_confidence(),
|
||||
"language": self._esc_charset_prober.language,
|
||||
}
|
||||
self.done = True
|
||||
# If we've seen high bytes (i.e., those with values greater than 127),
|
||||
# we need to do more complicated checks using all our multi-byte and
|
||||
@@ -207,17 +269,20 @@ class UniversalDetector(object):
|
||||
if self.lang_filter & LanguageFilter.NON_CJK:
|
||||
self._charset_probers.append(SBCSGroupProber())
|
||||
self._charset_probers.append(Latin1Prober())
|
||||
self._charset_probers.append(MacRomanProber())
|
||||
for prober in self._charset_probers:
|
||||
if prober.feed(byte_str) == ProbingState.FOUND_IT:
|
||||
self.result = {'encoding': prober.charset_name,
|
||||
'confidence': prober.get_confidence(),
|
||||
'language': prober.language}
|
||||
self.result = {
|
||||
"encoding": prober.charset_name,
|
||||
"confidence": prober.get_confidence(),
|
||||
"language": prober.language,
|
||||
}
|
||||
self.done = True
|
||||
break
|
||||
if self.WIN_BYTE_DETECTOR.search(byte_str):
|
||||
self._has_win_bytes = True
|
||||
|
||||
def close(self):
|
||||
def close(self) -> ResultDict:
|
||||
"""
|
||||
Stop analyzing the current document and come up with a final
|
||||
prediction.
|
||||
@@ -231,13 +296,11 @@ class UniversalDetector(object):
|
||||
self.done = True
|
||||
|
||||
if not self._got_data:
|
||||
self.logger.debug('no data received!')
|
||||
self.logger.debug("no data received!")
|
||||
|
||||
# Default to ASCII if it is all we've seen so far
|
||||
elif self._input_state == InputState.PURE_ASCII:
|
||||
self.result = {'encoding': 'ascii',
|
||||
'confidence': 1.0,
|
||||
'language': ''}
|
||||
self.result = {"encoding": "ascii", "confidence": 1.0, "language": ""}
|
||||
|
||||
# If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
|
||||
elif self._input_state == InputState.HIGH_BYTE:
|
||||
@@ -253,34 +316,47 @@ class UniversalDetector(object):
|
||||
max_prober = prober
|
||||
if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
|
||||
charset_name = max_prober.charset_name
|
||||
lower_charset_name = max_prober.charset_name.lower()
|
||||
assert charset_name is not None
|
||||
lower_charset_name = charset_name.lower()
|
||||
confidence = max_prober.get_confidence()
|
||||
# Use Windows encoding name instead of ISO-8859 if we saw any
|
||||
# extra Windows-specific bytes
|
||||
if lower_charset_name.startswith('iso-8859'):
|
||||
if lower_charset_name.startswith("iso-8859"):
|
||||
if self._has_win_bytes:
|
||||
charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
|
||||
charset_name)
|
||||
self.result = {'encoding': charset_name,
|
||||
'confidence': confidence,
|
||||
'language': max_prober.language}
|
||||
charset_name = self.ISO_WIN_MAP.get(
|
||||
lower_charset_name, charset_name
|
||||
)
|
||||
# Rename legacy encodings with superset encodings if asked
|
||||
if self.should_rename_legacy:
|
||||
charset_name = self.LEGACY_MAP.get(
|
||||
(charset_name or "").lower(), charset_name
|
||||
)
|
||||
self.result = {
|
||||
"encoding": charset_name,
|
||||
"confidence": confidence,
|
||||
"language": max_prober.language,
|
||||
}
|
||||
|
||||
# Log all prober confidences if none met MINIMUM_THRESHOLD
|
||||
if self.logger.getEffectiveLevel() <= logging.DEBUG:
|
||||
if self.result['encoding'] is None:
|
||||
self.logger.debug('no probers hit minimum threshold')
|
||||
if self.result["encoding"] is None:
|
||||
self.logger.debug("no probers hit minimum threshold")
|
||||
for group_prober in self._charset_probers:
|
||||
if not group_prober:
|
||||
continue
|
||||
if isinstance(group_prober, CharSetGroupProber):
|
||||
for prober in group_prober.probers:
|
||||
self.logger.debug('%s %s confidence = %s',
|
||||
prober.charset_name,
|
||||
prober.language,
|
||||
prober.get_confidence())
|
||||
self.logger.debug(
|
||||
"%s %s confidence = %s",
|
||||
prober.charset_name,
|
||||
prober.language,
|
||||
prober.get_confidence(),
|
||||
)
|
||||
else:
|
||||
self.logger.debug('%s %s confidence = %s',
|
||||
group_prober.charset_name,
|
||||
group_prober.language,
|
||||
group_prober.get_confidence())
|
||||
self.logger.debug(
|
||||
"%s %s confidence = %s",
|
||||
group_prober.charset_name,
|
||||
group_prober.language,
|
||||
group_prober.get_confidence(),
|
||||
)
|
||||
return self.result
|
||||
|
||||
@@ -25,45 +25,46 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import ProbingState, MachineState
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .mbcssm import UTF8_SM_MODEL
|
||||
from typing import Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .enums import MachineState, ProbingState
|
||||
from .mbcssm import UTF8_SM_MODEL
|
||||
|
||||
|
||||
class UTF8Prober(CharSetProber):
|
||||
ONE_CHAR_PROB = 0.5
|
||||
|
||||
def __init__(self):
|
||||
super(UTF8Prober, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
|
||||
self._num_mb_chars = None
|
||||
self._num_mb_chars = 0
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
super(UTF8Prober, self).reset()
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
self.coding_sm.reset()
|
||||
self._num_mb_chars = 0
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "utf-8"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return ""
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
for c in byte_str:
|
||||
coding_state = self.coding_sm.next_state(c)
|
||||
if coding_state == MachineState.ERROR:
|
||||
self._state = ProbingState.NOT_ME
|
||||
break
|
||||
elif coding_state == MachineState.ITS_ME:
|
||||
if coding_state == MachineState.ITS_ME:
|
||||
self._state = ProbingState.FOUND_IT
|
||||
break
|
||||
elif coding_state == MachineState.START:
|
||||
if coding_state == MachineState.START:
|
||||
if self.coding_sm.get_current_charlen() >= 2:
|
||||
self._num_mb_chars += 1
|
||||
|
||||
@@ -73,10 +74,9 @@ class UTF8Prober(CharSetProber):
|
||||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
unlike = 0.99
|
||||
if self._num_mb_chars < 6:
|
||||
unlike *= self.ONE_CHAR_PROB ** self._num_mb_chars
|
||||
unlike *= self.ONE_CHAR_PROB**self._num_mb_chars
|
||||
return 1.0 - unlike
|
||||
else:
|
||||
return unlike
|
||||
return unlike
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
"""
|
||||
This module exists only to simplify retrieving the version number of chardet
|
||||
from within setup.py and from chardet subpackages.
|
||||
from within setuptools and from chardet subpackages.
|
||||
|
||||
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
||||
"""
|
||||
|
||||
__version__ = "4.0.0"
|
||||
VERSION = __version__.split('.')
|
||||
__version__ = "5.1.0"
|
||||
VERSION = __version__.split(".")
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
||||
from .initialise import init, deinit, reinit, colorama_text
|
||||
from .initialise import init, deinit, reinit, colorama_text, just_fix_windows_console
|
||||
from .ansi import Fore, Back, Style, Cursor
|
||||
from .ansitowin32 import AnsiToWin32
|
||||
|
||||
__version__ = '0.4.4'
|
||||
__version__ = '0.4.6'
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ import sys
|
||||
import os
|
||||
|
||||
from .ansi import AnsiFore, AnsiBack, AnsiStyle, Style, BEL
|
||||
from .winterm import WinTerm, WinColor, WinStyle
|
||||
from .winterm import enable_vt_processing, WinTerm, WinColor, WinStyle
|
||||
from .win32 import windll, winapi_test
|
||||
|
||||
|
||||
@@ -37,6 +37,12 @@ class StreamWrapper(object):
|
||||
def __exit__(self, *args, **kwargs):
|
||||
return self.__wrapped.__exit__(*args, **kwargs)
|
||||
|
||||
def __setstate__(self, state):
|
||||
self.__dict__ = state
|
||||
|
||||
def __getstate__(self):
|
||||
return self.__dict__
|
||||
|
||||
def write(self, text):
|
||||
self.__convertor.write(text)
|
||||
|
||||
@@ -57,7 +63,9 @@ class StreamWrapper(object):
|
||||
stream = self.__wrapped
|
||||
try:
|
||||
return stream.closed
|
||||
except AttributeError:
|
||||
# AttributeError in the case that the stream doesn't support being closed
|
||||
# ValueError for the case that the stream has already been detached when atexit runs
|
||||
except (AttributeError, ValueError):
|
||||
return True
|
||||
|
||||
|
||||
@@ -86,15 +94,22 @@ class AnsiToWin32(object):
|
||||
# (e.g. Cygwin Terminal). In this case it's up to the terminal
|
||||
# to support the ANSI codes.
|
||||
conversion_supported = on_windows and winapi_test()
|
||||
try:
|
||||
fd = wrapped.fileno()
|
||||
except Exception:
|
||||
fd = -1
|
||||
system_has_native_ansi = not on_windows or enable_vt_processing(fd)
|
||||
have_tty = not self.stream.closed and self.stream.isatty()
|
||||
need_conversion = conversion_supported and not system_has_native_ansi
|
||||
|
||||
# should we strip ANSI sequences from our output?
|
||||
if strip is None:
|
||||
strip = conversion_supported or (not self.stream.closed and not self.stream.isatty())
|
||||
strip = need_conversion or not have_tty
|
||||
self.strip = strip
|
||||
|
||||
# should we should convert ANSI sequences into win32 calls?
|
||||
if convert is None:
|
||||
convert = conversion_supported and not self.stream.closed and self.stream.isatty()
|
||||
convert = need_conversion and have_tty
|
||||
self.convert = convert
|
||||
|
||||
# dict of ansi codes to win32 functions and parameters
|
||||
@@ -256,3 +271,7 @@ class AnsiToWin32(object):
|
||||
if params[0] in '02':
|
||||
winterm.set_title(params[1])
|
||||
return text
|
||||
|
||||
|
||||
def flush(self):
|
||||
self.wrapped.flush()
|
||||
|
||||
@@ -6,13 +6,27 @@ import sys
|
||||
from .ansitowin32 import AnsiToWin32
|
||||
|
||||
|
||||
orig_stdout = None
|
||||
orig_stderr = None
|
||||
def _wipe_internal_state_for_tests():
|
||||
global orig_stdout, orig_stderr
|
||||
orig_stdout = None
|
||||
orig_stderr = None
|
||||
|
||||
wrapped_stdout = None
|
||||
wrapped_stderr = None
|
||||
global wrapped_stdout, wrapped_stderr
|
||||
wrapped_stdout = None
|
||||
wrapped_stderr = None
|
||||
|
||||
atexit_done = False
|
||||
global atexit_done
|
||||
atexit_done = False
|
||||
|
||||
global fixed_windows_console
|
||||
fixed_windows_console = False
|
||||
|
||||
try:
|
||||
# no-op if it wasn't registered
|
||||
atexit.unregister(reset_all)
|
||||
except AttributeError:
|
||||
# python 2: no atexit.unregister. Oh well, we did our best.
|
||||
pass
|
||||
|
||||
|
||||
def reset_all():
|
||||
@@ -55,6 +69,29 @@ def deinit():
|
||||
sys.stderr = orig_stderr
|
||||
|
||||
|
||||
def just_fix_windows_console():
|
||||
global fixed_windows_console
|
||||
|
||||
if sys.platform != "win32":
|
||||
return
|
||||
if fixed_windows_console:
|
||||
return
|
||||
if wrapped_stdout is not None or wrapped_stderr is not None:
|
||||
# Someone already ran init() and it did stuff, so we won't second-guess them
|
||||
return
|
||||
|
||||
# On newer versions of Windows, AnsiToWin32.__init__ will implicitly enable the
|
||||
# native ANSI support in the console as a side-effect. We only need to actually
|
||||
# replace sys.stdout/stderr if we're in the old-style conversion mode.
|
||||
new_stdout = AnsiToWin32(sys.stdout, convert=None, strip=None, autoreset=False)
|
||||
if new_stdout.convert:
|
||||
sys.stdout = new_stdout
|
||||
new_stderr = AnsiToWin32(sys.stderr, convert=None, strip=None, autoreset=False)
|
||||
if new_stderr.convert:
|
||||
sys.stderr = new_stderr
|
||||
|
||||
fixed_windows_console = True
|
||||
|
||||
@contextlib.contextmanager
|
||||
def colorama_text(*args, **kwargs):
|
||||
init(*args, **kwargs)
|
||||
@@ -78,3 +115,7 @@ def wrap_stream(stream, convert, strip, autoreset, wrap):
|
||||
if wrapper.should_wrap():
|
||||
stream = wrapper.stream
|
||||
return stream
|
||||
|
||||
|
||||
# Use this for initial setup as well, to reduce code duplication
|
||||
_wipe_internal_state_for_tests()
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
STDOUT = -11
|
||||
STDERR = -12
|
||||
|
||||
ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
|
||||
|
||||
try:
|
||||
import ctypes
|
||||
from ctypes import LibraryLoader
|
||||
@@ -89,6 +91,20 @@ else:
|
||||
]
|
||||
_SetConsoleTitleW.restype = wintypes.BOOL
|
||||
|
||||
_GetConsoleMode = windll.kernel32.GetConsoleMode
|
||||
_GetConsoleMode.argtypes = [
|
||||
wintypes.HANDLE,
|
||||
POINTER(wintypes.DWORD)
|
||||
]
|
||||
_GetConsoleMode.restype = wintypes.BOOL
|
||||
|
||||
_SetConsoleMode = windll.kernel32.SetConsoleMode
|
||||
_SetConsoleMode.argtypes = [
|
||||
wintypes.HANDLE,
|
||||
wintypes.DWORD
|
||||
]
|
||||
_SetConsoleMode.restype = wintypes.BOOL
|
||||
|
||||
def _winapi_test(handle):
|
||||
csbi = CONSOLE_SCREEN_BUFFER_INFO()
|
||||
success = _GetConsoleScreenBufferInfo(
|
||||
@@ -150,3 +166,15 @@ else:
|
||||
|
||||
def SetConsoleTitle(title):
|
||||
return _SetConsoleTitleW(title)
|
||||
|
||||
def GetConsoleMode(handle):
|
||||
mode = wintypes.DWORD()
|
||||
success = _GetConsoleMode(handle, byref(mode))
|
||||
if not success:
|
||||
raise ctypes.WinError()
|
||||
return mode.value
|
||||
|
||||
def SetConsoleMode(handle, mode):
|
||||
success = _SetConsoleMode(handle, mode)
|
||||
if not success:
|
||||
raise ctypes.WinError()
|
||||
|
||||
@@ -1,7 +1,13 @@
|
||||
# Copyright Jonathan Hartley 2013. BSD 3-Clause license, see LICENSE file.
|
||||
from . import win32
|
||||
try:
|
||||
from msvcrt import get_osfhandle
|
||||
except ImportError:
|
||||
def get_osfhandle(_):
|
||||
raise OSError("This isn't windows!")
|
||||
|
||||
|
||||
from . import win32
|
||||
|
||||
# from wincon.h
|
||||
class WinColor(object):
|
||||
BLACK = 0
|
||||
@@ -167,3 +173,23 @@ class WinTerm(object):
|
||||
|
||||
def set_title(self, title):
|
||||
win32.SetConsoleTitle(title)
|
||||
|
||||
|
||||
def enable_vt_processing(fd):
|
||||
if win32.windll is None or not win32.winapi_test():
|
||||
return False
|
||||
|
||||
try:
|
||||
handle = get_osfhandle(fd)
|
||||
mode = win32.GetConsoleMode(handle)
|
||||
win32.SetConsoleMode(
|
||||
handle,
|
||||
mode | win32.ENABLE_VIRTUAL_TERMINAL_PROCESSING,
|
||||
)
|
||||
|
||||
mode = win32.GetConsoleMode(handle)
|
||||
if mode & win32.ENABLE_VIRTUAL_TERMINAL_PROCESSING:
|
||||
return True
|
||||
# Can get TypeError in testsuite where 'fd' is a Mock()
|
||||
except (OSError, TypeError):
|
||||
return False
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright (C) 2012-2019 Vinay Sajip.
|
||||
# Copyright (C) 2012-2022 Vinay Sajip.
|
||||
# Licensed to the Python Software Foundation under a contributor agreement.
|
||||
# See LICENSE.txt and CONTRIBUTORS.txt.
|
||||
#
|
||||
import logging
|
||||
|
||||
__version__ = '0.3.2'
|
||||
__version__ = '0.3.6'
|
||||
|
||||
class DistlibException(Exception):
|
||||
pass
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
"""Modules copied from Python 3 standard libraries, for internal use only.
|
||||
|
||||
Individual classes and functions are found in d2._backport.misc. Intended
|
||||
usage is to always import things missing from 3.1 from that module: the
|
||||
built-in/stdlib objects will be used if found.
|
||||
"""
|
||||
@@ -1,41 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright (C) 2012 The Python Software Foundation.
|
||||
# See LICENSE.txt and CONTRIBUTORS.txt.
|
||||
#
|
||||
"""Backports for individual classes and functions."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
__all__ = ['cache_from_source', 'callable', 'fsencode']
|
||||
|
||||
|
||||
try:
|
||||
from imp import cache_from_source
|
||||
except ImportError:
|
||||
def cache_from_source(py_file, debug=__debug__):
|
||||
ext = debug and 'c' or 'o'
|
||||
return py_file + ext
|
||||
|
||||
|
||||
try:
|
||||
callable = callable
|
||||
except NameError:
|
||||
from collections import Callable
|
||||
|
||||
def callable(obj):
|
||||
return isinstance(obj, Callable)
|
||||
|
||||
|
||||
try:
|
||||
fsencode = os.fsencode
|
||||
except AttributeError:
|
||||
def fsencode(filename):
|
||||
if isinstance(filename, bytes):
|
||||
return filename
|
||||
elif isinstance(filename, str):
|
||||
return filename.encode(sys.getfilesystemencoding())
|
||||
else:
|
||||
raise TypeError("expect bytes or str, not %s" %
|
||||
type(filename).__name__)
|
||||
@@ -1,764 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright (C) 2012 The Python Software Foundation.
|
||||
# See LICENSE.txt and CONTRIBUTORS.txt.
|
||||
#
|
||||
"""Utility functions for copying and archiving files and directory trees.
|
||||
|
||||
XXX The functions here don't copy the resource fork or other metadata on Mac.
|
||||
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import stat
|
||||
from os.path import abspath
|
||||
import fnmatch
|
||||
try:
|
||||
from collections.abc import Callable
|
||||
except ImportError:
|
||||
from collections import Callable
|
||||
import errno
|
||||
from . import tarfile
|
||||
|
||||
try:
|
||||
import bz2
|
||||
_BZ2_SUPPORTED = True
|
||||
except ImportError:
|
||||
_BZ2_SUPPORTED = False
|
||||
|
||||
try:
|
||||
from pwd import getpwnam
|
||||
except ImportError:
|
||||
getpwnam = None
|
||||
|
||||
try:
|
||||
from grp import getgrnam
|
||||
except ImportError:
|
||||
getgrnam = None
|
||||
|
||||
__all__ = ["copyfileobj", "copyfile", "copymode", "copystat", "copy", "copy2",
|
||||
"copytree", "move", "rmtree", "Error", "SpecialFileError",
|
||||
"ExecError", "make_archive", "get_archive_formats",
|
||||
"register_archive_format", "unregister_archive_format",
|
||||
"get_unpack_formats", "register_unpack_format",
|
||||
"unregister_unpack_format", "unpack_archive", "ignore_patterns"]
|
||||
|
||||
class Error(EnvironmentError):
|
||||
pass
|
||||
|
||||
class SpecialFileError(EnvironmentError):
|
||||
"""Raised when trying to do a kind of operation (e.g. copying) which is
|
||||
not supported on a special file (e.g. a named pipe)"""
|
||||
|
||||
class ExecError(EnvironmentError):
|
||||
"""Raised when a command could not be executed"""
|
||||
|
||||
class ReadError(EnvironmentError):
|
||||
"""Raised when an archive cannot be read"""
|
||||
|
||||
class RegistryError(Exception):
|
||||
"""Raised when a registry operation with the archiving
|
||||
and unpacking registries fails"""
|
||||
|
||||
|
||||
try:
|
||||
WindowsError
|
||||
except NameError:
|
||||
WindowsError = None
|
||||
|
||||
def copyfileobj(fsrc, fdst, length=16*1024):
|
||||
"""copy data from file-like object fsrc to file-like object fdst"""
|
||||
while 1:
|
||||
buf = fsrc.read(length)
|
||||
if not buf:
|
||||
break
|
||||
fdst.write(buf)
|
||||
|
||||
def _samefile(src, dst):
|
||||
# Macintosh, Unix.
|
||||
if hasattr(os.path, 'samefile'):
|
||||
try:
|
||||
return os.path.samefile(src, dst)
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
# All other platforms: check for same pathname.
|
||||
return (os.path.normcase(os.path.abspath(src)) ==
|
||||
os.path.normcase(os.path.abspath(dst)))
|
||||
|
||||
def copyfile(src, dst):
|
||||
"""Copy data from src to dst"""
|
||||
if _samefile(src, dst):
|
||||
raise Error("`%s` and `%s` are the same file" % (src, dst))
|
||||
|
||||
for fn in [src, dst]:
|
||||
try:
|
||||
st = os.stat(fn)
|
||||
except OSError:
|
||||
# File most likely does not exist
|
||||
pass
|
||||
else:
|
||||
# XXX What about other special files? (sockets, devices...)
|
||||
if stat.S_ISFIFO(st.st_mode):
|
||||
raise SpecialFileError("`%s` is a named pipe" % fn)
|
||||
|
||||
with open(src, 'rb') as fsrc:
|
||||
with open(dst, 'wb') as fdst:
|
||||
copyfileobj(fsrc, fdst)
|
||||
|
||||
def copymode(src, dst):
|
||||
"""Copy mode bits from src to dst"""
|
||||
if hasattr(os, 'chmod'):
|
||||
st = os.stat(src)
|
||||
mode = stat.S_IMODE(st.st_mode)
|
||||
os.chmod(dst, mode)
|
||||
|
||||
def copystat(src, dst):
|
||||
"""Copy all stat info (mode bits, atime, mtime, flags) from src to dst"""
|
||||
st = os.stat(src)
|
||||
mode = stat.S_IMODE(st.st_mode)
|
||||
if hasattr(os, 'utime'):
|
||||
os.utime(dst, (st.st_atime, st.st_mtime))
|
||||
if hasattr(os, 'chmod'):
|
||||
os.chmod(dst, mode)
|
||||
if hasattr(os, 'chflags') and hasattr(st, 'st_flags'):
|
||||
try:
|
||||
os.chflags(dst, st.st_flags)
|
||||
except OSError as why:
|
||||
if (not hasattr(errno, 'EOPNOTSUPP') or
|
||||
why.errno != errno.EOPNOTSUPP):
|
||||
raise
|
||||
|
||||
def copy(src, dst):
|
||||
"""Copy data and mode bits ("cp src dst").
|
||||
|
||||
The destination may be a directory.
|
||||
|
||||
"""
|
||||
if os.path.isdir(dst):
|
||||
dst = os.path.join(dst, os.path.basename(src))
|
||||
copyfile(src, dst)
|
||||
copymode(src, dst)
|
||||
|
||||
def copy2(src, dst):
|
||||
"""Copy data and all stat info ("cp -p src dst").
|
||||
|
||||
The destination may be a directory.
|
||||
|
||||
"""
|
||||
if os.path.isdir(dst):
|
||||
dst = os.path.join(dst, os.path.basename(src))
|
||||
copyfile(src, dst)
|
||||
copystat(src, dst)
|
||||
|
||||
def ignore_patterns(*patterns):
|
||||
"""Function that can be used as copytree() ignore parameter.
|
||||
|
||||
Patterns is a sequence of glob-style patterns
|
||||
that are used to exclude files"""
|
||||
def _ignore_patterns(path, names):
|
||||
ignored_names = []
|
||||
for pattern in patterns:
|
||||
ignored_names.extend(fnmatch.filter(names, pattern))
|
||||
return set(ignored_names)
|
||||
return _ignore_patterns
|
||||
|
||||
def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
|
||||
ignore_dangling_symlinks=False):
|
||||
"""Recursively copy a directory tree.
|
||||
|
||||
The destination directory must not already exist.
|
||||
If exception(s) occur, an Error is raised with a list of reasons.
|
||||
|
||||
If the optional symlinks flag is true, symbolic links in the
|
||||
source tree result in symbolic links in the destination tree; if
|
||||
it is false, the contents of the files pointed to by symbolic
|
||||
links are copied. If the file pointed by the symlink doesn't
|
||||
exist, an exception will be added in the list of errors raised in
|
||||
an Error exception at the end of the copy process.
|
||||
|
||||
You can set the optional ignore_dangling_symlinks flag to true if you
|
||||
want to silence this exception. Notice that this has no effect on
|
||||
platforms that don't support os.symlink.
|
||||
|
||||
The optional ignore argument is a callable. If given, it
|
||||
is called with the `src` parameter, which is the directory
|
||||
being visited by copytree(), and `names` which is the list of
|
||||
`src` contents, as returned by os.listdir():
|
||||
|
||||
callable(src, names) -> ignored_names
|
||||
|
||||
Since copytree() is called recursively, the callable will be
|
||||
called once for each directory that is copied. It returns a
|
||||
list of names relative to the `src` directory that should
|
||||
not be copied.
|
||||
|
||||
The optional copy_function argument is a callable that will be used
|
||||
to copy each file. It will be called with the source path and the
|
||||
destination path as arguments. By default, copy2() is used, but any
|
||||
function that supports the same signature (like copy()) can be used.
|
||||
|
||||
"""
|
||||
names = os.listdir(src)
|
||||
if ignore is not None:
|
||||
ignored_names = ignore(src, names)
|
||||
else:
|
||||
ignored_names = set()
|
||||
|
||||
os.makedirs(dst)
|
||||
errors = []
|
||||
for name in names:
|
||||
if name in ignored_names:
|
||||
continue
|
||||
srcname = os.path.join(src, name)
|
||||
dstname = os.path.join(dst, name)
|
||||
try:
|
||||
if os.path.islink(srcname):
|
||||
linkto = os.readlink(srcname)
|
||||
if symlinks:
|
||||
os.symlink(linkto, dstname)
|
||||
else:
|
||||
# ignore dangling symlink if the flag is on
|
||||
if not os.path.exists(linkto) and ignore_dangling_symlinks:
|
||||
continue
|
||||
# otherwise let the copy occurs. copy2 will raise an error
|
||||
copy_function(srcname, dstname)
|
||||
elif os.path.isdir(srcname):
|
||||
copytree(srcname, dstname, symlinks, ignore, copy_function)
|
||||
else:
|
||||
# Will raise a SpecialFileError for unsupported file types
|
||||
copy_function(srcname, dstname)
|
||||
# catch the Error from the recursive copytree so that we can
|
||||
# continue with other files
|
||||
except Error as err:
|
||||
errors.extend(err.args[0])
|
||||
except EnvironmentError as why:
|
||||
errors.append((srcname, dstname, str(why)))
|
||||
try:
|
||||
copystat(src, dst)
|
||||
except OSError as why:
|
||||
if WindowsError is not None and isinstance(why, WindowsError):
|
||||
# Copying file access times may fail on Windows
|
||||
pass
|
||||
else:
|
||||
errors.extend((src, dst, str(why)))
|
||||
if errors:
|
||||
raise Error(errors)
|
||||
|
||||
def rmtree(path, ignore_errors=False, onerror=None):
|
||||
"""Recursively delete a directory tree.
|
||||
|
||||
If ignore_errors is set, errors are ignored; otherwise, if onerror
|
||||
is set, it is called to handle the error with arguments (func,
|
||||
path, exc_info) where func is os.listdir, os.remove, or os.rmdir;
|
||||
path is the argument to that function that caused it to fail; and
|
||||
exc_info is a tuple returned by sys.exc_info(). If ignore_errors
|
||||
is false and onerror is None, an exception is raised.
|
||||
|
||||
"""
|
||||
if ignore_errors:
|
||||
def onerror(*args):
|
||||
pass
|
||||
elif onerror is None:
|
||||
def onerror(*args):
|
||||
raise
|
||||
try:
|
||||
if os.path.islink(path):
|
||||
# symlinks to directories are forbidden, see bug #1669
|
||||
raise OSError("Cannot call rmtree on a symbolic link")
|
||||
except OSError:
|
||||
onerror(os.path.islink, path, sys.exc_info())
|
||||
# can't continue even if onerror hook returns
|
||||
return
|
||||
names = []
|
||||
try:
|
||||
names = os.listdir(path)
|
||||
except os.error:
|
||||
onerror(os.listdir, path, sys.exc_info())
|
||||
for name in names:
|
||||
fullname = os.path.join(path, name)
|
||||
try:
|
||||
mode = os.lstat(fullname).st_mode
|
||||
except os.error:
|
||||
mode = 0
|
||||
if stat.S_ISDIR(mode):
|
||||
rmtree(fullname, ignore_errors, onerror)
|
||||
else:
|
||||
try:
|
||||
os.remove(fullname)
|
||||
except os.error:
|
||||
onerror(os.remove, fullname, sys.exc_info())
|
||||
try:
|
||||
os.rmdir(path)
|
||||
except os.error:
|
||||
onerror(os.rmdir, path, sys.exc_info())
|
||||
|
||||
|
||||
def _basename(path):
|
||||
# A basename() variant which first strips the trailing slash, if present.
|
||||
# Thus we always get the last component of the path, even for directories.
|
||||
return os.path.basename(path.rstrip(os.path.sep))
|
||||
|
||||
def move(src, dst):
|
||||
"""Recursively move a file or directory to another location. This is
|
||||
similar to the Unix "mv" command.
|
||||
|
||||
If the destination is a directory or a symlink to a directory, the source
|
||||
is moved inside the directory. The destination path must not already
|
||||
exist.
|
||||
|
||||
If the destination already exists but is not a directory, it may be
|
||||
overwritten depending on os.rename() semantics.
|
||||
|
||||
If the destination is on our current filesystem, then rename() is used.
|
||||
Otherwise, src is copied to the destination and then removed.
|
||||
A lot more could be done here... A look at a mv.c shows a lot of
|
||||
the issues this implementation glosses over.
|
||||
|
||||
"""
|
||||
real_dst = dst
|
||||
if os.path.isdir(dst):
|
||||
if _samefile(src, dst):
|
||||
# We might be on a case insensitive filesystem,
|
||||
# perform the rename anyway.
|
||||
os.rename(src, dst)
|
||||
return
|
||||
|
||||
real_dst = os.path.join(dst, _basename(src))
|
||||
if os.path.exists(real_dst):
|
||||
raise Error("Destination path '%s' already exists" % real_dst)
|
||||
try:
|
||||
os.rename(src, real_dst)
|
||||
except OSError:
|
||||
if os.path.isdir(src):
|
||||
if _destinsrc(src, dst):
|
||||
raise Error("Cannot move a directory '%s' into itself '%s'." % (src, dst))
|
||||
copytree(src, real_dst, symlinks=True)
|
||||
rmtree(src)
|
||||
else:
|
||||
copy2(src, real_dst)
|
||||
os.unlink(src)
|
||||
|
||||
def _destinsrc(src, dst):
|
||||
src = abspath(src)
|
||||
dst = abspath(dst)
|
||||
if not src.endswith(os.path.sep):
|
||||
src += os.path.sep
|
||||
if not dst.endswith(os.path.sep):
|
||||
dst += os.path.sep
|
||||
return dst.startswith(src)
|
||||
|
||||
def _get_gid(name):
|
||||
"""Returns a gid, given a group name."""
|
||||
if getgrnam is None or name is None:
|
||||
return None
|
||||
try:
|
||||
result = getgrnam(name)
|
||||
except KeyError:
|
||||
result = None
|
||||
if result is not None:
|
||||
return result[2]
|
||||
return None
|
||||
|
||||
def _get_uid(name):
|
||||
"""Returns an uid, given a user name."""
|
||||
if getpwnam is None or name is None:
|
||||
return None
|
||||
try:
|
||||
result = getpwnam(name)
|
||||
except KeyError:
|
||||
result = None
|
||||
if result is not None:
|
||||
return result[2]
|
||||
return None
|
||||
|
||||
def _make_tarball(base_name, base_dir, compress="gzip", verbose=0, dry_run=0,
|
||||
owner=None, group=None, logger=None):
|
||||
"""Create a (possibly compressed) tar file from all the files under
|
||||
'base_dir'.
|
||||
|
||||
'compress' must be "gzip" (the default), "bzip2", or None.
|
||||
|
||||
'owner' and 'group' can be used to define an owner and a group for the
|
||||
archive that is being built. If not provided, the current owner and group
|
||||
will be used.
|
||||
|
||||
The output tar file will be named 'base_name' + ".tar", possibly plus
|
||||
the appropriate compression extension (".gz", or ".bz2").
|
||||
|
||||
Returns the output filename.
|
||||
"""
|
||||
tar_compression = {'gzip': 'gz', None: ''}
|
||||
compress_ext = {'gzip': '.gz'}
|
||||
|
||||
if _BZ2_SUPPORTED:
|
||||
tar_compression['bzip2'] = 'bz2'
|
||||
compress_ext['bzip2'] = '.bz2'
|
||||
|
||||
# flags for compression program, each element of list will be an argument
|
||||
if compress is not None and compress not in compress_ext:
|
||||
raise ValueError("bad value for 'compress', or compression format not "
|
||||
"supported : {0}".format(compress))
|
||||
|
||||
archive_name = base_name + '.tar' + compress_ext.get(compress, '')
|
||||
archive_dir = os.path.dirname(archive_name)
|
||||
|
||||
if not os.path.exists(archive_dir):
|
||||
if logger is not None:
|
||||
logger.info("creating %s", archive_dir)
|
||||
if not dry_run:
|
||||
os.makedirs(archive_dir)
|
||||
|
||||
# creating the tarball
|
||||
if logger is not None:
|
||||
logger.info('Creating tar archive')
|
||||
|
||||
uid = _get_uid(owner)
|
||||
gid = _get_gid(group)
|
||||
|
||||
def _set_uid_gid(tarinfo):
|
||||
if gid is not None:
|
||||
tarinfo.gid = gid
|
||||
tarinfo.gname = group
|
||||
if uid is not None:
|
||||
tarinfo.uid = uid
|
||||
tarinfo.uname = owner
|
||||
return tarinfo
|
||||
|
||||
if not dry_run:
|
||||
tar = tarfile.open(archive_name, 'w|%s' % tar_compression[compress])
|
||||
try:
|
||||
tar.add(base_dir, filter=_set_uid_gid)
|
||||
finally:
|
||||
tar.close()
|
||||
|
||||
return archive_name
|
||||
|
||||
def _call_external_zip(base_dir, zip_filename, verbose=False, dry_run=False):
|
||||
# XXX see if we want to keep an external call here
|
||||
if verbose:
|
||||
zipoptions = "-r"
|
||||
else:
|
||||
zipoptions = "-rq"
|
||||
from distutils.errors import DistutilsExecError
|
||||
from distutils.spawn import spawn
|
||||
try:
|
||||
spawn(["zip", zipoptions, zip_filename, base_dir], dry_run=dry_run)
|
||||
except DistutilsExecError:
|
||||
# XXX really should distinguish between "couldn't find
|
||||
# external 'zip' command" and "zip failed".
|
||||
raise ExecError("unable to create zip file '%s': "
|
||||
"could neither import the 'zipfile' module nor "
|
||||
"find a standalone zip utility") % zip_filename
|
||||
|
||||
def _make_zipfile(base_name, base_dir, verbose=0, dry_run=0, logger=None):
|
||||
"""Create a zip file from all the files under 'base_dir'.
|
||||
|
||||
The output zip file will be named 'base_name' + ".zip". Uses either the
|
||||
"zipfile" Python module (if available) or the InfoZIP "zip" utility
|
||||
(if installed and found on the default search path). If neither tool is
|
||||
available, raises ExecError. Returns the name of the output zip
|
||||
file.
|
||||
"""
|
||||
zip_filename = base_name + ".zip"
|
||||
archive_dir = os.path.dirname(base_name)
|
||||
|
||||
if not os.path.exists(archive_dir):
|
||||
if logger is not None:
|
||||
logger.info("creating %s", archive_dir)
|
||||
if not dry_run:
|
||||
os.makedirs(archive_dir)
|
||||
|
||||
# If zipfile module is not available, try spawning an external 'zip'
|
||||
# command.
|
||||
try:
|
||||
import zipfile
|
||||
except ImportError:
|
||||
zipfile = None
|
||||
|
||||
if zipfile is None:
|
||||
_call_external_zip(base_dir, zip_filename, verbose, dry_run)
|
||||
else:
|
||||
if logger is not None:
|
||||
logger.info("creating '%s' and adding '%s' to it",
|
||||
zip_filename, base_dir)
|
||||
|
||||
if not dry_run:
|
||||
zip = zipfile.ZipFile(zip_filename, "w",
|
||||
compression=zipfile.ZIP_DEFLATED)
|
||||
|
||||
for dirpath, dirnames, filenames in os.walk(base_dir):
|
||||
for name in filenames:
|
||||
path = os.path.normpath(os.path.join(dirpath, name))
|
||||
if os.path.isfile(path):
|
||||
zip.write(path, path)
|
||||
if logger is not None:
|
||||
logger.info("adding '%s'", path)
|
||||
zip.close()
|
||||
|
||||
return zip_filename
|
||||
|
||||
_ARCHIVE_FORMATS = {
|
||||
'gztar': (_make_tarball, [('compress', 'gzip')], "gzip'ed tar-file"),
|
||||
'bztar': (_make_tarball, [('compress', 'bzip2')], "bzip2'ed tar-file"),
|
||||
'tar': (_make_tarball, [('compress', None)], "uncompressed tar file"),
|
||||
'zip': (_make_zipfile, [], "ZIP file"),
|
||||
}
|
||||
|
||||
if _BZ2_SUPPORTED:
|
||||
_ARCHIVE_FORMATS['bztar'] = (_make_tarball, [('compress', 'bzip2')],
|
||||
"bzip2'ed tar-file")
|
||||
|
||||
def get_archive_formats():
|
||||
"""Returns a list of supported formats for archiving and unarchiving.
|
||||
|
||||
Each element of the returned sequence is a tuple (name, description)
|
||||
"""
|
||||
formats = [(name, registry[2]) for name, registry in
|
||||
_ARCHIVE_FORMATS.items()]
|
||||
formats.sort()
|
||||
return formats
|
||||
|
||||
def register_archive_format(name, function, extra_args=None, description=''):
|
||||
"""Registers an archive format.
|
||||
|
||||
name is the name of the format. function is the callable that will be
|
||||
used to create archives. If provided, extra_args is a sequence of
|
||||
(name, value) tuples that will be passed as arguments to the callable.
|
||||
description can be provided to describe the format, and will be returned
|
||||
by the get_archive_formats() function.
|
||||
"""
|
||||
if extra_args is None:
|
||||
extra_args = []
|
||||
if not isinstance(function, Callable):
|
||||
raise TypeError('The %s object is not callable' % function)
|
||||
if not isinstance(extra_args, (tuple, list)):
|
||||
raise TypeError('extra_args needs to be a sequence')
|
||||
for element in extra_args:
|
||||
if not isinstance(element, (tuple, list)) or len(element) !=2:
|
||||
raise TypeError('extra_args elements are : (arg_name, value)')
|
||||
|
||||
_ARCHIVE_FORMATS[name] = (function, extra_args, description)
|
||||
|
||||
def unregister_archive_format(name):
|
||||
del _ARCHIVE_FORMATS[name]
|
||||
|
||||
def make_archive(base_name, format, root_dir=None, base_dir=None, verbose=0,
|
||||
dry_run=0, owner=None, group=None, logger=None):
|
||||
"""Create an archive file (eg. zip or tar).
|
||||
|
||||
'base_name' is the name of the file to create, minus any format-specific
|
||||
extension; 'format' is the archive format: one of "zip", "tar", "bztar"
|
||||
or "gztar".
|
||||
|
||||
'root_dir' is a directory that will be the root directory of the
|
||||
archive; ie. we typically chdir into 'root_dir' before creating the
|
||||
archive. 'base_dir' is the directory where we start archiving from;
|
||||
ie. 'base_dir' will be the common prefix of all files and
|
||||
directories in the archive. 'root_dir' and 'base_dir' both default
|
||||
to the current directory. Returns the name of the archive file.
|
||||
|
||||
'owner' and 'group' are used when creating a tar archive. By default,
|
||||
uses the current owner and group.
|
||||
"""
|
||||
save_cwd = os.getcwd()
|
||||
if root_dir is not None:
|
||||
if logger is not None:
|
||||
logger.debug("changing into '%s'", root_dir)
|
||||
base_name = os.path.abspath(base_name)
|
||||
if not dry_run:
|
||||
os.chdir(root_dir)
|
||||
|
||||
if base_dir is None:
|
||||
base_dir = os.curdir
|
||||
|
||||
kwargs = {'dry_run': dry_run, 'logger': logger}
|
||||
|
||||
try:
|
||||
format_info = _ARCHIVE_FORMATS[format]
|
||||
except KeyError:
|
||||
raise ValueError("unknown archive format '%s'" % format)
|
||||
|
||||
func = format_info[0]
|
||||
for arg, val in format_info[1]:
|
||||
kwargs[arg] = val
|
||||
|
||||
if format != 'zip':
|
||||
kwargs['owner'] = owner
|
||||
kwargs['group'] = group
|
||||
|
||||
try:
|
||||
filename = func(base_name, base_dir, **kwargs)
|
||||
finally:
|
||||
if root_dir is not None:
|
||||
if logger is not None:
|
||||
logger.debug("changing back to '%s'", save_cwd)
|
||||
os.chdir(save_cwd)
|
||||
|
||||
return filename
|
||||
|
||||
|
||||
def get_unpack_formats():
|
||||
"""Returns a list of supported formats for unpacking.
|
||||
|
||||
Each element of the returned sequence is a tuple
|
||||
(name, extensions, description)
|
||||
"""
|
||||
formats = [(name, info[0], info[3]) for name, info in
|
||||
_UNPACK_FORMATS.items()]
|
||||
formats.sort()
|
||||
return formats
|
||||
|
||||
def _check_unpack_options(extensions, function, extra_args):
|
||||
"""Checks what gets registered as an unpacker."""
|
||||
# first make sure no other unpacker is registered for this extension
|
||||
existing_extensions = {}
|
||||
for name, info in _UNPACK_FORMATS.items():
|
||||
for ext in info[0]:
|
||||
existing_extensions[ext] = name
|
||||
|
||||
for extension in extensions:
|
||||
if extension in existing_extensions:
|
||||
msg = '%s is already registered for "%s"'
|
||||
raise RegistryError(msg % (extension,
|
||||
existing_extensions[extension]))
|
||||
|
||||
if not isinstance(function, Callable):
|
||||
raise TypeError('The registered function must be a callable')
|
||||
|
||||
|
||||
def register_unpack_format(name, extensions, function, extra_args=None,
|
||||
description=''):
|
||||
"""Registers an unpack format.
|
||||
|
||||
`name` is the name of the format. `extensions` is a list of extensions
|
||||
corresponding to the format.
|
||||
|
||||
`function` is the callable that will be
|
||||
used to unpack archives. The callable will receive archives to unpack.
|
||||
If it's unable to handle an archive, it needs to raise a ReadError
|
||||
exception.
|
||||
|
||||
If provided, `extra_args` is a sequence of
|
||||
(name, value) tuples that will be passed as arguments to the callable.
|
||||
description can be provided to describe the format, and will be returned
|
||||
by the get_unpack_formats() function.
|
||||
"""
|
||||
if extra_args is None:
|
||||
extra_args = []
|
||||
_check_unpack_options(extensions, function, extra_args)
|
||||
_UNPACK_FORMATS[name] = extensions, function, extra_args, description
|
||||
|
||||
def unregister_unpack_format(name):
|
||||
"""Removes the pack format from the registry."""
|
||||
del _UNPACK_FORMATS[name]
|
||||
|
||||
def _ensure_directory(path):
|
||||
"""Ensure that the parent directory of `path` exists"""
|
||||
dirname = os.path.dirname(path)
|
||||
if not os.path.isdir(dirname):
|
||||
os.makedirs(dirname)
|
||||
|
||||
def _unpack_zipfile(filename, extract_dir):
|
||||
"""Unpack zip `filename` to `extract_dir`
|
||||
"""
|
||||
try:
|
||||
import zipfile
|
||||
except ImportError:
|
||||
raise ReadError('zlib not supported, cannot unpack this archive.')
|
||||
|
||||
if not zipfile.is_zipfile(filename):
|
||||
raise ReadError("%s is not a zip file" % filename)
|
||||
|
||||
zip = zipfile.ZipFile(filename)
|
||||
try:
|
||||
for info in zip.infolist():
|
||||
name = info.filename
|
||||
|
||||
# don't extract absolute paths or ones with .. in them
|
||||
if name.startswith('/') or '..' in name:
|
||||
continue
|
||||
|
||||
target = os.path.join(extract_dir, *name.split('/'))
|
||||
if not target:
|
||||
continue
|
||||
|
||||
_ensure_directory(target)
|
||||
if not name.endswith('/'):
|
||||
# file
|
||||
data = zip.read(info.filename)
|
||||
f = open(target, 'wb')
|
||||
try:
|
||||
f.write(data)
|
||||
finally:
|
||||
f.close()
|
||||
del data
|
||||
finally:
|
||||
zip.close()
|
||||
|
||||
def _unpack_tarfile(filename, extract_dir):
|
||||
"""Unpack tar/tar.gz/tar.bz2 `filename` to `extract_dir`
|
||||
"""
|
||||
try:
|
||||
tarobj = tarfile.open(filename)
|
||||
except tarfile.TarError:
|
||||
raise ReadError(
|
||||
"%s is not a compressed or uncompressed tar file" % filename)
|
||||
try:
|
||||
tarobj.extractall(extract_dir)
|
||||
finally:
|
||||
tarobj.close()
|
||||
|
||||
_UNPACK_FORMATS = {
|
||||
'gztar': (['.tar.gz', '.tgz'], _unpack_tarfile, [], "gzip'ed tar-file"),
|
||||
'tar': (['.tar'], _unpack_tarfile, [], "uncompressed tar file"),
|
||||
'zip': (['.zip'], _unpack_zipfile, [], "ZIP file")
|
||||
}
|
||||
|
||||
if _BZ2_SUPPORTED:
|
||||
_UNPACK_FORMATS['bztar'] = (['.bz2'], _unpack_tarfile, [],
|
||||
"bzip2'ed tar-file")
|
||||
|
||||
def _find_unpack_format(filename):
|
||||
for name, info in _UNPACK_FORMATS.items():
|
||||
for extension in info[0]:
|
||||
if filename.endswith(extension):
|
||||
return name
|
||||
return None
|
||||
|
||||
def unpack_archive(filename, extract_dir=None, format=None):
|
||||
"""Unpack an archive.
|
||||
|
||||
`filename` is the name of the archive.
|
||||
|
||||
`extract_dir` is the name of the target directory, where the archive
|
||||
is unpacked. If not provided, the current working directory is used.
|
||||
|
||||
`format` is the archive format: one of "zip", "tar", or "gztar". Or any
|
||||
other registered format. If not provided, unpack_archive will use the
|
||||
filename extension and see if an unpacker was registered for that
|
||||
extension.
|
||||
|
||||
In case none is found, a ValueError is raised.
|
||||
"""
|
||||
if extract_dir is None:
|
||||
extract_dir = os.getcwd()
|
||||
|
||||
if format is not None:
|
||||
try:
|
||||
format_info = _UNPACK_FORMATS[format]
|
||||
except KeyError:
|
||||
raise ValueError("Unknown unpack format '{0}'".format(format))
|
||||
|
||||
func = format_info[1]
|
||||
func(filename, extract_dir, **dict(format_info[2]))
|
||||
else:
|
||||
# we need to look at the registered unpackers supported extensions
|
||||
format = _find_unpack_format(filename)
|
||||
if format is None:
|
||||
raise ReadError("Unknown archive format '{0}'".format(filename))
|
||||
|
||||
func = _UNPACK_FORMATS[format][1]
|
||||
kwargs = dict(_UNPACK_FORMATS[format][2])
|
||||
func(filename, extract_dir, **kwargs)
|
||||
@@ -1,84 +0,0 @@
|
||||
[posix_prefix]
|
||||
# Configuration directories. Some of these come straight out of the
|
||||
# configure script. They are for implementing the other variables, not to
|
||||
# be used directly in [resource_locations].
|
||||
confdir = /etc
|
||||
datadir = /usr/share
|
||||
libdir = /usr/lib
|
||||
statedir = /var
|
||||
# User resource directory
|
||||
local = ~/.local/{distribution.name}
|
||||
|
||||
stdlib = {base}/lib/python{py_version_short}
|
||||
platstdlib = {platbase}/lib/python{py_version_short}
|
||||
purelib = {base}/lib/python{py_version_short}/site-packages
|
||||
platlib = {platbase}/lib/python{py_version_short}/site-packages
|
||||
include = {base}/include/python{py_version_short}{abiflags}
|
||||
platinclude = {platbase}/include/python{py_version_short}{abiflags}
|
||||
data = {base}
|
||||
|
||||
[posix_home]
|
||||
stdlib = {base}/lib/python
|
||||
platstdlib = {base}/lib/python
|
||||
purelib = {base}/lib/python
|
||||
platlib = {base}/lib/python
|
||||
include = {base}/include/python
|
||||
platinclude = {base}/include/python
|
||||
scripts = {base}/bin
|
||||
data = {base}
|
||||
|
||||
[nt]
|
||||
stdlib = {base}/Lib
|
||||
platstdlib = {base}/Lib
|
||||
purelib = {base}/Lib/site-packages
|
||||
platlib = {base}/Lib/site-packages
|
||||
include = {base}/Include
|
||||
platinclude = {base}/Include
|
||||
scripts = {base}/Scripts
|
||||
data = {base}
|
||||
|
||||
[os2]
|
||||
stdlib = {base}/Lib
|
||||
platstdlib = {base}/Lib
|
||||
purelib = {base}/Lib/site-packages
|
||||
platlib = {base}/Lib/site-packages
|
||||
include = {base}/Include
|
||||
platinclude = {base}/Include
|
||||
scripts = {base}/Scripts
|
||||
data = {base}
|
||||
|
||||
[os2_home]
|
||||
stdlib = {userbase}/lib/python{py_version_short}
|
||||
platstdlib = {userbase}/lib/python{py_version_short}
|
||||
purelib = {userbase}/lib/python{py_version_short}/site-packages
|
||||
platlib = {userbase}/lib/python{py_version_short}/site-packages
|
||||
include = {userbase}/include/python{py_version_short}
|
||||
scripts = {userbase}/bin
|
||||
data = {userbase}
|
||||
|
||||
[nt_user]
|
||||
stdlib = {userbase}/Python{py_version_nodot}
|
||||
platstdlib = {userbase}/Python{py_version_nodot}
|
||||
purelib = {userbase}/Python{py_version_nodot}/site-packages
|
||||
platlib = {userbase}/Python{py_version_nodot}/site-packages
|
||||
include = {userbase}/Python{py_version_nodot}/Include
|
||||
scripts = {userbase}/Scripts
|
||||
data = {userbase}
|
||||
|
||||
[posix_user]
|
||||
stdlib = {userbase}/lib/python{py_version_short}
|
||||
platstdlib = {userbase}/lib/python{py_version_short}
|
||||
purelib = {userbase}/lib/python{py_version_short}/site-packages
|
||||
platlib = {userbase}/lib/python{py_version_short}/site-packages
|
||||
include = {userbase}/include/python{py_version_short}
|
||||
scripts = {userbase}/bin
|
||||
data = {userbase}
|
||||
|
||||
[osx_framework_user]
|
||||
stdlib = {userbase}/lib/python
|
||||
platstdlib = {userbase}/lib/python
|
||||
purelib = {userbase}/lib/python/site-packages
|
||||
platlib = {userbase}/lib/python/site-packages
|
||||
include = {userbase}/include
|
||||
scripts = {userbase}/bin
|
||||
data = {userbase}
|
||||
@@ -1,786 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright (C) 2012 The Python Software Foundation.
|
||||
# See LICENSE.txt and CONTRIBUTORS.txt.
|
||||
#
|
||||
"""Access to Python's configuration information."""
|
||||
|
||||
import codecs
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from os.path import pardir, realpath
|
||||
try:
|
||||
import configparser
|
||||
except ImportError:
|
||||
import ConfigParser as configparser
|
||||
|
||||
|
||||
__all__ = [
|
||||
'get_config_h_filename',
|
||||
'get_config_var',
|
||||
'get_config_vars',
|
||||
'get_makefile_filename',
|
||||
'get_path',
|
||||
'get_path_names',
|
||||
'get_paths',
|
||||
'get_platform',
|
||||
'get_python_version',
|
||||
'get_scheme_names',
|
||||
'parse_config_h',
|
||||
]
|
||||
|
||||
|
||||
def _safe_realpath(path):
|
||||
try:
|
||||
return realpath(path)
|
||||
except OSError:
|
||||
return path
|
||||
|
||||
|
||||
if sys.executable:
|
||||
_PROJECT_BASE = os.path.dirname(_safe_realpath(sys.executable))
|
||||
else:
|
||||
# sys.executable can be empty if argv[0] has been changed and Python is
|
||||
# unable to retrieve the real program name
|
||||
_PROJECT_BASE = _safe_realpath(os.getcwd())
|
||||
|
||||
if os.name == "nt" and "pcbuild" in _PROJECT_BASE[-8:].lower():
|
||||
_PROJECT_BASE = _safe_realpath(os.path.join(_PROJECT_BASE, pardir))
|
||||
# PC/VS7.1
|
||||
if os.name == "nt" and "\\pc\\v" in _PROJECT_BASE[-10:].lower():
|
||||
_PROJECT_BASE = _safe_realpath(os.path.join(_PROJECT_BASE, pardir, pardir))
|
||||
# PC/AMD64
|
||||
if os.name == "nt" and "\\pcbuild\\amd64" in _PROJECT_BASE[-14:].lower():
|
||||
_PROJECT_BASE = _safe_realpath(os.path.join(_PROJECT_BASE, pardir, pardir))
|
||||
|
||||
|
||||
def is_python_build():
|
||||
for fn in ("Setup.dist", "Setup.local"):
|
||||
if os.path.isfile(os.path.join(_PROJECT_BASE, "Modules", fn)):
|
||||
return True
|
||||
return False
|
||||
|
||||
_PYTHON_BUILD = is_python_build()
|
||||
|
||||
_cfg_read = False
|
||||
|
||||
def _ensure_cfg_read():
|
||||
global _cfg_read
|
||||
if not _cfg_read:
|
||||
from ..resources import finder
|
||||
backport_package = __name__.rsplit('.', 1)[0]
|
||||
_finder = finder(backport_package)
|
||||
_cfgfile = _finder.find('sysconfig.cfg')
|
||||
assert _cfgfile, 'sysconfig.cfg exists'
|
||||
with _cfgfile.as_stream() as s:
|
||||
_SCHEMES.readfp(s)
|
||||
if _PYTHON_BUILD:
|
||||
for scheme in ('posix_prefix', 'posix_home'):
|
||||
_SCHEMES.set(scheme, 'include', '{srcdir}/Include')
|
||||
_SCHEMES.set(scheme, 'platinclude', '{projectbase}/.')
|
||||
|
||||
_cfg_read = True
|
||||
|
||||
|
||||
_SCHEMES = configparser.RawConfigParser()
|
||||
_VAR_REPL = re.compile(r'\{([^{]*?)\}')
|
||||
|
||||
def _expand_globals(config):
|
||||
_ensure_cfg_read()
|
||||
if config.has_section('globals'):
|
||||
globals = config.items('globals')
|
||||
else:
|
||||
globals = tuple()
|
||||
|
||||
sections = config.sections()
|
||||
for section in sections:
|
||||
if section == 'globals':
|
||||
continue
|
||||
for option, value in globals:
|
||||
if config.has_option(section, option):
|
||||
continue
|
||||
config.set(section, option, value)
|
||||
config.remove_section('globals')
|
||||
|
||||
# now expanding local variables defined in the cfg file
|
||||
#
|
||||
for section in config.sections():
|
||||
variables = dict(config.items(section))
|
||||
|
||||
def _replacer(matchobj):
|
||||
name = matchobj.group(1)
|
||||
if name in variables:
|
||||
return variables[name]
|
||||
return matchobj.group(0)
|
||||
|
||||
for option, value in config.items(section):
|
||||
config.set(section, option, _VAR_REPL.sub(_replacer, value))
|
||||
|
||||
#_expand_globals(_SCHEMES)
|
||||
|
||||
_PY_VERSION = '%s.%s.%s' % sys.version_info[:3]
|
||||
_PY_VERSION_SHORT = '%s.%s' % sys.version_info[:2]
|
||||
_PY_VERSION_SHORT_NO_DOT = '%s%s' % sys.version_info[:2]
|
||||
_PREFIX = os.path.normpath(sys.prefix)
|
||||
_EXEC_PREFIX = os.path.normpath(sys.exec_prefix)
|
||||
_CONFIG_VARS = None
|
||||
_USER_BASE = None
|
||||
|
||||
|
||||
def _subst_vars(path, local_vars):
|
||||
"""In the string `path`, replace tokens like {some.thing} with the
|
||||
corresponding value from the map `local_vars`.
|
||||
|
||||
If there is no corresponding value, leave the token unchanged.
|
||||
"""
|
||||
def _replacer(matchobj):
|
||||
name = matchobj.group(1)
|
||||
if name in local_vars:
|
||||
return local_vars[name]
|
||||
elif name in os.environ:
|
||||
return os.environ[name]
|
||||
return matchobj.group(0)
|
||||
return _VAR_REPL.sub(_replacer, path)
|
||||
|
||||
|
||||
def _extend_dict(target_dict, other_dict):
|
||||
target_keys = target_dict.keys()
|
||||
for key, value in other_dict.items():
|
||||
if key in target_keys:
|
||||
continue
|
||||
target_dict[key] = value
|
||||
|
||||
|
||||
def _expand_vars(scheme, vars):
|
||||
res = {}
|
||||
if vars is None:
|
||||
vars = {}
|
||||
_extend_dict(vars, get_config_vars())
|
||||
|
||||
for key, value in _SCHEMES.items(scheme):
|
||||
if os.name in ('posix', 'nt'):
|
||||
value = os.path.expanduser(value)
|
||||
res[key] = os.path.normpath(_subst_vars(value, vars))
|
||||
return res
|
||||
|
||||
|
||||
def format_value(value, vars):
|
||||
def _replacer(matchobj):
|
||||
name = matchobj.group(1)
|
||||
if name in vars:
|
||||
return vars[name]
|
||||
return matchobj.group(0)
|
||||
return _VAR_REPL.sub(_replacer, value)
|
||||
|
||||
|
||||
def _get_default_scheme():
|
||||
if os.name == 'posix':
|
||||
# the default scheme for posix is posix_prefix
|
||||
return 'posix_prefix'
|
||||
return os.name
|
||||
|
||||
|
||||
def _getuserbase():
|
||||
env_base = os.environ.get("PYTHONUSERBASE", None)
|
||||
|
||||
def joinuser(*args):
|
||||
return os.path.expanduser(os.path.join(*args))
|
||||
|
||||
# what about 'os2emx', 'riscos' ?
|
||||
if os.name == "nt":
|
||||
base = os.environ.get("APPDATA") or "~"
|
||||
if env_base:
|
||||
return env_base
|
||||
else:
|
||||
return joinuser(base, "Python")
|
||||
|
||||
if sys.platform == "darwin":
|
||||
framework = get_config_var("PYTHONFRAMEWORK")
|
||||
if framework:
|
||||
if env_base:
|
||||
return env_base
|
||||
else:
|
||||
return joinuser("~", "Library", framework, "%d.%d" %
|
||||
sys.version_info[:2])
|
||||
|
||||
if env_base:
|
||||
return env_base
|
||||
else:
|
||||
return joinuser("~", ".local")
|
||||
|
||||
|
||||
def _parse_makefile(filename, vars=None):
|
||||
"""Parse a Makefile-style file.
|
||||
|
||||
A dictionary containing name/value pairs is returned. If an
|
||||
optional dictionary is passed in as the second argument, it is
|
||||
used instead of a new dictionary.
|
||||
"""
|
||||
# Regexes needed for parsing Makefile (and similar syntaxes,
|
||||
# like old-style Setup files).
|
||||
_variable_rx = re.compile(r"([a-zA-Z][a-zA-Z0-9_]+)\s*=\s*(.*)")
|
||||
_findvar1_rx = re.compile(r"\$\(([A-Za-z][A-Za-z0-9_]*)\)")
|
||||
_findvar2_rx = re.compile(r"\${([A-Za-z][A-Za-z0-9_]*)}")
|
||||
|
||||
if vars is None:
|
||||
vars = {}
|
||||
done = {}
|
||||
notdone = {}
|
||||
|
||||
with codecs.open(filename, encoding='utf-8', errors="surrogateescape") as f:
|
||||
lines = f.readlines()
|
||||
|
||||
for line in lines:
|
||||
if line.startswith('#') or line.strip() == '':
|
||||
continue
|
||||
m = _variable_rx.match(line)
|
||||
if m:
|
||||
n, v = m.group(1, 2)
|
||||
v = v.strip()
|
||||
# `$$' is a literal `$' in make
|
||||
tmpv = v.replace('$$', '')
|
||||
|
||||
if "$" in tmpv:
|
||||
notdone[n] = v
|
||||
else:
|
||||
try:
|
||||
v = int(v)
|
||||
except ValueError:
|
||||
# insert literal `$'
|
||||
done[n] = v.replace('$$', '$')
|
||||
else:
|
||||
done[n] = v
|
||||
|
||||
# do variable interpolation here
|
||||
variables = list(notdone.keys())
|
||||
|
||||
# Variables with a 'PY_' prefix in the makefile. These need to
|
||||
# be made available without that prefix through sysconfig.
|
||||
# Special care is needed to ensure that variable expansion works, even
|
||||
# if the expansion uses the name without a prefix.
|
||||
renamed_variables = ('CFLAGS', 'LDFLAGS', 'CPPFLAGS')
|
||||
|
||||
while len(variables) > 0:
|
||||
for name in tuple(variables):
|
||||
value = notdone[name]
|
||||
m = _findvar1_rx.search(value) or _findvar2_rx.search(value)
|
||||
if m is not None:
|
||||
n = m.group(1)
|
||||
found = True
|
||||
if n in done:
|
||||
item = str(done[n])
|
||||
elif n in notdone:
|
||||
# get it on a subsequent round
|
||||
found = False
|
||||
elif n in os.environ:
|
||||
# do it like make: fall back to environment
|
||||
item = os.environ[n]
|
||||
|
||||
elif n in renamed_variables:
|
||||
if (name.startswith('PY_') and
|
||||
name[3:] in renamed_variables):
|
||||
item = ""
|
||||
|
||||
elif 'PY_' + n in notdone:
|
||||
found = False
|
||||
|
||||
else:
|
||||
item = str(done['PY_' + n])
|
||||
|
||||
else:
|
||||
done[n] = item = ""
|
||||
|
||||
if found:
|
||||
after = value[m.end():]
|
||||
value = value[:m.start()] + item + after
|
||||
if "$" in after:
|
||||
notdone[name] = value
|
||||
else:
|
||||
try:
|
||||
value = int(value)
|
||||
except ValueError:
|
||||
done[name] = value.strip()
|
||||
else:
|
||||
done[name] = value
|
||||
variables.remove(name)
|
||||
|
||||
if (name.startswith('PY_') and
|
||||
name[3:] in renamed_variables):
|
||||
|
||||
name = name[3:]
|
||||
if name not in done:
|
||||
done[name] = value
|
||||
|
||||
else:
|
||||
# bogus variable reference (e.g. "prefix=$/opt/python");
|
||||
# just drop it since we can't deal
|
||||
done[name] = value
|
||||
variables.remove(name)
|
||||
|
||||
# strip spurious spaces
|
||||
for k, v in done.items():
|
||||
if isinstance(v, str):
|
||||
done[k] = v.strip()
|
||||
|
||||
# save the results in the global dictionary
|
||||
vars.update(done)
|
||||
return vars
|
||||
|
||||
|
||||
def get_makefile_filename():
|
||||
"""Return the path of the Makefile."""
|
||||
if _PYTHON_BUILD:
|
||||
return os.path.join(_PROJECT_BASE, "Makefile")
|
||||
if hasattr(sys, 'abiflags'):
|
||||
config_dir_name = 'config-%s%s' % (_PY_VERSION_SHORT, sys.abiflags)
|
||||
else:
|
||||
config_dir_name = 'config'
|
||||
return os.path.join(get_path('stdlib'), config_dir_name, 'Makefile')
|
||||
|
||||
|
||||
def _init_posix(vars):
|
||||
"""Initialize the module as appropriate for POSIX systems."""
|
||||
# load the installed Makefile:
|
||||
makefile = get_makefile_filename()
|
||||
try:
|
||||
_parse_makefile(makefile, vars)
|
||||
except IOError as e:
|
||||
msg = "invalid Python installation: unable to open %s" % makefile
|
||||
if hasattr(e, "strerror"):
|
||||
msg = msg + " (%s)" % e.strerror
|
||||
raise IOError(msg)
|
||||
# load the installed pyconfig.h:
|
||||
config_h = get_config_h_filename()
|
||||
try:
|
||||
with open(config_h) as f:
|
||||
parse_config_h(f, vars)
|
||||
except IOError as e:
|
||||
msg = "invalid Python installation: unable to open %s" % config_h
|
||||
if hasattr(e, "strerror"):
|
||||
msg = msg + " (%s)" % e.strerror
|
||||
raise IOError(msg)
|
||||
# On AIX, there are wrong paths to the linker scripts in the Makefile
|
||||
# -- these paths are relative to the Python source, but when installed
|
||||
# the scripts are in another directory.
|
||||
if _PYTHON_BUILD:
|
||||
vars['LDSHARED'] = vars['BLDSHARED']
|
||||
|
||||
|
||||
def _init_non_posix(vars):
|
||||
"""Initialize the module as appropriate for NT"""
|
||||
# set basic install directories
|
||||
vars['LIBDEST'] = get_path('stdlib')
|
||||
vars['BINLIBDEST'] = get_path('platstdlib')
|
||||
vars['INCLUDEPY'] = get_path('include')
|
||||
vars['SO'] = '.pyd'
|
||||
vars['EXE'] = '.exe'
|
||||
vars['VERSION'] = _PY_VERSION_SHORT_NO_DOT
|
||||
vars['BINDIR'] = os.path.dirname(_safe_realpath(sys.executable))
|
||||
|
||||
#
|
||||
# public APIs
|
||||
#
|
||||
|
||||
|
||||
def parse_config_h(fp, vars=None):
|
||||
"""Parse a config.h-style file.
|
||||
|
||||
A dictionary containing name/value pairs is returned. If an
|
||||
optional dictionary is passed in as the second argument, it is
|
||||
used instead of a new dictionary.
|
||||
"""
|
||||
if vars is None:
|
||||
vars = {}
|
||||
define_rx = re.compile("#define ([A-Z][A-Za-z0-9_]+) (.*)\n")
|
||||
undef_rx = re.compile("/[*] #undef ([A-Z][A-Za-z0-9_]+) [*]/\n")
|
||||
|
||||
while True:
|
||||
line = fp.readline()
|
||||
if not line:
|
||||
break
|
||||
m = define_rx.match(line)
|
||||
if m:
|
||||
n, v = m.group(1, 2)
|
||||
try:
|
||||
v = int(v)
|
||||
except ValueError:
|
||||
pass
|
||||
vars[n] = v
|
||||
else:
|
||||
m = undef_rx.match(line)
|
||||
if m:
|
||||
vars[m.group(1)] = 0
|
||||
return vars
|
||||
|
||||
|
||||
def get_config_h_filename():
|
||||
"""Return the path of pyconfig.h."""
|
||||
if _PYTHON_BUILD:
|
||||
if os.name == "nt":
|
||||
inc_dir = os.path.join(_PROJECT_BASE, "PC")
|
||||
else:
|
||||
inc_dir = _PROJECT_BASE
|
||||
else:
|
||||
inc_dir = get_path('platinclude')
|
||||
return os.path.join(inc_dir, 'pyconfig.h')
|
||||
|
||||
|
||||
def get_scheme_names():
|
||||
"""Return a tuple containing the schemes names."""
|
||||
return tuple(sorted(_SCHEMES.sections()))
|
||||
|
||||
|
||||
def get_path_names():
|
||||
"""Return a tuple containing the paths names."""
|
||||
# xxx see if we want a static list
|
||||
return _SCHEMES.options('posix_prefix')
|
||||
|
||||
|
||||
def get_paths(scheme=_get_default_scheme(), vars=None, expand=True):
|
||||
"""Return a mapping containing an install scheme.
|
||||
|
||||
``scheme`` is the install scheme name. If not provided, it will
|
||||
return the default scheme for the current platform.
|
||||
"""
|
||||
_ensure_cfg_read()
|
||||
if expand:
|
||||
return _expand_vars(scheme, vars)
|
||||
else:
|
||||
return dict(_SCHEMES.items(scheme))
|
||||
|
||||
|
||||
def get_path(name, scheme=_get_default_scheme(), vars=None, expand=True):
|
||||
"""Return a path corresponding to the scheme.
|
||||
|
||||
``scheme`` is the install scheme name.
|
||||
"""
|
||||
return get_paths(scheme, vars, expand)[name]
|
||||
|
||||
|
||||
def get_config_vars(*args):
|
||||
"""With no arguments, return a dictionary of all configuration
|
||||
variables relevant for the current platform.
|
||||
|
||||
On Unix, this means every variable defined in Python's installed Makefile;
|
||||
On Windows and Mac OS it's a much smaller set.
|
||||
|
||||
With arguments, return a list of values that result from looking up
|
||||
each argument in the configuration variable dictionary.
|
||||
"""
|
||||
global _CONFIG_VARS
|
||||
if _CONFIG_VARS is None:
|
||||
_CONFIG_VARS = {}
|
||||
# Normalized versions of prefix and exec_prefix are handy to have;
|
||||
# in fact, these are the standard versions used most places in the
|
||||
# distutils2 module.
|
||||
_CONFIG_VARS['prefix'] = _PREFIX
|
||||
_CONFIG_VARS['exec_prefix'] = _EXEC_PREFIX
|
||||
_CONFIG_VARS['py_version'] = _PY_VERSION
|
||||
_CONFIG_VARS['py_version_short'] = _PY_VERSION_SHORT
|
||||
_CONFIG_VARS['py_version_nodot'] = _PY_VERSION[0] + _PY_VERSION[2]
|
||||
_CONFIG_VARS['base'] = _PREFIX
|
||||
_CONFIG_VARS['platbase'] = _EXEC_PREFIX
|
||||
_CONFIG_VARS['projectbase'] = _PROJECT_BASE
|
||||
try:
|
||||
_CONFIG_VARS['abiflags'] = sys.abiflags
|
||||
except AttributeError:
|
||||
# sys.abiflags may not be defined on all platforms.
|
||||
_CONFIG_VARS['abiflags'] = ''
|
||||
|
||||
if os.name in ('nt', 'os2'):
|
||||
_init_non_posix(_CONFIG_VARS)
|
||||
if os.name == 'posix':
|
||||
_init_posix(_CONFIG_VARS)
|
||||
# Setting 'userbase' is done below the call to the
|
||||
# init function to enable using 'get_config_var' in
|
||||
# the init-function.
|
||||
if sys.version >= '2.6':
|
||||
_CONFIG_VARS['userbase'] = _getuserbase()
|
||||
|
||||
if 'srcdir' not in _CONFIG_VARS:
|
||||
_CONFIG_VARS['srcdir'] = _PROJECT_BASE
|
||||
else:
|
||||
_CONFIG_VARS['srcdir'] = _safe_realpath(_CONFIG_VARS['srcdir'])
|
||||
|
||||
# Convert srcdir into an absolute path if it appears necessary.
|
||||
# Normally it is relative to the build directory. However, during
|
||||
# testing, for example, we might be running a non-installed python
|
||||
# from a different directory.
|
||||
if _PYTHON_BUILD and os.name == "posix":
|
||||
base = _PROJECT_BASE
|
||||
try:
|
||||
cwd = os.getcwd()
|
||||
except OSError:
|
||||
cwd = None
|
||||
if (not os.path.isabs(_CONFIG_VARS['srcdir']) and
|
||||
base != cwd):
|
||||
# srcdir is relative and we are not in the same directory
|
||||
# as the executable. Assume executable is in the build
|
||||
# directory and make srcdir absolute.
|
||||
srcdir = os.path.join(base, _CONFIG_VARS['srcdir'])
|
||||
_CONFIG_VARS['srcdir'] = os.path.normpath(srcdir)
|
||||
|
||||
if sys.platform == 'darwin':
|
||||
kernel_version = os.uname()[2] # Kernel version (8.4.3)
|
||||
major_version = int(kernel_version.split('.')[0])
|
||||
|
||||
if major_version < 8:
|
||||
# On Mac OS X before 10.4, check if -arch and -isysroot
|
||||
# are in CFLAGS or LDFLAGS and remove them if they are.
|
||||
# This is needed when building extensions on a 10.3 system
|
||||
# using a universal build of python.
|
||||
for key in ('LDFLAGS', 'BASECFLAGS',
|
||||
# a number of derived variables. These need to be
|
||||
# patched up as well.
|
||||
'CFLAGS', 'PY_CFLAGS', 'BLDSHARED'):
|
||||
flags = _CONFIG_VARS[key]
|
||||
flags = re.sub(r'-arch\s+\w+\s', ' ', flags)
|
||||
flags = re.sub('-isysroot [^ \t]*', ' ', flags)
|
||||
_CONFIG_VARS[key] = flags
|
||||
else:
|
||||
# Allow the user to override the architecture flags using
|
||||
# an environment variable.
|
||||
# NOTE: This name was introduced by Apple in OSX 10.5 and
|
||||
# is used by several scripting languages distributed with
|
||||
# that OS release.
|
||||
if 'ARCHFLAGS' in os.environ:
|
||||
arch = os.environ['ARCHFLAGS']
|
||||
for key in ('LDFLAGS', 'BASECFLAGS',
|
||||
# a number of derived variables. These need to be
|
||||
# patched up as well.
|
||||
'CFLAGS', 'PY_CFLAGS', 'BLDSHARED'):
|
||||
|
||||
flags = _CONFIG_VARS[key]
|
||||
flags = re.sub(r'-arch\s+\w+\s', ' ', flags)
|
||||
flags = flags + ' ' + arch
|
||||
_CONFIG_VARS[key] = flags
|
||||
|
||||
# If we're on OSX 10.5 or later and the user tries to
|
||||
# compiles an extension using an SDK that is not present
|
||||
# on the current machine it is better to not use an SDK
|
||||
# than to fail.
|
||||
#
|
||||
# The major usecase for this is users using a Python.org
|
||||
# binary installer on OSX 10.6: that installer uses
|
||||
# the 10.4u SDK, but that SDK is not installed by default
|
||||
# when you install Xcode.
|
||||
#
|
||||
CFLAGS = _CONFIG_VARS.get('CFLAGS', '')
|
||||
m = re.search(r'-isysroot\s+(\S+)', CFLAGS)
|
||||
if m is not None:
|
||||
sdk = m.group(1)
|
||||
if not os.path.exists(sdk):
|
||||
for key in ('LDFLAGS', 'BASECFLAGS',
|
||||
# a number of derived variables. These need to be
|
||||
# patched up as well.
|
||||
'CFLAGS', 'PY_CFLAGS', 'BLDSHARED'):
|
||||
|
||||
flags = _CONFIG_VARS[key]
|
||||
flags = re.sub(r'-isysroot\s+\S+(\s|$)', ' ', flags)
|
||||
_CONFIG_VARS[key] = flags
|
||||
|
||||
if args:
|
||||
vals = []
|
||||
for name in args:
|
||||
vals.append(_CONFIG_VARS.get(name))
|
||||
return vals
|
||||
else:
|
||||
return _CONFIG_VARS
|
||||
|
||||
|
||||
def get_config_var(name):
|
||||
"""Return the value of a single variable using the dictionary returned by
|
||||
'get_config_vars()'.
|
||||
|
||||
Equivalent to get_config_vars().get(name)
|
||||
"""
|
||||
return get_config_vars().get(name)
|
||||
|
||||
|
||||
def get_platform():
|
||||
"""Return a string that identifies the current platform.
|
||||
|
||||
This is used mainly to distinguish platform-specific build directories and
|
||||
platform-specific built distributions. Typically includes the OS name
|
||||
and version and the architecture (as supplied by 'os.uname()'),
|
||||
although the exact information included depends on the OS; eg. for IRIX
|
||||
the architecture isn't particularly important (IRIX only runs on SGI
|
||||
hardware), but for Linux the kernel version isn't particularly
|
||||
important.
|
||||
|
||||
Examples of returned values:
|
||||
linux-i586
|
||||
linux-alpha (?)
|
||||
solaris-2.6-sun4u
|
||||
irix-5.3
|
||||
irix64-6.2
|
||||
|
||||
Windows will return one of:
|
||||
win-amd64 (64bit Windows on AMD64 (aka x86_64, Intel64, EM64T, etc)
|
||||
win-ia64 (64bit Windows on Itanium)
|
||||
win32 (all others - specifically, sys.platform is returned)
|
||||
|
||||
For other non-POSIX platforms, currently just returns 'sys.platform'.
|
||||
"""
|
||||
if os.name == 'nt':
|
||||
# sniff sys.version for architecture.
|
||||
prefix = " bit ("
|
||||
i = sys.version.find(prefix)
|
||||
if i == -1:
|
||||
return sys.platform
|
||||
j = sys.version.find(")", i)
|
||||
look = sys.version[i+len(prefix):j].lower()
|
||||
if look == 'amd64':
|
||||
return 'win-amd64'
|
||||
if look == 'itanium':
|
||||
return 'win-ia64'
|
||||
return sys.platform
|
||||
|
||||
if os.name != "posix" or not hasattr(os, 'uname'):
|
||||
# XXX what about the architecture? NT is Intel or Alpha,
|
||||
# Mac OS is M68k or PPC, etc.
|
||||
return sys.platform
|
||||
|
||||
# Try to distinguish various flavours of Unix
|
||||
osname, host, release, version, machine = os.uname()
|
||||
|
||||
# Convert the OS name to lowercase, remove '/' characters
|
||||
# (to accommodate BSD/OS), and translate spaces (for "Power Macintosh")
|
||||
osname = osname.lower().replace('/', '')
|
||||
machine = machine.replace(' ', '_')
|
||||
machine = machine.replace('/', '-')
|
||||
|
||||
if osname[:5] == "linux":
|
||||
# At least on Linux/Intel, 'machine' is the processor --
|
||||
# i386, etc.
|
||||
# XXX what about Alpha, SPARC, etc?
|
||||
return "%s-%s" % (osname, machine)
|
||||
elif osname[:5] == "sunos":
|
||||
if release[0] >= "5": # SunOS 5 == Solaris 2
|
||||
osname = "solaris"
|
||||
release = "%d.%s" % (int(release[0]) - 3, release[2:])
|
||||
# fall through to standard osname-release-machine representation
|
||||
elif osname[:4] == "irix": # could be "irix64"!
|
||||
return "%s-%s" % (osname, release)
|
||||
elif osname[:3] == "aix":
|
||||
return "%s-%s.%s" % (osname, version, release)
|
||||
elif osname[:6] == "cygwin":
|
||||
osname = "cygwin"
|
||||
rel_re = re.compile(r'[\d.]+')
|
||||
m = rel_re.match(release)
|
||||
if m:
|
||||
release = m.group()
|
||||
elif osname[:6] == "darwin":
|
||||
#
|
||||
# For our purposes, we'll assume that the system version from
|
||||
# distutils' perspective is what MACOSX_DEPLOYMENT_TARGET is set
|
||||
# to. This makes the compatibility story a bit more sane because the
|
||||
# machine is going to compile and link as if it were
|
||||
# MACOSX_DEPLOYMENT_TARGET.
|
||||
cfgvars = get_config_vars()
|
||||
macver = cfgvars.get('MACOSX_DEPLOYMENT_TARGET')
|
||||
|
||||
if True:
|
||||
# Always calculate the release of the running machine,
|
||||
# needed to determine if we can build fat binaries or not.
|
||||
|
||||
macrelease = macver
|
||||
# Get the system version. Reading this plist is a documented
|
||||
# way to get the system version (see the documentation for
|
||||
# the Gestalt Manager)
|
||||
try:
|
||||
f = open('/System/Library/CoreServices/SystemVersion.plist')
|
||||
except IOError:
|
||||
# We're on a plain darwin box, fall back to the default
|
||||
# behaviour.
|
||||
pass
|
||||
else:
|
||||
try:
|
||||
m = re.search(r'<key>ProductUserVisibleVersion</key>\s*'
|
||||
r'<string>(.*?)</string>', f.read())
|
||||
finally:
|
||||
f.close()
|
||||
if m is not None:
|
||||
macrelease = '.'.join(m.group(1).split('.')[:2])
|
||||
# else: fall back to the default behaviour
|
||||
|
||||
if not macver:
|
||||
macver = macrelease
|
||||
|
||||
if macver:
|
||||
release = macver
|
||||
osname = "macosx"
|
||||
|
||||
if ((macrelease + '.') >= '10.4.' and
|
||||
'-arch' in get_config_vars().get('CFLAGS', '').strip()):
|
||||
# The universal build will build fat binaries, but not on
|
||||
# systems before 10.4
|
||||
#
|
||||
# Try to detect 4-way universal builds, those have machine-type
|
||||
# 'universal' instead of 'fat'.
|
||||
|
||||
machine = 'fat'
|
||||
cflags = get_config_vars().get('CFLAGS')
|
||||
|
||||
archs = re.findall(r'-arch\s+(\S+)', cflags)
|
||||
archs = tuple(sorted(set(archs)))
|
||||
|
||||
if len(archs) == 1:
|
||||
machine = archs[0]
|
||||
elif archs == ('i386', 'ppc'):
|
||||
machine = 'fat'
|
||||
elif archs == ('i386', 'x86_64'):
|
||||
machine = 'intel'
|
||||
elif archs == ('i386', 'ppc', 'x86_64'):
|
||||
machine = 'fat3'
|
||||
elif archs == ('ppc64', 'x86_64'):
|
||||
machine = 'fat64'
|
||||
elif archs == ('i386', 'ppc', 'ppc64', 'x86_64'):
|
||||
machine = 'universal'
|
||||
else:
|
||||
raise ValueError(
|
||||
"Don't know machine value for archs=%r" % (archs,))
|
||||
|
||||
elif machine == 'i386':
|
||||
# On OSX the machine type returned by uname is always the
|
||||
# 32-bit variant, even if the executable architecture is
|
||||
# the 64-bit variant
|
||||
if sys.maxsize >= 2**32:
|
||||
machine = 'x86_64'
|
||||
|
||||
elif machine in ('PowerPC', 'Power_Macintosh'):
|
||||
# Pick a sane name for the PPC architecture.
|
||||
# See 'i386' case
|
||||
if sys.maxsize >= 2**32:
|
||||
machine = 'ppc64'
|
||||
else:
|
||||
machine = 'ppc'
|
||||
|
||||
return "%s-%s-%s" % (osname, release, machine)
|
||||
|
||||
|
||||
def get_python_version():
|
||||
return _PY_VERSION_SHORT
|
||||
|
||||
|
||||
def _print_dict(title, data):
|
||||
for index, (key, value) in enumerate(sorted(data.items())):
|
||||
if index == 0:
|
||||
print('%s: ' % (title))
|
||||
print('\t%s = "%s"' % (key, value))
|
||||
|
||||
|
||||
def _main():
|
||||
"""Display all information sysconfig detains."""
|
||||
print('Platform: "%s"' % get_platform())
|
||||
print('Python version: "%s"' % get_python_version())
|
||||
print('Current installation scheme: "%s"' % _get_default_scheme())
|
||||
print()
|
||||
_print_dict('Paths', get_paths())
|
||||
print()
|
||||
_print_dict('Variables', get_config_vars())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
_main()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -22,7 +22,6 @@ if sys.version_info[0] < 3: # pragma: no cover
|
||||
from types import FileType as file_type
|
||||
import __builtin__ as builtins
|
||||
import ConfigParser as configparser
|
||||
from ._backport import shutil
|
||||
from urlparse import urlparse, urlunparse, urljoin, urlsplit, urlunsplit
|
||||
from urllib import (urlretrieve, quote as _quote, unquote, url2pathname,
|
||||
pathname2url, ContentTooShortError, splittype)
|
||||
@@ -48,17 +47,18 @@ if sys.version_info[0] < 3: # pragma: no cover
|
||||
from itertools import ifilter as filter
|
||||
from itertools import ifilterfalse as filterfalse
|
||||
|
||||
_userprog = None
|
||||
def splituser(host):
|
||||
"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
|
||||
global _userprog
|
||||
if _userprog is None:
|
||||
import re
|
||||
_userprog = re.compile('^(.*)@(.*)$')
|
||||
# Leaving this around for now, in case it needs resurrecting in some way
|
||||
# _userprog = None
|
||||
# def splituser(host):
|
||||
# """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
|
||||
# global _userprog
|
||||
# if _userprog is None:
|
||||
# import re
|
||||
# _userprog = re.compile('^(.*)@(.*)$')
|
||||
|
||||
match = _userprog.match(host)
|
||||
if match: return match.group(1, 2)
|
||||
return None, host
|
||||
# match = _userprog.match(host)
|
||||
# if match: return match.group(1, 2)
|
||||
# return None, host
|
||||
|
||||
else: # pragma: no cover
|
||||
from io import StringIO
|
||||
@@ -68,7 +68,7 @@ else: # pragma: no cover
|
||||
import builtins
|
||||
import configparser
|
||||
import shutil
|
||||
from urllib.parse import (urlparse, urlunparse, urljoin, splituser, quote,
|
||||
from urllib.parse import (urlparse, urlunparse, urljoin, quote,
|
||||
unquote, urlsplit, urlunsplit, splittype)
|
||||
from urllib.request import (urlopen, urlretrieve, Request, url2pathname,
|
||||
pathname2url,
|
||||
@@ -88,6 +88,7 @@ else: # pragma: no cover
|
||||
from itertools import filterfalse
|
||||
filter = filter
|
||||
|
||||
|
||||
try:
|
||||
from ssl import match_hostname, CertificateError
|
||||
except ImportError: # pragma: no cover
|
||||
@@ -311,10 +312,8 @@ except ImportError: # pragma: no cover
|
||||
return 'IronPython'
|
||||
return 'CPython'
|
||||
|
||||
try:
|
||||
import sysconfig
|
||||
except ImportError: # pragma: no cover
|
||||
from ._backport import sysconfig
|
||||
import shutil
|
||||
import sysconfig
|
||||
|
||||
try:
|
||||
callable = callable
|
||||
@@ -616,18 +615,15 @@ except ImportError: # pragma: no cover
|
||||
try:
|
||||
from importlib.util import cache_from_source # Python >= 3.4
|
||||
except ImportError: # pragma: no cover
|
||||
try:
|
||||
from imp import cache_from_source
|
||||
except ImportError: # pragma: no cover
|
||||
def cache_from_source(path, debug_override=None):
|
||||
assert path.endswith('.py')
|
||||
if debug_override is None:
|
||||
debug_override = __debug__
|
||||
if debug_override:
|
||||
suffix = 'c'
|
||||
else:
|
||||
suffix = 'o'
|
||||
return path + suffix
|
||||
def cache_from_source(path, debug_override=None):
|
||||
assert path.endswith('.py')
|
||||
if debug_override is None:
|
||||
debug_override = __debug__
|
||||
if debug_override:
|
||||
suffix = 'c'
|
||||
else:
|
||||
suffix = 'o'
|
||||
return path + suffix
|
||||
|
||||
try:
|
||||
from collections import OrderedDict
|
||||
|
||||
@@ -132,29 +132,35 @@ class DistributionPath(object):
|
||||
r = finder.find(entry)
|
||||
if not r or r.path in seen:
|
||||
continue
|
||||
if self._include_dist and entry.endswith(DISTINFO_EXT):
|
||||
possible_filenames = [METADATA_FILENAME,
|
||||
WHEEL_METADATA_FILENAME,
|
||||
LEGACY_METADATA_FILENAME]
|
||||
for metadata_filename in possible_filenames:
|
||||
metadata_path = posixpath.join(entry, metadata_filename)
|
||||
pydist = finder.find(metadata_path)
|
||||
if pydist:
|
||||
break
|
||||
else:
|
||||
continue
|
||||
try:
|
||||
if self._include_dist and entry.endswith(DISTINFO_EXT):
|
||||
possible_filenames = [METADATA_FILENAME,
|
||||
WHEEL_METADATA_FILENAME,
|
||||
LEGACY_METADATA_FILENAME]
|
||||
for metadata_filename in possible_filenames:
|
||||
metadata_path = posixpath.join(entry, metadata_filename)
|
||||
pydist = finder.find(metadata_path)
|
||||
if pydist:
|
||||
break
|
||||
else:
|
||||
continue
|
||||
|
||||
with contextlib.closing(pydist.as_stream()) as stream:
|
||||
metadata = Metadata(fileobj=stream, scheme='legacy')
|
||||
logger.debug('Found %s', r.path)
|
||||
seen.add(r.path)
|
||||
yield new_dist_class(r.path, metadata=metadata,
|
||||
env=self)
|
||||
elif self._include_egg and entry.endswith(('.egg-info',
|
||||
'.egg')):
|
||||
logger.debug('Found %s', r.path)
|
||||
seen.add(r.path)
|
||||
yield old_dist_class(r.path, self)
|
||||
with contextlib.closing(pydist.as_stream()) as stream:
|
||||
metadata = Metadata(fileobj=stream, scheme='legacy')
|
||||
logger.debug('Found %s', r.path)
|
||||
seen.add(r.path)
|
||||
yield new_dist_class(r.path, metadata=metadata,
|
||||
env=self)
|
||||
elif self._include_egg and entry.endswith(('.egg-info',
|
||||
'.egg')):
|
||||
logger.debug('Found %s', r.path)
|
||||
seen.add(r.path)
|
||||
yield old_dist_class(r.path, self)
|
||||
except Exception as e:
|
||||
msg = 'Unable to read distribution at %s, perhaps due to bad metadata: %s'
|
||||
logger.warning(msg, r.path, e)
|
||||
import warnings
|
||||
warnings.warn(msg % (r.path, e), stacklevel=2)
|
||||
|
||||
def _generate_cache(self):
|
||||
"""
|
||||
@@ -379,8 +385,9 @@ class Distribution(object):
|
||||
|
||||
def _get_requirements(self, req_attr):
|
||||
md = self.metadata
|
||||
logger.debug('Getting requirements from metadata %r', md.todict())
|
||||
reqts = getattr(md, req_attr)
|
||||
logger.debug('%s: got requirements %r from metadata: %r', self.name, req_attr,
|
||||
reqts)
|
||||
return set(md.get_requirements(reqts, extras=self.extras,
|
||||
env=self.context))
|
||||
|
||||
@@ -1308,22 +1315,26 @@ def get_required_dists(dists, dist):
|
||||
|
||||
:param dists: a list of distributions
|
||||
:param dist: a distribution, member of *dists* for which we are interested
|
||||
in finding the dependencies.
|
||||
"""
|
||||
if dist not in dists:
|
||||
raise DistlibException('given distribution %r is not a member '
|
||||
'of the list' % dist.name)
|
||||
graph = make_graph(dists)
|
||||
|
||||
req = [] # required distributions
|
||||
req = set() # required distributions
|
||||
todo = graph.adjacency_list[dist] # list of nodes we should inspect
|
||||
seen = set(t[0] for t in todo) # already added to todo
|
||||
|
||||
while todo:
|
||||
d = todo.pop()[0]
|
||||
req.append(d)
|
||||
for pred in graph.adjacency_list[d]:
|
||||
if pred not in req:
|
||||
req.add(d)
|
||||
pred_list = graph.adjacency_list[d]
|
||||
for pred in pred_list:
|
||||
d = pred[0]
|
||||
if d not in req and d not in seen:
|
||||
seen.add(d)
|
||||
todo.append(pred)
|
||||
|
||||
return req
|
||||
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ import subprocess
|
||||
import tempfile
|
||||
try:
|
||||
from threading import Thread
|
||||
except ImportError:
|
||||
except ImportError: # pragma: no cover
|
||||
from dummy_threading import Thread
|
||||
|
||||
from . import DistlibException
|
||||
@@ -104,7 +104,7 @@ class PackageIndex(object):
|
||||
pm.add_password(self.realm, netloc, self.username, self.password)
|
||||
self.password_handler = HTTPBasicAuthHandler(pm)
|
||||
|
||||
def register(self, metadata):
|
||||
def register(self, metadata): # pragma: no cover
|
||||
"""
|
||||
Register a distribution on PyPI, using the provided metadata.
|
||||
|
||||
@@ -142,8 +142,7 @@ class PackageIndex(object):
|
||||
logger.debug('%s: %s' % (name, s))
|
||||
stream.close()
|
||||
|
||||
def get_sign_command(self, filename, signer, sign_password,
|
||||
keystore=None):
|
||||
def get_sign_command(self, filename, signer, sign_password, keystore=None): # pragma: no cover
|
||||
"""
|
||||
Return a suitable command for signing a file.
|
||||
|
||||
@@ -206,7 +205,7 @@ class PackageIndex(object):
|
||||
t2.join()
|
||||
return p.returncode, stdout, stderr
|
||||
|
||||
def sign_file(self, filename, signer, sign_password, keystore=None):
|
||||
def sign_file(self, filename, signer, sign_password, keystore=None): # pragma: no cover
|
||||
"""
|
||||
Sign a file.
|
||||
|
||||
@@ -286,7 +285,7 @@ class PackageIndex(object):
|
||||
request = self.encode_request(d.items(), files)
|
||||
return self.send_request(request)
|
||||
|
||||
def upload_documentation(self, metadata, doc_dir):
|
||||
def upload_documentation(self, metadata, doc_dir): # pragma: no cover
|
||||
"""
|
||||
Upload documentation to the index.
|
||||
|
||||
@@ -499,7 +498,7 @@ class PackageIndex(object):
|
||||
}
|
||||
return Request(self.url, body, headers)
|
||||
|
||||
def search(self, terms, operator=None):
|
||||
def search(self, terms, operator=None): # pragma: no cover
|
||||
if isinstance(terms, string_types):
|
||||
terms = {'name': terms}
|
||||
rpc_proxy = ServerProxy(self.url, timeout=3.0)
|
||||
|
||||
@@ -633,7 +633,7 @@ class SimpleScrapingLocator(Locator):
|
||||
self._threads = []
|
||||
for i in range(self.num_workers):
|
||||
t = threading.Thread(target=self._fetch)
|
||||
t.setDaemon(True)
|
||||
t.daemon = True
|
||||
t.start()
|
||||
self._threads.append(t)
|
||||
|
||||
@@ -1053,9 +1053,9 @@ class AggregatingLocator(Locator):
|
||||
|
||||
|
||||
# We use a legacy scheme simply because most of the dists on PyPI use legacy
|
||||
# versions which don't conform to PEP 426 / PEP 440.
|
||||
# versions which don't conform to PEP 440.
|
||||
default_locator = AggregatingLocator(
|
||||
JSONLocator(),
|
||||
# JSONLocator(), # don't use as PEP 426 is withdrawn
|
||||
SimpleScrapingLocator('https://pypi.org/simple/',
|
||||
timeout=3.0),
|
||||
scheme='legacy')
|
||||
|
||||
@@ -13,19 +13,29 @@ Parser for the environment markers micro-language defined in PEP 508.
|
||||
# as ~= and === which aren't in Python, necessitating a different approach.
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import platform
|
||||
|
||||
from .compat import string_types
|
||||
from .util import in_venv, parse_marker
|
||||
from .version import NormalizedVersion as NV
|
||||
|
||||
__all__ = ['interpret']
|
||||
|
||||
_VERSION_PATTERN = re.compile(r'((\d+(\.\d+)*\w*)|\'(\d+(\.\d+)*\w*)\'|\"(\d+(\.\d+)*\w*)\")')
|
||||
|
||||
def _is_literal(o):
|
||||
if not isinstance(o, string_types) or not o:
|
||||
return False
|
||||
return o[0] in '\'"'
|
||||
|
||||
def _get_versions(s):
|
||||
result = []
|
||||
for m in _VERSION_PATTERN.finditer(s):
|
||||
result.append(NV(m.groups()[0]))
|
||||
return set(result)
|
||||
|
||||
class Evaluator(object):
|
||||
"""
|
||||
This class is used to evaluate marker expessions.
|
||||
@@ -70,9 +80,18 @@ class Evaluator(object):
|
||||
|
||||
lhs = self.evaluate(elhs, context)
|
||||
rhs = self.evaluate(erhs, context)
|
||||
if ((elhs == 'python_version' or erhs == 'python_version') and
|
||||
op in ('<', '<=', '>', '>=', '===', '==', '!=', '~=')):
|
||||
lhs = NV(lhs)
|
||||
rhs = NV(rhs)
|
||||
elif elhs == 'python_version' and op in ('in', 'not in'):
|
||||
lhs = NV(lhs)
|
||||
rhs = _get_versions(rhs)
|
||||
result = self.operations[op](lhs, rhs)
|
||||
return result
|
||||
|
||||
_DIGITS = re.compile(r'\d+\.\d+')
|
||||
|
||||
def default_context():
|
||||
def format_full_version(info):
|
||||
version = '%s.%s.%s' % (info.major, info.minor, info.micro)
|
||||
@@ -88,6 +107,9 @@ def default_context():
|
||||
implementation_version = '0'
|
||||
implementation_name = ''
|
||||
|
||||
ppv = platform.python_version()
|
||||
m = _DIGITS.match(ppv)
|
||||
pv = m.group(0)
|
||||
result = {
|
||||
'implementation_name': implementation_name,
|
||||
'implementation_version': implementation_version,
|
||||
@@ -98,8 +120,8 @@ def default_context():
|
||||
'platform_system': platform.system(),
|
||||
'platform_version': platform.version(),
|
||||
'platform_in_venv': str(in_venv()),
|
||||
'python_full_version': platform.python_version(),
|
||||
'python_version': platform.python_version()[:3],
|
||||
'python_full_version': ppv,
|
||||
'python_version': pv,
|
||||
'sys_platform': sys.platform,
|
||||
}
|
||||
return result
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#
|
||||
"""Implementation of the Metadata for Python packages PEPs.
|
||||
|
||||
Supports all metadata formats (1.0, 1.1, 1.2, 1.3/2.1 and withdrawn 2.0).
|
||||
Supports all metadata formats (1.0, 1.1, 1.2, 1.3/2.1 and 2.2).
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
@@ -100,12 +100,17 @@ _566_FIELDS = _426_FIELDS + ('Description-Content-Type',
|
||||
|
||||
_566_MARKERS = ('Description-Content-Type',)
|
||||
|
||||
_643_MARKERS = ('Dynamic', 'License-File')
|
||||
|
||||
_643_FIELDS = _566_FIELDS + _643_MARKERS
|
||||
|
||||
_ALL_FIELDS = set()
|
||||
_ALL_FIELDS.update(_241_FIELDS)
|
||||
_ALL_FIELDS.update(_314_FIELDS)
|
||||
_ALL_FIELDS.update(_345_FIELDS)
|
||||
_ALL_FIELDS.update(_426_FIELDS)
|
||||
_ALL_FIELDS.update(_566_FIELDS)
|
||||
_ALL_FIELDS.update(_643_FIELDS)
|
||||
|
||||
EXTRA_RE = re.compile(r'''extra\s*==\s*("([^"]+)"|'([^']+)')''')
|
||||
|
||||
@@ -121,7 +126,10 @@ def _version2fieldlist(version):
|
||||
# avoid adding field names if already there
|
||||
return _345_FIELDS + tuple(f for f in _566_FIELDS if f not in _345_FIELDS)
|
||||
elif version == '2.0':
|
||||
return _426_FIELDS
|
||||
raise ValueError('Metadata 2.0 is withdrawn and not supported')
|
||||
# return _426_FIELDS
|
||||
elif version == '2.2':
|
||||
return _643_FIELDS
|
||||
raise MetadataUnrecognizedVersionError(version)
|
||||
|
||||
|
||||
@@ -139,7 +147,7 @@ def _best_version(fields):
|
||||
continue
|
||||
keys.append(key)
|
||||
|
||||
possible_versions = ['1.0', '1.1', '1.2', '1.3', '2.0', '2.1']
|
||||
possible_versions = ['1.0', '1.1', '1.2', '1.3', '2.1', '2.2'] # 2.0 removed
|
||||
|
||||
# first let's try to see if a field is not part of one of the version
|
||||
for key in keys:
|
||||
@@ -159,9 +167,12 @@ def _best_version(fields):
|
||||
if key != 'Description': # In 2.1, description allowed after headers
|
||||
possible_versions.remove('2.1')
|
||||
logger.debug('Removed 2.1 due to %s', key)
|
||||
if key not in _426_FIELDS and '2.0' in possible_versions:
|
||||
possible_versions.remove('2.0')
|
||||
logger.debug('Removed 2.0 due to %s', key)
|
||||
if key not in _643_FIELDS and '2.2' in possible_versions:
|
||||
possible_versions.remove('2.2')
|
||||
logger.debug('Removed 2.2 due to %s', key)
|
||||
# if key not in _426_FIELDS and '2.0' in possible_versions:
|
||||
# possible_versions.remove('2.0')
|
||||
# logger.debug('Removed 2.0 due to %s', key)
|
||||
|
||||
# possible_version contains qualified versions
|
||||
if len(possible_versions) == 1:
|
||||
@@ -174,16 +185,18 @@ def _best_version(fields):
|
||||
is_1_1 = '1.1' in possible_versions and _has_marker(keys, _314_MARKERS)
|
||||
is_1_2 = '1.2' in possible_versions and _has_marker(keys, _345_MARKERS)
|
||||
is_2_1 = '2.1' in possible_versions and _has_marker(keys, _566_MARKERS)
|
||||
is_2_0 = '2.0' in possible_versions and _has_marker(keys, _426_MARKERS)
|
||||
if int(is_1_1) + int(is_1_2) + int(is_2_1) + int(is_2_0) > 1:
|
||||
raise MetadataConflictError('You used incompatible 1.1/1.2/2.0/2.1 fields')
|
||||
# is_2_0 = '2.0' in possible_versions and _has_marker(keys, _426_MARKERS)
|
||||
is_2_2 = '2.2' in possible_versions and _has_marker(keys, _643_MARKERS)
|
||||
if int(is_1_1) + int(is_1_2) + int(is_2_1) + int(is_2_2) > 1:
|
||||
raise MetadataConflictError('You used incompatible 1.1/1.2/2.1/2.2 fields')
|
||||
|
||||
# we have the choice, 1.0, or 1.2, or 2.0
|
||||
# we have the choice, 1.0, or 1.2, 2.1 or 2.2
|
||||
# - 1.0 has a broken Summary field but works with all tools
|
||||
# - 1.1 is to avoid
|
||||
# - 1.2 fixes Summary but has little adoption
|
||||
# - 2.0 adds more features and is very new
|
||||
if not is_1_1 and not is_1_2 and not is_2_1 and not is_2_0:
|
||||
# - 2.1 adds more features
|
||||
# - 2.2 is the latest
|
||||
if not is_1_1 and not is_1_2 and not is_2_1 and not is_2_2:
|
||||
# we couldn't find any specific marker
|
||||
if PKG_INFO_PREFERRED_VERSION in possible_versions:
|
||||
return PKG_INFO_PREFERRED_VERSION
|
||||
@@ -193,8 +206,10 @@ def _best_version(fields):
|
||||
return '1.2'
|
||||
if is_2_1:
|
||||
return '2.1'
|
||||
# if is_2_2:
|
||||
# return '2.2'
|
||||
|
||||
return '2.0'
|
||||
return '2.2'
|
||||
|
||||
# This follows the rules about transforming keys as described in
|
||||
# https://www.python.org/dev/peps/pep-0566/#id17
|
||||
@@ -210,7 +225,7 @@ _LISTFIELDS = ('Platform', 'Classifier', 'Obsoletes',
|
||||
'Requires', 'Provides', 'Obsoletes-Dist',
|
||||
'Provides-Dist', 'Requires-Dist', 'Requires-External',
|
||||
'Project-URL', 'Supported-Platform', 'Setup-Requires-Dist',
|
||||
'Provides-Extra', 'Extension')
|
||||
'Provides-Extra', 'Extension', 'License-File')
|
||||
_LISTTUPLEFIELDS = ('Project-URL',)
|
||||
|
||||
_ELEMENTSFIELD = ('Keywords',)
|
||||
@@ -602,7 +617,7 @@ LEGACY_METADATA_FILENAME = 'METADATA'
|
||||
|
||||
class Metadata(object):
|
||||
"""
|
||||
The metadata of a release. This implementation uses 2.0 (JSON)
|
||||
The metadata of a release. This implementation uses 2.1
|
||||
metadata where possible. If not possible, it wraps a LegacyMetadata
|
||||
instance which handles the key-value metadata format.
|
||||
"""
|
||||
@@ -611,6 +626,8 @@ class Metadata(object):
|
||||
|
||||
NAME_MATCHER = re.compile('^[0-9A-Z]([0-9A-Z_.-]*[0-9A-Z])?$', re.I)
|
||||
|
||||
FIELDNAME_MATCHER = re.compile('^[A-Z]([0-9A-Z-]*[0-9A-Z])?$', re.I)
|
||||
|
||||
VERSION_MATCHER = PEP440_VERSION_RE
|
||||
|
||||
SUMMARY_MATCHER = re.compile('.{1,2047}')
|
||||
@@ -638,6 +655,7 @@ class Metadata(object):
|
||||
'name': (NAME_MATCHER, ('legacy',)),
|
||||
'version': (VERSION_MATCHER, ('legacy',)),
|
||||
'summary': (SUMMARY_MATCHER, ('legacy',)),
|
||||
'dynamic': (FIELDNAME_MATCHER, ('legacy',)),
|
||||
}
|
||||
|
||||
__slots__ = ('_legacy', '_data', 'scheme')
|
||||
|
||||
@@ -10,11 +10,13 @@ import os
|
||||
import re
|
||||
import struct
|
||||
import sys
|
||||
import time
|
||||
from zipfile import ZipInfo
|
||||
|
||||
from .compat import sysconfig, detect_encoding, ZipFile
|
||||
from .resources import finder
|
||||
from .util import (FileOperator, get_export_entry, convert_path,
|
||||
get_executable, in_venv)
|
||||
get_executable, get_platform, in_venv)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -170,6 +172,11 @@ class ScriptMaker(object):
|
||||
sysconfig.get_config_var('BINDIR'),
|
||||
'python%s%s' % (sysconfig.get_config_var('VERSION'),
|
||||
sysconfig.get_config_var('EXE')))
|
||||
if not os.path.isfile(executable):
|
||||
# for Python builds from source on Windows, no Python executables with
|
||||
# a version suffix are created, so we use python.exe
|
||||
executable = os.path.join(sysconfig.get_config_var('BINDIR'),
|
||||
'python%s' % (sysconfig.get_config_var('EXE')))
|
||||
if options:
|
||||
executable = self._get_alternate_executable(executable, options)
|
||||
|
||||
@@ -244,7 +251,13 @@ class ScriptMaker(object):
|
||||
launcher = self._get_launcher('w')
|
||||
stream = BytesIO()
|
||||
with ZipFile(stream, 'w') as zf:
|
||||
zf.writestr('__main__.py', script_bytes)
|
||||
source_date_epoch = os.environ.get('SOURCE_DATE_EPOCH')
|
||||
if source_date_epoch:
|
||||
date_time = time.gmtime(int(source_date_epoch))[:6]
|
||||
zinfo = ZipInfo(filename='__main__.py', date_time=date_time)
|
||||
zf.writestr(zinfo, script_bytes)
|
||||
else:
|
||||
zf.writestr('__main__.py', script_bytes)
|
||||
zip_data = stream.getvalue()
|
||||
script_bytes = launcher + shebang + zip_data
|
||||
for name in names:
|
||||
@@ -379,7 +392,8 @@ class ScriptMaker(object):
|
||||
bits = '64'
|
||||
else:
|
||||
bits = '32'
|
||||
name = '%s%s.exe' % (kind, bits)
|
||||
platform_suffix = '-arm' if get_platform() == 'win-arm64' else ''
|
||||
name = '%s%s%s.exe' % (kind, bits, platform_suffix)
|
||||
# Issue 31: don't hardcode an absolute package name, but
|
||||
# determine it relative to the current package
|
||||
distlib_package = __name__.rsplit('.', 1)[0]
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -215,6 +215,10 @@ def parse_requirement(req):
|
||||
if not ver_remaining or ver_remaining[0] != ',':
|
||||
break
|
||||
ver_remaining = ver_remaining[1:].lstrip()
|
||||
# Some packages have a trailing comma which would break things
|
||||
# See issue #148
|
||||
if not ver_remaining:
|
||||
break
|
||||
m = COMPARE_OP.match(ver_remaining)
|
||||
if not m:
|
||||
raise SyntaxError('invalid constraint: %s' % ver_remaining)
|
||||
@@ -1428,29 +1432,19 @@ if ssl:
|
||||
self.sock = sock
|
||||
self._tunnel()
|
||||
|
||||
if not hasattr(ssl, 'SSLContext'):
|
||||
# For 2.x
|
||||
if self.ca_certs:
|
||||
cert_reqs = ssl.CERT_REQUIRED
|
||||
else:
|
||||
cert_reqs = ssl.CERT_NONE
|
||||
self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
|
||||
cert_reqs=cert_reqs,
|
||||
ssl_version=ssl.PROTOCOL_SSLv23,
|
||||
ca_certs=self.ca_certs)
|
||||
else: # pragma: no cover
|
||||
context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
|
||||
if hasattr(ssl, 'OP_NO_SSLv2'):
|
||||
context.options |= ssl.OP_NO_SSLv2
|
||||
if self.cert_file:
|
||||
context.load_cert_chain(self.cert_file, self.key_file)
|
||||
kwargs = {}
|
||||
if self.ca_certs:
|
||||
context.verify_mode = ssl.CERT_REQUIRED
|
||||
context.load_verify_locations(cafile=self.ca_certs)
|
||||
if getattr(ssl, 'HAS_SNI', False):
|
||||
kwargs['server_hostname'] = self.host
|
||||
self.sock = context.wrap_socket(sock, **kwargs)
|
||||
context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
|
||||
if hasattr(ssl, 'OP_NO_SSLv2'):
|
||||
context.options |= ssl.OP_NO_SSLv2
|
||||
if self.cert_file:
|
||||
context.load_cert_chain(self.cert_file, self.key_file)
|
||||
kwargs = {}
|
||||
if self.ca_certs:
|
||||
context.verify_mode = ssl.CERT_REQUIRED
|
||||
context.load_verify_locations(cafile=self.ca_certs)
|
||||
if getattr(ssl, 'HAS_SNI', False):
|
||||
kwargs['server_hostname'] = self.host
|
||||
|
||||
self.sock = context.wrap_socket(sock, **kwargs)
|
||||
if self.ca_certs and self.check_domain:
|
||||
try:
|
||||
match_hostname(self.sock.getpeercert(), self.host)
|
||||
@@ -1509,25 +1503,6 @@ if ssl:
|
||||
#
|
||||
# XML-RPC with timeouts
|
||||
#
|
||||
|
||||
_ver_info = sys.version_info[:2]
|
||||
|
||||
if _ver_info == (2, 6):
|
||||
class HTTP(httplib.HTTP):
|
||||
def __init__(self, host='', port=None, **kwargs):
|
||||
if port == 0: # 0 means use port 0, not the default port
|
||||
port = None
|
||||
self._setup(self._connection_class(host, port, **kwargs))
|
||||
|
||||
|
||||
if ssl:
|
||||
class HTTPS(httplib.HTTPS):
|
||||
def __init__(self, host='', port=None, **kwargs):
|
||||
if port == 0: # 0 means use port 0, not the default port
|
||||
port = None
|
||||
self._setup(self._connection_class(host, port, **kwargs))
|
||||
|
||||
|
||||
class Transport(xmlrpclib.Transport):
|
||||
def __init__(self, timeout, use_datetime=0):
|
||||
self.timeout = timeout
|
||||
@@ -1535,14 +1510,10 @@ class Transport(xmlrpclib.Transport):
|
||||
|
||||
def make_connection(self, host):
|
||||
h, eh, x509 = self.get_host_info(host)
|
||||
if _ver_info == (2, 6):
|
||||
result = HTTP(h, timeout=self.timeout)
|
||||
else:
|
||||
if not self._connection or host != self._connection[0]:
|
||||
self._extra_headers = eh
|
||||
self._connection = host, httplib.HTTPConnection(h)
|
||||
result = self._connection[1]
|
||||
return result
|
||||
if not self._connection or host != self._connection[0]:
|
||||
self._extra_headers = eh
|
||||
self._connection = host, httplib.HTTPConnection(h)
|
||||
return self._connection[1]
|
||||
|
||||
if ssl:
|
||||
class SafeTransport(xmlrpclib.SafeTransport):
|
||||
@@ -1555,15 +1526,11 @@ if ssl:
|
||||
if not kwargs:
|
||||
kwargs = {}
|
||||
kwargs['timeout'] = self.timeout
|
||||
if _ver_info == (2, 6):
|
||||
result = HTTPS(host, None, **kwargs)
|
||||
else:
|
||||
if not self._connection or host != self._connection[0]:
|
||||
self._extra_headers = eh
|
||||
self._connection = host, httplib.HTTPSConnection(h, None,
|
||||
**kwargs)
|
||||
result = self._connection[1]
|
||||
return result
|
||||
if not self._connection or host != self._connection[0]:
|
||||
self._extra_headers = eh
|
||||
self._connection = host, httplib.HTTPSConnection(h, None,
|
||||
**kwargs)
|
||||
return self._connection[1]
|
||||
|
||||
|
||||
class ServerProxy(xmlrpclib.ServerProxy):
|
||||
|
||||
@@ -194,7 +194,7 @@ def _pep_440_key(s):
|
||||
if not groups[0]:
|
||||
epoch = 0
|
||||
else:
|
||||
epoch = int(groups[0])
|
||||
epoch = int(groups[0][:-1])
|
||||
pre = groups[4:6]
|
||||
post = groups[7:9]
|
||||
dev = groups[10:12]
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -11,7 +11,6 @@ import codecs
|
||||
import datetime
|
||||
from email import message_from_file
|
||||
import hashlib
|
||||
import imp
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -47,10 +46,7 @@ else:
|
||||
|
||||
VER_SUFFIX = sysconfig.get_config_var('py_version_nodot')
|
||||
if not VER_SUFFIX: # pragma: no cover
|
||||
if sys.version_info[1] >= 10:
|
||||
VER_SUFFIX = '%s_%s' % sys.version_info[:2] # PEP 641 (draft)
|
||||
else:
|
||||
VER_SUFFIX = '%s%s' % sys.version_info[:2]
|
||||
VER_SUFFIX = '%s%s' % sys.version_info[:2]
|
||||
PYVER = 'py' + VER_SUFFIX
|
||||
IMPVER = IMP_PREFIX + VER_SUFFIX
|
||||
|
||||
@@ -64,10 +60,18 @@ else:
|
||||
parts = ['cp', VER_SUFFIX]
|
||||
if sysconfig.get_config_var('Py_DEBUG'):
|
||||
parts.append('d')
|
||||
if sysconfig.get_config_var('WITH_PYMALLOC'):
|
||||
parts.append('m')
|
||||
if sysconfig.get_config_var('Py_UNICODE_SIZE') == 4:
|
||||
parts.append('u')
|
||||
if IMP_PREFIX == 'cp':
|
||||
vi = sys.version_info[:2]
|
||||
if vi < (3, 8):
|
||||
wpm = sysconfig.get_config_var('WITH_PYMALLOC')
|
||||
if wpm is None:
|
||||
wpm = True
|
||||
if wpm:
|
||||
parts.append('m')
|
||||
if vi < (3, 3):
|
||||
us = sysconfig.get_config_var('Py_UNICODE_SIZE')
|
||||
if us == 4 or (us is None and sys.maxunicode == 0x10FFFF):
|
||||
parts.append('u')
|
||||
return ''.join(parts)
|
||||
ABI = _derive_abi()
|
||||
del _derive_abi
|
||||
@@ -98,6 +102,29 @@ if os.sep == '/':
|
||||
else:
|
||||
to_posix = lambda o: o.replace(os.sep, '/')
|
||||
|
||||
if sys.version_info[0] < 3:
|
||||
import imp
|
||||
else:
|
||||
imp = None
|
||||
import importlib.machinery
|
||||
import importlib.util
|
||||
|
||||
def _get_suffixes():
|
||||
if imp:
|
||||
return [s[0] for s in imp.get_suffixes()]
|
||||
else:
|
||||
return importlib.machinery.EXTENSION_SUFFIXES
|
||||
|
||||
def _load_dynamic(name, path):
|
||||
# https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
|
||||
if imp:
|
||||
return imp.load_dynamic(name, path)
|
||||
else:
|
||||
spec = importlib.util.spec_from_file_location(name, path)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
sys.modules[name] = module
|
||||
spec.loader.exec_module(module)
|
||||
return module
|
||||
|
||||
class Mounter(object):
|
||||
def __init__(self):
|
||||
@@ -127,7 +154,7 @@ class Mounter(object):
|
||||
else:
|
||||
if fullname not in self.libs:
|
||||
raise ImportError('unable to find extension for %s' % fullname)
|
||||
result = imp.load_dynamic(fullname, self.libs[fullname])
|
||||
result = _load_dynamic(fullname, self.libs[fullname])
|
||||
result.__loader__ = self
|
||||
parts = fullname.rsplit('.', 1)
|
||||
if len(parts) > 1:
|
||||
@@ -304,10 +331,9 @@ class Wheel(object):
|
||||
result = base64.urlsafe_b64encode(result).rstrip(b'=').decode('ascii')
|
||||
return hash_kind, result
|
||||
|
||||
def write_record(self, records, record_path, base):
|
||||
def write_record(self, records, record_path, archive_record_path):
|
||||
records = list(records) # make a copy, as mutated
|
||||
p = to_posix(os.path.relpath(record_path, base))
|
||||
records.append((p, '', ''))
|
||||
records.append((archive_record_path, '', ''))
|
||||
with CSVWriter(record_path) as writer:
|
||||
for row in records:
|
||||
writer.writerow(row)
|
||||
@@ -324,8 +350,8 @@ class Wheel(object):
|
||||
records.append((ap, digest, size))
|
||||
|
||||
p = os.path.join(distinfo, 'RECORD')
|
||||
self.write_record(records, p, libdir)
|
||||
ap = to_posix(os.path.join(info_dir, 'RECORD'))
|
||||
self.write_record(records, p, ap)
|
||||
archive_paths.append((ap, p))
|
||||
|
||||
def build_zip(self, pathname, archive_paths):
|
||||
@@ -968,7 +994,7 @@ def compatible_tags():
|
||||
versions.append(''.join([major, str(minor)]))
|
||||
|
||||
abis = []
|
||||
for suffix, _, _ in imp.get_suffixes():
|
||||
for suffix in _get_suffixes():
|
||||
if suffix.startswith('.abi'):
|
||||
abis.append(suffix.split('.', 2)[1])
|
||||
abis.sort()
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,35 +0,0 @@
|
||||
"""
|
||||
HTML parsing library based on the `WHATWG HTML specification
|
||||
<https://whatwg.org/html>`_. The parser is designed to be compatible with
|
||||
existing HTML found in the wild and implements well-defined error recovery that
|
||||
is largely compatible with modern desktop web browsers.
|
||||
|
||||
Example usage::
|
||||
|
||||
from pip._vendor import html5lib
|
||||
with open("my_document.html", "rb") as f:
|
||||
tree = html5lib.parse(f)
|
||||
|
||||
For convenience, this module re-exports the following names:
|
||||
|
||||
* :func:`~.html5parser.parse`
|
||||
* :func:`~.html5parser.parseFragment`
|
||||
* :class:`~.html5parser.HTMLParser`
|
||||
* :func:`~.treebuilders.getTreeBuilder`
|
||||
* :func:`~.treewalkers.getTreeWalker`
|
||||
* :func:`~.serializer.serialize`
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from .html5parser import HTMLParser, parse, parseFragment
|
||||
from .treebuilders import getTreeBuilder
|
||||
from .treewalkers import getTreeWalker
|
||||
from .serializer import serialize
|
||||
|
||||
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
|
||||
"getTreeWalker", "serialize"]
|
||||
|
||||
# this has to be at the top level, see how setup.py parses this
|
||||
#: Distribution version number.
|
||||
__version__ = "1.1"
|
||||
@@ -1,289 +0,0 @@
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import re
|
||||
import warnings
|
||||
|
||||
from .constants import DataLossWarning
|
||||
|
||||
baseChar = """
|
||||
[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
|
||||
[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
|
||||
[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
|
||||
[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
|
||||
[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
|
||||
[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
|
||||
[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
|
||||
[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
|
||||
[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
|
||||
[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
|
||||
[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
|
||||
[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
|
||||
[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
|
||||
[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
|
||||
[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
|
||||
[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
|
||||
[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
|
||||
[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
|
||||
[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
|
||||
[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
|
||||
[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
|
||||
[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
|
||||
[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
|
||||
[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
|
||||
[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
|
||||
[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
|
||||
[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
|
||||
[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
|
||||
[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
|
||||
[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
|
||||
#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
|
||||
#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
|
||||
#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
|
||||
[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
|
||||
[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
|
||||
#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
|
||||
[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
|
||||
[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
|
||||
[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
|
||||
[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
|
||||
[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
|
||||
#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
|
||||
[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
|
||||
[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
|
||||
[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
|
||||
[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
|
||||
|
||||
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
|
||||
|
||||
combiningCharacter = """
|
||||
[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
|
||||
[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
|
||||
[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
|
||||
[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
|
||||
#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
|
||||
[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
|
||||
[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
|
||||
#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
|
||||
[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
|
||||
[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
|
||||
#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
|
||||
[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
|
||||
[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
|
||||
[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
|
||||
[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
|
||||
[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
|
||||
#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
|
||||
[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
|
||||
#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
|
||||
[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
|
||||
[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
|
||||
#x3099 | #x309A"""
|
||||
|
||||
digit = """
|
||||
[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
|
||||
[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
|
||||
[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
|
||||
[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
|
||||
|
||||
extender = """
|
||||
#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
|
||||
#[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
|
||||
|
||||
letter = " | ".join([baseChar, ideographic])
|
||||
|
||||
# Without the
|
||||
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
|
||||
extender])
|
||||
nameFirst = " | ".join([letter, "_"])
|
||||
|
||||
reChar = re.compile(r"#x([\d|A-F]{4,4})")
|
||||
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
|
||||
|
||||
|
||||
def charStringToList(chars):
|
||||
charRanges = [item.strip() for item in chars.split(" | ")]
|
||||
rv = []
|
||||
for item in charRanges:
|
||||
foundMatch = False
|
||||
for regexp in (reChar, reCharRange):
|
||||
match = regexp.match(item)
|
||||
if match is not None:
|
||||
rv.append([hexToInt(item) for item in match.groups()])
|
||||
if len(rv[-1]) == 1:
|
||||
rv[-1] = rv[-1] * 2
|
||||
foundMatch = True
|
||||
break
|
||||
if not foundMatch:
|
||||
assert len(item) == 1
|
||||
|
||||
rv.append([ord(item)] * 2)
|
||||
rv = normaliseCharList(rv)
|
||||
return rv
|
||||
|
||||
|
||||
def normaliseCharList(charList):
|
||||
charList = sorted(charList)
|
||||
for item in charList:
|
||||
assert item[1] >= item[0]
|
||||
rv = []
|
||||
i = 0
|
||||
while i < len(charList):
|
||||
j = 1
|
||||
rv.append(charList[i])
|
||||
while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
|
||||
rv[-1][1] = charList[i + j][1]
|
||||
j += 1
|
||||
i += j
|
||||
return rv
|
||||
|
||||
|
||||
# We don't really support characters above the BMP :(
|
||||
max_unicode = int("FFFF", 16)
|
||||
|
||||
|
||||
def missingRanges(charList):
|
||||
rv = []
|
||||
if charList[0] != 0:
|
||||
rv.append([0, charList[0][0] - 1])
|
||||
for i, item in enumerate(charList[:-1]):
|
||||
rv.append([item[1] + 1, charList[i + 1][0] - 1])
|
||||
if charList[-1][1] != max_unicode:
|
||||
rv.append([charList[-1][1] + 1, max_unicode])
|
||||
return rv
|
||||
|
||||
|
||||
def listToRegexpStr(charList):
|
||||
rv = []
|
||||
for item in charList:
|
||||
if item[0] == item[1]:
|
||||
rv.append(escapeRegexp(chr(item[0])))
|
||||
else:
|
||||
rv.append(escapeRegexp(chr(item[0])) + "-" +
|
||||
escapeRegexp(chr(item[1])))
|
||||
return "[%s]" % "".join(rv)
|
||||
|
||||
|
||||
def hexToInt(hex_str):
|
||||
return int(hex_str, 16)
|
||||
|
||||
|
||||
def escapeRegexp(string):
|
||||
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
|
||||
"[", "]", "|", "(", ")", "-")
|
||||
for char in specialCharacters:
|
||||
string = string.replace(char, "\\" + char)
|
||||
|
||||
return string
|
||||
|
||||
# output from the above
|
||||
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
|
||||
|
||||
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
|
||||
|
||||
# Simpler things
|
||||
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
|
||||
|
||||
|
||||
class InfosetFilter(object):
|
||||
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
|
||||
|
||||
def __init__(self,
|
||||
dropXmlnsLocalName=False,
|
||||
dropXmlnsAttrNs=False,
|
||||
preventDoubleDashComments=False,
|
||||
preventDashAtCommentEnd=False,
|
||||
replaceFormFeedCharacters=True,
|
||||
preventSingleQuotePubid=False):
|
||||
|
||||
self.dropXmlnsLocalName = dropXmlnsLocalName
|
||||
self.dropXmlnsAttrNs = dropXmlnsAttrNs
|
||||
|
||||
self.preventDoubleDashComments = preventDoubleDashComments
|
||||
self.preventDashAtCommentEnd = preventDashAtCommentEnd
|
||||
|
||||
self.replaceFormFeedCharacters = replaceFormFeedCharacters
|
||||
|
||||
self.preventSingleQuotePubid = preventSingleQuotePubid
|
||||
|
||||
self.replaceCache = {}
|
||||
|
||||
def coerceAttribute(self, name, namespace=None):
|
||||
if self.dropXmlnsLocalName and name.startswith("xmlns:"):
|
||||
warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
|
||||
return None
|
||||
elif (self.dropXmlnsAttrNs and
|
||||
namespace == "http://www.w3.org/2000/xmlns/"):
|
||||
warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
|
||||
return None
|
||||
else:
|
||||
return self.toXmlName(name)
|
||||
|
||||
def coerceElement(self, name):
|
||||
return self.toXmlName(name)
|
||||
|
||||
def coerceComment(self, data):
|
||||
if self.preventDoubleDashComments:
|
||||
while "--" in data:
|
||||
warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
|
||||
data = data.replace("--", "- -")
|
||||
if data.endswith("-"):
|
||||
warnings.warn("Comments cannot end in a dash", DataLossWarning)
|
||||
data += " "
|
||||
return data
|
||||
|
||||
def coerceCharacters(self, data):
|
||||
if self.replaceFormFeedCharacters:
|
||||
for _ in range(data.count("\x0C")):
|
||||
warnings.warn("Text cannot contain U+000C", DataLossWarning)
|
||||
data = data.replace("\x0C", " ")
|
||||
# Other non-xml characters
|
||||
return data
|
||||
|
||||
def coercePubid(self, data):
|
||||
dataOutput = data
|
||||
for char in nonPubidCharRegexp.findall(data):
|
||||
warnings.warn("Coercing non-XML pubid", DataLossWarning)
|
||||
replacement = self.getReplacementCharacter(char)
|
||||
dataOutput = dataOutput.replace(char, replacement)
|
||||
if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
|
||||
warnings.warn("Pubid cannot contain single quote", DataLossWarning)
|
||||
dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
|
||||
return dataOutput
|
||||
|
||||
def toXmlName(self, name):
|
||||
nameFirst = name[0]
|
||||
nameRest = name[1:]
|
||||
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
|
||||
if m:
|
||||
warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
|
||||
nameFirstOutput = self.getReplacementCharacter(nameFirst)
|
||||
else:
|
||||
nameFirstOutput = nameFirst
|
||||
|
||||
nameRestOutput = nameRest
|
||||
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
|
||||
for char in replaceChars:
|
||||
warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
|
||||
replacement = self.getReplacementCharacter(char)
|
||||
nameRestOutput = nameRestOutput.replace(char, replacement)
|
||||
return nameFirstOutput + nameRestOutput
|
||||
|
||||
def getReplacementCharacter(self, char):
|
||||
if char in self.replaceCache:
|
||||
replacement = self.replaceCache[char]
|
||||
else:
|
||||
replacement = self.escapeChar(char)
|
||||
return replacement
|
||||
|
||||
def fromXmlName(self, name):
|
||||
for item in set(self.replacementRegexp.findall(name)):
|
||||
name = name.replace(item, self.unescapeChar(item))
|
||||
return name
|
||||
|
||||
def escapeChar(self, char):
|
||||
replacement = "U%05X" % ord(char)
|
||||
self.replaceCache[char] = replacement
|
||||
return replacement
|
||||
|
||||
def unescapeChar(self, charcode):
|
||||
return chr(int(charcode[1:], 16))
|
||||
@@ -1,918 +0,0 @@
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from pip._vendor.six import text_type
|
||||
from pip._vendor.six.moves import http_client, urllib
|
||||
|
||||
import codecs
|
||||
import re
|
||||
from io import BytesIO, StringIO
|
||||
|
||||
from pip._vendor import webencodings
|
||||
|
||||
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
||||
from .constants import _ReparseException
|
||||
from . import _utils
|
||||
|
||||
# Non-unicode versions of constants for use in the pre-parser
|
||||
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
|
||||
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
|
||||
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
|
||||
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
|
||||
|
||||
|
||||
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
|
||||
|
||||
if _utils.supports_lone_surrogates:
|
||||
# Use one extra step of indirection and create surrogates with
|
||||
# eval. Not using this indirection would introduce an illegal
|
||||
# unicode literal on platforms not supporting such lone
|
||||
# surrogates.
|
||||
assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
|
||||
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
|
||||
eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
|
||||
"]")
|
||||
else:
|
||||
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
|
||||
|
||||
non_bmp_invalid_codepoints = {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
||||
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
|
||||
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
|
||||
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
|
||||
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
|
||||
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
|
||||
0x10FFFE, 0x10FFFF}
|
||||
|
||||
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
|
||||
|
||||
# Cache for charsUntil()
|
||||
charsUntilRegEx = {}
|
||||
|
||||
|
||||
class BufferedStream(object):
|
||||
"""Buffering for streams that do not have buffering of their own
|
||||
|
||||
The buffer is implemented as a list of chunks on the assumption that
|
||||
joining many strings will be slow since it is O(n**2)
|
||||
"""
|
||||
|
||||
def __init__(self, stream):
|
||||
self.stream = stream
|
||||
self.buffer = []
|
||||
self.position = [-1, 0] # chunk number, offset
|
||||
|
||||
def tell(self):
|
||||
pos = 0
|
||||
for chunk in self.buffer[:self.position[0]]:
|
||||
pos += len(chunk)
|
||||
pos += self.position[1]
|
||||
return pos
|
||||
|
||||
def seek(self, pos):
|
||||
assert pos <= self._bufferedBytes()
|
||||
offset = pos
|
||||
i = 0
|
||||
while len(self.buffer[i]) < offset:
|
||||
offset -= len(self.buffer[i])
|
||||
i += 1
|
||||
self.position = [i, offset]
|
||||
|
||||
def read(self, bytes):
|
||||
if not self.buffer:
|
||||
return self._readStream(bytes)
|
||||
elif (self.position[0] == len(self.buffer) and
|
||||
self.position[1] == len(self.buffer[-1])):
|
||||
return self._readStream(bytes)
|
||||
else:
|
||||
return self._readFromBuffer(bytes)
|
||||
|
||||
def _bufferedBytes(self):
|
||||
return sum([len(item) for item in self.buffer])
|
||||
|
||||
def _readStream(self, bytes):
|
||||
data = self.stream.read(bytes)
|
||||
self.buffer.append(data)
|
||||
self.position[0] += 1
|
||||
self.position[1] = len(data)
|
||||
return data
|
||||
|
||||
def _readFromBuffer(self, bytes):
|
||||
remainingBytes = bytes
|
||||
rv = []
|
||||
bufferIndex = self.position[0]
|
||||
bufferOffset = self.position[1]
|
||||
while bufferIndex < len(self.buffer) and remainingBytes != 0:
|
||||
assert remainingBytes > 0
|
||||
bufferedData = self.buffer[bufferIndex]
|
||||
|
||||
if remainingBytes <= len(bufferedData) - bufferOffset:
|
||||
bytesToRead = remainingBytes
|
||||
self.position = [bufferIndex, bufferOffset + bytesToRead]
|
||||
else:
|
||||
bytesToRead = len(bufferedData) - bufferOffset
|
||||
self.position = [bufferIndex, len(bufferedData)]
|
||||
bufferIndex += 1
|
||||
rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
|
||||
remainingBytes -= bytesToRead
|
||||
|
||||
bufferOffset = 0
|
||||
|
||||
if remainingBytes:
|
||||
rv.append(self._readStream(remainingBytes))
|
||||
|
||||
return b"".join(rv)
|
||||
|
||||
|
||||
def HTMLInputStream(source, **kwargs):
|
||||
# Work around Python bug #20007: read(0) closes the connection.
|
||||
# http://bugs.python.org/issue20007
|
||||
if (isinstance(source, http_client.HTTPResponse) or
|
||||
# Also check for addinfourl wrapping HTTPResponse
|
||||
(isinstance(source, urllib.response.addbase) and
|
||||
isinstance(source.fp, http_client.HTTPResponse))):
|
||||
isUnicode = False
|
||||
elif hasattr(source, "read"):
|
||||
isUnicode = isinstance(source.read(0), text_type)
|
||||
else:
|
||||
isUnicode = isinstance(source, text_type)
|
||||
|
||||
if isUnicode:
|
||||
encodings = [x for x in kwargs if x.endswith("_encoding")]
|
||||
if encodings:
|
||||
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
|
||||
|
||||
return HTMLUnicodeInputStream(source, **kwargs)
|
||||
else:
|
||||
return HTMLBinaryInputStream(source, **kwargs)
|
||||
|
||||
|
||||
class HTMLUnicodeInputStream(object):
|
||||
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||
|
||||
This class takes care of character encoding and removing or replacing
|
||||
incorrect byte-sequences and also provides column and line tracking.
|
||||
|
||||
"""
|
||||
|
||||
_defaultChunkSize = 10240
|
||||
|
||||
def __init__(self, source):
|
||||
"""Initialises the HTMLInputStream.
|
||||
|
||||
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||
for use by html5lib.
|
||||
|
||||
source can be either a file-object, local filename or a string.
|
||||
|
||||
The optional encoding parameter must be a string that indicates
|
||||
the encoding. If specified, that encoding will be used,
|
||||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
|
||||
"""
|
||||
|
||||
if not _utils.supports_lone_surrogates:
|
||||
# Such platforms will have already checked for such
|
||||
# surrogate errors, so no need to do this checking.
|
||||
self.reportCharacterErrors = None
|
||||
elif len("\U0010FFFF") == 1:
|
||||
self.reportCharacterErrors = self.characterErrorsUCS4
|
||||
else:
|
||||
self.reportCharacterErrors = self.characterErrorsUCS2
|
||||
|
||||
# List of where new lines occur
|
||||
self.newLines = [0]
|
||||
|
||||
self.charEncoding = (lookupEncoding("utf-8"), "certain")
|
||||
self.dataStream = self.openStream(source)
|
||||
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.chunk = ""
|
||||
self.chunkSize = 0
|
||||
self.chunkOffset = 0
|
||||
self.errors = []
|
||||
|
||||
# number of (complete) lines in previous chunks
|
||||
self.prevNumLines = 0
|
||||
# number of columns in the last line of the previous chunk
|
||||
self.prevNumCols = 0
|
||||
|
||||
# Deal with CR LF and surrogates split over chunk boundaries
|
||||
self._bufferedCharacter = None
|
||||
|
||||
def openStream(self, source):
|
||||
"""Produces a file object from source.
|
||||
|
||||
source can be either a file object, local filename or a string.
|
||||
|
||||
"""
|
||||
# Already a file object
|
||||
if hasattr(source, 'read'):
|
||||
stream = source
|
||||
else:
|
||||
stream = StringIO(source)
|
||||
|
||||
return stream
|
||||
|
||||
def _position(self, offset):
|
||||
chunk = self.chunk
|
||||
nLines = chunk.count('\n', 0, offset)
|
||||
positionLine = self.prevNumLines + nLines
|
||||
lastLinePos = chunk.rfind('\n', 0, offset)
|
||||
if lastLinePos == -1:
|
||||
positionColumn = self.prevNumCols + offset
|
||||
else:
|
||||
positionColumn = offset - (lastLinePos + 1)
|
||||
return (positionLine, positionColumn)
|
||||
|
||||
def position(self):
|
||||
"""Returns (line, col) of the current position in the stream."""
|
||||
line, col = self._position(self.chunkOffset)
|
||||
return (line + 1, col)
|
||||
|
||||
def char(self):
|
||||
""" Read one character from the stream or queue if available. Return
|
||||
EOF when EOF is reached.
|
||||
"""
|
||||
# Read a new chunk from the input stream if necessary
|
||||
if self.chunkOffset >= self.chunkSize:
|
||||
if not self.readChunk():
|
||||
return EOF
|
||||
|
||||
chunkOffset = self.chunkOffset
|
||||
char = self.chunk[chunkOffset]
|
||||
self.chunkOffset = chunkOffset + 1
|
||||
|
||||
return char
|
||||
|
||||
def readChunk(self, chunkSize=None):
|
||||
if chunkSize is None:
|
||||
chunkSize = self._defaultChunkSize
|
||||
|
||||
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
|
||||
|
||||
self.chunk = ""
|
||||
self.chunkSize = 0
|
||||
self.chunkOffset = 0
|
||||
|
||||
data = self.dataStream.read(chunkSize)
|
||||
|
||||
# Deal with CR LF and surrogates broken across chunks
|
||||
if self._bufferedCharacter:
|
||||
data = self._bufferedCharacter + data
|
||||
self._bufferedCharacter = None
|
||||
elif not data:
|
||||
# We have no more data, bye-bye stream
|
||||
return False
|
||||
|
||||
if len(data) > 1:
|
||||
lastv = ord(data[-1])
|
||||
if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
|
||||
self._bufferedCharacter = data[-1]
|
||||
data = data[:-1]
|
||||
|
||||
if self.reportCharacterErrors:
|
||||
self.reportCharacterErrors(data)
|
||||
|
||||
# Replace invalid characters
|
||||
data = data.replace("\r\n", "\n")
|
||||
data = data.replace("\r", "\n")
|
||||
|
||||
self.chunk = data
|
||||
self.chunkSize = len(data)
|
||||
|
||||
return True
|
||||
|
||||
def characterErrorsUCS4(self, data):
|
||||
for _ in range(len(invalid_unicode_re.findall(data))):
|
||||
self.errors.append("invalid-codepoint")
|
||||
|
||||
def characterErrorsUCS2(self, data):
|
||||
# Someone picked the wrong compile option
|
||||
# You lose
|
||||
skip = False
|
||||
for match in invalid_unicode_re.finditer(data):
|
||||
if skip:
|
||||
continue
|
||||
codepoint = ord(match.group())
|
||||
pos = match.start()
|
||||
# Pretty sure there should be endianness issues here
|
||||
if _utils.isSurrogatePair(data[pos:pos + 2]):
|
||||
# We have a surrogate pair!
|
||||
char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
|
||||
if char_val in non_bmp_invalid_codepoints:
|
||||
self.errors.append("invalid-codepoint")
|
||||
skip = True
|
||||
elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
|
||||
pos == len(data) - 1):
|
||||
self.errors.append("invalid-codepoint")
|
||||
else:
|
||||
skip = False
|
||||
self.errors.append("invalid-codepoint")
|
||||
|
||||
def charsUntil(self, characters, opposite=False):
|
||||
""" Returns a string of characters from the stream up to but not
|
||||
including any character in 'characters' or EOF. 'characters' must be
|
||||
a container that supports the 'in' method and iteration over its
|
||||
characters.
|
||||
"""
|
||||
|
||||
# Use a cache of regexps to find the required characters
|
||||
try:
|
||||
chars = charsUntilRegEx[(characters, opposite)]
|
||||
except KeyError:
|
||||
if __debug__:
|
||||
for c in characters:
|
||||
assert(ord(c) < 128)
|
||||
regex = "".join(["\\x%02x" % ord(c) for c in characters])
|
||||
if not opposite:
|
||||
regex = "^%s" % regex
|
||||
chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
|
||||
|
||||
rv = []
|
||||
|
||||
while True:
|
||||
# Find the longest matching prefix
|
||||
m = chars.match(self.chunk, self.chunkOffset)
|
||||
if m is None:
|
||||
# If nothing matched, and it wasn't because we ran out of chunk,
|
||||
# then stop
|
||||
if self.chunkOffset != self.chunkSize:
|
||||
break
|
||||
else:
|
||||
end = m.end()
|
||||
# If not the whole chunk matched, return everything
|
||||
# up to the part that didn't match
|
||||
if end != self.chunkSize:
|
||||
rv.append(self.chunk[self.chunkOffset:end])
|
||||
self.chunkOffset = end
|
||||
break
|
||||
# If the whole remainder of the chunk matched,
|
||||
# use it all and read the next chunk
|
||||
rv.append(self.chunk[self.chunkOffset:])
|
||||
if not self.readChunk():
|
||||
# Reached EOF
|
||||
break
|
||||
|
||||
r = "".join(rv)
|
||||
return r
|
||||
|
||||
def unget(self, char):
|
||||
# Only one character is allowed to be ungotten at once - it must
|
||||
# be consumed again before any further call to unget
|
||||
if char is not EOF:
|
||||
if self.chunkOffset == 0:
|
||||
# unget is called quite rarely, so it's a good idea to do
|
||||
# more work here if it saves a bit of work in the frequently
|
||||
# called char and charsUntil.
|
||||
# So, just prepend the ungotten character onto the current
|
||||
# chunk:
|
||||
self.chunk = char + self.chunk
|
||||
self.chunkSize += 1
|
||||
else:
|
||||
self.chunkOffset -= 1
|
||||
assert self.chunk[self.chunkOffset] == char
|
||||
|
||||
|
||||
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
||||
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||
|
||||
This class takes care of character encoding and removing or replacing
|
||||
incorrect byte-sequences and also provides column and line tracking.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, source, override_encoding=None, transport_encoding=None,
|
||||
same_origin_parent_encoding=None, likely_encoding=None,
|
||||
default_encoding="windows-1252", useChardet=True):
|
||||
"""Initialises the HTMLInputStream.
|
||||
|
||||
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||
for use by html5lib.
|
||||
|
||||
source can be either a file-object, local filename or a string.
|
||||
|
||||
The optional encoding parameter must be a string that indicates
|
||||
the encoding. If specified, that encoding will be used,
|
||||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
|
||||
"""
|
||||
# Raw Stream - for unicode objects this will encode to utf-8 and set
|
||||
# self.charEncoding as appropriate
|
||||
self.rawStream = self.openStream(source)
|
||||
|
||||
HTMLUnicodeInputStream.__init__(self, self.rawStream)
|
||||
|
||||
# Encoding Information
|
||||
# Number of bytes to use when looking for a meta element with
|
||||
# encoding information
|
||||
self.numBytesMeta = 1024
|
||||
# Number of bytes to use when using detecting encoding using chardet
|
||||
self.numBytesChardet = 100
|
||||
# Things from args
|
||||
self.override_encoding = override_encoding
|
||||
self.transport_encoding = transport_encoding
|
||||
self.same_origin_parent_encoding = same_origin_parent_encoding
|
||||
self.likely_encoding = likely_encoding
|
||||
self.default_encoding = default_encoding
|
||||
|
||||
# Determine encoding
|
||||
self.charEncoding = self.determineEncoding(useChardet)
|
||||
assert self.charEncoding[0] is not None
|
||||
|
||||
# Call superclass
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
|
||||
HTMLUnicodeInputStream.reset(self)
|
||||
|
||||
def openStream(self, source):
|
||||
"""Produces a file object from source.
|
||||
|
||||
source can be either a file object, local filename or a string.
|
||||
|
||||
"""
|
||||
# Already a file object
|
||||
if hasattr(source, 'read'):
|
||||
stream = source
|
||||
else:
|
||||
stream = BytesIO(source)
|
||||
|
||||
try:
|
||||
stream.seek(stream.tell())
|
||||
except Exception:
|
||||
stream = BufferedStream(stream)
|
||||
|
||||
return stream
|
||||
|
||||
def determineEncoding(self, chardet=True):
|
||||
# BOMs take precedence over everything
|
||||
# This will also read past the BOM if present
|
||||
charEncoding = self.detectBOM(), "certain"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# If we've been overridden, we've been overridden
|
||||
charEncoding = lookupEncoding(self.override_encoding), "certain"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# Now check the transport layer
|
||||
charEncoding = lookupEncoding(self.transport_encoding), "certain"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# Look for meta elements with encoding information
|
||||
charEncoding = self.detectEncodingMeta(), "tentative"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# Parent document encoding
|
||||
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
|
||||
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
|
||||
return charEncoding
|
||||
|
||||
# "likely" encoding
|
||||
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# Guess with chardet, if available
|
||||
if chardet:
|
||||
try:
|
||||
from pip._vendor.chardet.universaldetector import UniversalDetector
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
buffers = []
|
||||
detector = UniversalDetector()
|
||||
while not detector.done:
|
||||
buffer = self.rawStream.read(self.numBytesChardet)
|
||||
assert isinstance(buffer, bytes)
|
||||
if not buffer:
|
||||
break
|
||||
buffers.append(buffer)
|
||||
detector.feed(buffer)
|
||||
detector.close()
|
||||
encoding = lookupEncoding(detector.result['encoding'])
|
||||
self.rawStream.seek(0)
|
||||
if encoding is not None:
|
||||
return encoding, "tentative"
|
||||
|
||||
# Try the default encoding
|
||||
charEncoding = lookupEncoding(self.default_encoding), "tentative"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# Fallback to html5lib's default if even that hasn't worked
|
||||
return lookupEncoding("windows-1252"), "tentative"
|
||||
|
||||
def changeEncoding(self, newEncoding):
|
||||
assert self.charEncoding[1] != "certain"
|
||||
newEncoding = lookupEncoding(newEncoding)
|
||||
if newEncoding is None:
|
||||
return
|
||||
if newEncoding.name in ("utf-16be", "utf-16le"):
|
||||
newEncoding = lookupEncoding("utf-8")
|
||||
assert newEncoding is not None
|
||||
elif newEncoding == self.charEncoding[0]:
|
||||
self.charEncoding = (self.charEncoding[0], "certain")
|
||||
else:
|
||||
self.rawStream.seek(0)
|
||||
self.charEncoding = (newEncoding, "certain")
|
||||
self.reset()
|
||||
raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
|
||||
|
||||
def detectBOM(self):
|
||||
"""Attempts to detect at BOM at the start of the stream. If
|
||||
an encoding can be determined from the BOM return the name of the
|
||||
encoding otherwise return None"""
|
||||
bomDict = {
|
||||
codecs.BOM_UTF8: 'utf-8',
|
||||
codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
|
||||
codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
|
||||
}
|
||||
|
||||
# Go to beginning of file and read in 4 bytes
|
||||
string = self.rawStream.read(4)
|
||||
assert isinstance(string, bytes)
|
||||
|
||||
# Try detecting the BOM using bytes from the string
|
||||
encoding = bomDict.get(string[:3]) # UTF-8
|
||||
seek = 3
|
||||
if not encoding:
|
||||
# Need to detect UTF-32 before UTF-16
|
||||
encoding = bomDict.get(string) # UTF-32
|
||||
seek = 4
|
||||
if not encoding:
|
||||
encoding = bomDict.get(string[:2]) # UTF-16
|
||||
seek = 2
|
||||
|
||||
# Set the read position past the BOM if one was found, otherwise
|
||||
# set it to the start of the stream
|
||||
if encoding:
|
||||
self.rawStream.seek(seek)
|
||||
return lookupEncoding(encoding)
|
||||
else:
|
||||
self.rawStream.seek(0)
|
||||
return None
|
||||
|
||||
def detectEncodingMeta(self):
|
||||
"""Report the encoding declared by the meta element
|
||||
"""
|
||||
buffer = self.rawStream.read(self.numBytesMeta)
|
||||
assert isinstance(buffer, bytes)
|
||||
parser = EncodingParser(buffer)
|
||||
self.rawStream.seek(0)
|
||||
encoding = parser.getEncoding()
|
||||
|
||||
if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
|
||||
encoding = lookupEncoding("utf-8")
|
||||
|
||||
return encoding
|
||||
|
||||
|
||||
class EncodingBytes(bytes):
|
||||
"""String-like object with an associated position and various extra methods
|
||||
If the position is ever greater than the string length then an exception is
|
||||
raised"""
|
||||
def __new__(self, value):
|
||||
assert isinstance(value, bytes)
|
||||
return bytes.__new__(self, value.lower())
|
||||
|
||||
def __init__(self, value):
|
||||
# pylint:disable=unused-argument
|
||||
self._position = -1
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
p = self._position = self._position + 1
|
||||
if p >= len(self):
|
||||
raise StopIteration
|
||||
elif p < 0:
|
||||
raise TypeError
|
||||
return self[p:p + 1]
|
||||
|
||||
def next(self):
|
||||
# Py2 compat
|
||||
return self.__next__()
|
||||
|
||||
def previous(self):
|
||||
p = self._position
|
||||
if p >= len(self):
|
||||
raise StopIteration
|
||||
elif p < 0:
|
||||
raise TypeError
|
||||
self._position = p = p - 1
|
||||
return self[p:p + 1]
|
||||
|
||||
def setPosition(self, position):
|
||||
if self._position >= len(self):
|
||||
raise StopIteration
|
||||
self._position = position
|
||||
|
||||
def getPosition(self):
|
||||
if self._position >= len(self):
|
||||
raise StopIteration
|
||||
if self._position >= 0:
|
||||
return self._position
|
||||
else:
|
||||
return None
|
||||
|
||||
position = property(getPosition, setPosition)
|
||||
|
||||
def getCurrentByte(self):
|
||||
return self[self.position:self.position + 1]
|
||||
|
||||
currentByte = property(getCurrentByte)
|
||||
|
||||
def skip(self, chars=spaceCharactersBytes):
|
||||
"""Skip past a list of characters"""
|
||||
p = self.position # use property for the error-checking
|
||||
while p < len(self):
|
||||
c = self[p:p + 1]
|
||||
if c not in chars:
|
||||
self._position = p
|
||||
return c
|
||||
p += 1
|
||||
self._position = p
|
||||
return None
|
||||
|
||||
def skipUntil(self, chars):
|
||||
p = self.position
|
||||
while p < len(self):
|
||||
c = self[p:p + 1]
|
||||
if c in chars:
|
||||
self._position = p
|
||||
return c
|
||||
p += 1
|
||||
self._position = p
|
||||
return None
|
||||
|
||||
def matchBytes(self, bytes):
|
||||
"""Look for a sequence of bytes at the start of a string. If the bytes
|
||||
are found return True and advance the position to the byte after the
|
||||
match. Otherwise return False and leave the position alone"""
|
||||
rv = self.startswith(bytes, self.position)
|
||||
if rv:
|
||||
self.position += len(bytes)
|
||||
return rv
|
||||
|
||||
def jumpTo(self, bytes):
|
||||
"""Look for the next sequence of bytes matching a given sequence. If
|
||||
a match is found advance the position to the last byte of the match"""
|
||||
try:
|
||||
self._position = self.index(bytes, self.position) + len(bytes) - 1
|
||||
except ValueError:
|
||||
raise StopIteration
|
||||
return True
|
||||
|
||||
|
||||
class EncodingParser(object):
|
||||
"""Mini parser for detecting character encoding from meta elements"""
|
||||
|
||||
def __init__(self, data):
|
||||
"""string - the data to work on for encoding detection"""
|
||||
self.data = EncodingBytes(data)
|
||||
self.encoding = None
|
||||
|
||||
def getEncoding(self):
|
||||
if b"<meta" not in self.data:
|
||||
return None
|
||||
|
||||
methodDispatch = (
|
||||
(b"<!--", self.handleComment),
|
||||
(b"<meta", self.handleMeta),
|
||||
(b"</", self.handlePossibleEndTag),
|
||||
(b"<!", self.handleOther),
|
||||
(b"<?", self.handleOther),
|
||||
(b"<", self.handlePossibleStartTag))
|
||||
for _ in self.data:
|
||||
keepParsing = True
|
||||
try:
|
||||
self.data.jumpTo(b"<")
|
||||
except StopIteration:
|
||||
break
|
||||
for key, method in methodDispatch:
|
||||
if self.data.matchBytes(key):
|
||||
try:
|
||||
keepParsing = method()
|
||||
break
|
||||
except StopIteration:
|
||||
keepParsing = False
|
||||
break
|
||||
if not keepParsing:
|
||||
break
|
||||
|
||||
return self.encoding
|
||||
|
||||
def handleComment(self):
|
||||
"""Skip over comments"""
|
||||
return self.data.jumpTo(b"-->")
|
||||
|
||||
def handleMeta(self):
|
||||
if self.data.currentByte not in spaceCharactersBytes:
|
||||
# if we have <meta not followed by a space so just keep going
|
||||
return True
|
||||
# We have a valid meta element we want to search for attributes
|
||||
hasPragma = False
|
||||
pendingEncoding = None
|
||||
while True:
|
||||
# Try to find the next attribute after the current position
|
||||
attr = self.getAttribute()
|
||||
if attr is None:
|
||||
return True
|
||||
else:
|
||||
if attr[0] == b"http-equiv":
|
||||
hasPragma = attr[1] == b"content-type"
|
||||
if hasPragma and pendingEncoding is not None:
|
||||
self.encoding = pendingEncoding
|
||||
return False
|
||||
elif attr[0] == b"charset":
|
||||
tentativeEncoding = attr[1]
|
||||
codec = lookupEncoding(tentativeEncoding)
|
||||
if codec is not None:
|
||||
self.encoding = codec
|
||||
return False
|
||||
elif attr[0] == b"content":
|
||||
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
|
||||
tentativeEncoding = contentParser.parse()
|
||||
if tentativeEncoding is not None:
|
||||
codec = lookupEncoding(tentativeEncoding)
|
||||
if codec is not None:
|
||||
if hasPragma:
|
||||
self.encoding = codec
|
||||
return False
|
||||
else:
|
||||
pendingEncoding = codec
|
||||
|
||||
def handlePossibleStartTag(self):
|
||||
return self.handlePossibleTag(False)
|
||||
|
||||
def handlePossibleEndTag(self):
|
||||
next(self.data)
|
||||
return self.handlePossibleTag(True)
|
||||
|
||||
def handlePossibleTag(self, endTag):
|
||||
data = self.data
|
||||
if data.currentByte not in asciiLettersBytes:
|
||||
# If the next byte is not an ascii letter either ignore this
|
||||
# fragment (possible start tag case) or treat it according to
|
||||
# handleOther
|
||||
if endTag:
|
||||
data.previous()
|
||||
self.handleOther()
|
||||
return True
|
||||
|
||||
c = data.skipUntil(spacesAngleBrackets)
|
||||
if c == b"<":
|
||||
# return to the first step in the overall "two step" algorithm
|
||||
# reprocessing the < byte
|
||||
data.previous()
|
||||
else:
|
||||
# Read all attributes
|
||||
attr = self.getAttribute()
|
||||
while attr is not None:
|
||||
attr = self.getAttribute()
|
||||
return True
|
||||
|
||||
def handleOther(self):
|
||||
return self.data.jumpTo(b">")
|
||||
|
||||
def getAttribute(self):
|
||||
"""Return a name,value pair for the next attribute in the stream,
|
||||
if one is found, or None"""
|
||||
data = self.data
|
||||
# Step 1 (skip chars)
|
||||
c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
|
||||
assert c is None or len(c) == 1
|
||||
# Step 2
|
||||
if c in (b">", None):
|
||||
return None
|
||||
# Step 3
|
||||
attrName = []
|
||||
attrValue = []
|
||||
# Step 4 attribute name
|
||||
while True:
|
||||
if c == b"=" and attrName:
|
||||
break
|
||||
elif c in spaceCharactersBytes:
|
||||
# Step 6!
|
||||
c = data.skip()
|
||||
break
|
||||
elif c in (b"/", b">"):
|
||||
return b"".join(attrName), b""
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrName.append(c.lower())
|
||||
elif c is None:
|
||||
return None
|
||||
else:
|
||||
attrName.append(c)
|
||||
# Step 5
|
||||
c = next(data)
|
||||
# Step 7
|
||||
if c != b"=":
|
||||
data.previous()
|
||||
return b"".join(attrName), b""
|
||||
# Step 8
|
||||
next(data)
|
||||
# Step 9
|
||||
c = data.skip()
|
||||
# Step 10
|
||||
if c in (b"'", b'"'):
|
||||
# 10.1
|
||||
quoteChar = c
|
||||
while True:
|
||||
# 10.2
|
||||
c = next(data)
|
||||
# 10.3
|
||||
if c == quoteChar:
|
||||
next(data)
|
||||
return b"".join(attrName), b"".join(attrValue)
|
||||
# 10.4
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
# 10.5
|
||||
else:
|
||||
attrValue.append(c)
|
||||
elif c == b">":
|
||||
return b"".join(attrName), b""
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
elif c is None:
|
||||
return None
|
||||
else:
|
||||
attrValue.append(c)
|
||||
# Step 11
|
||||
while True:
|
||||
c = next(data)
|
||||
if c in spacesAngleBrackets:
|
||||
return b"".join(attrName), b"".join(attrValue)
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
elif c is None:
|
||||
return None
|
||||
else:
|
||||
attrValue.append(c)
|
||||
|
||||
|
||||
class ContentAttrParser(object):
|
||||
def __init__(self, data):
|
||||
assert isinstance(data, bytes)
|
||||
self.data = data
|
||||
|
||||
def parse(self):
|
||||
try:
|
||||
# Check if the attr name is charset
|
||||
# otherwise return
|
||||
self.data.jumpTo(b"charset")
|
||||
self.data.position += 1
|
||||
self.data.skip()
|
||||
if not self.data.currentByte == b"=":
|
||||
# If there is no = sign keep looking for attrs
|
||||
return None
|
||||
self.data.position += 1
|
||||
self.data.skip()
|
||||
# Look for an encoding between matching quote marks
|
||||
if self.data.currentByte in (b'"', b"'"):
|
||||
quoteMark = self.data.currentByte
|
||||
self.data.position += 1
|
||||
oldPosition = self.data.position
|
||||
if self.data.jumpTo(quoteMark):
|
||||
return self.data[oldPosition:self.data.position]
|
||||
else:
|
||||
return None
|
||||
else:
|
||||
# Unquoted value
|
||||
oldPosition = self.data.position
|
||||
try:
|
||||
self.data.skipUntil(spaceCharactersBytes)
|
||||
return self.data[oldPosition:self.data.position]
|
||||
except StopIteration:
|
||||
# Return the whole remaining value
|
||||
return self.data[oldPosition:]
|
||||
except StopIteration:
|
||||
return None
|
||||
|
||||
|
||||
def lookupEncoding(encoding):
|
||||
"""Return the python codec name corresponding to an encoding or None if the
|
||||
string doesn't correspond to a valid encoding."""
|
||||
if isinstance(encoding, bytes):
|
||||
try:
|
||||
encoding = encoding.decode("ascii")
|
||||
except UnicodeDecodeError:
|
||||
return None
|
||||
|
||||
if encoding is not None:
|
||||
try:
|
||||
return webencodings.lookup(encoding)
|
||||
except AttributeError:
|
||||
return None
|
||||
else:
|
||||
return None
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,5 +0,0 @@
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from .py import Trie
|
||||
|
||||
__all__ = ["Trie"]
|
||||
@@ -1,40 +0,0 @@
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
try:
|
||||
from collections.abc import Mapping
|
||||
except ImportError: # Python 2.7
|
||||
from collections import Mapping
|
||||
|
||||
|
||||
class Trie(Mapping):
|
||||
"""Abstract base class for tries"""
|
||||
|
||||
def keys(self, prefix=None):
|
||||
# pylint:disable=arguments-differ
|
||||
keys = super(Trie, self).keys()
|
||||
|
||||
if prefix is None:
|
||||
return set(keys)
|
||||
|
||||
return {x for x in keys if x.startswith(prefix)}
|
||||
|
||||
def has_keys_with_prefix(self, prefix):
|
||||
for key in self.keys():
|
||||
if key.startswith(prefix):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def longest_prefix(self, prefix):
|
||||
if prefix in self:
|
||||
return prefix
|
||||
|
||||
for i in range(1, len(prefix) + 1):
|
||||
if prefix[:-i] in self:
|
||||
return prefix[:-i]
|
||||
|
||||
raise KeyError(prefix)
|
||||
|
||||
def longest_prefix_item(self, prefix):
|
||||
lprefix = self.longest_prefix(prefix)
|
||||
return (lprefix, self[lprefix])
|
||||
@@ -1,67 +0,0 @@
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
from pip._vendor.six import text_type
|
||||
|
||||
from bisect import bisect_left
|
||||
|
||||
from ._base import Trie as ABCTrie
|
||||
|
||||
|
||||
class Trie(ABCTrie):
|
||||
def __init__(self, data):
|
||||
if not all(isinstance(x, text_type) for x in data.keys()):
|
||||
raise TypeError("All keys must be strings")
|
||||
|
||||
self._data = data
|
||||
self._keys = sorted(data.keys())
|
||||
self._cachestr = ""
|
||||
self._cachepoints = (0, len(data))
|
||||
|
||||
def __contains__(self, key):
|
||||
return key in self._data
|
||||
|
||||
def __len__(self):
|
||||
return len(self._data)
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self._data)
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self._data[key]
|
||||
|
||||
def keys(self, prefix=None):
|
||||
if prefix is None or prefix == "" or not self._keys:
|
||||
return set(self._keys)
|
||||
|
||||
if prefix.startswith(self._cachestr):
|
||||
lo, hi = self._cachepoints
|
||||
start = i = bisect_left(self._keys, prefix, lo, hi)
|
||||
else:
|
||||
start = i = bisect_left(self._keys, prefix)
|
||||
|
||||
keys = set()
|
||||
if start == len(self._keys):
|
||||
return keys
|
||||
|
||||
while self._keys[i].startswith(prefix):
|
||||
keys.add(self._keys[i])
|
||||
i += 1
|
||||
|
||||
self._cachestr = prefix
|
||||
self._cachepoints = (start, i)
|
||||
|
||||
return keys
|
||||
|
||||
def has_keys_with_prefix(self, prefix):
|
||||
if prefix in self._data:
|
||||
return True
|
||||
|
||||
if prefix.startswith(self._cachestr):
|
||||
lo, hi = self._cachepoints
|
||||
i = bisect_left(self._keys, prefix, lo, hi)
|
||||
else:
|
||||
i = bisect_left(self._keys, prefix)
|
||||
|
||||
if i == len(self._keys):
|
||||
return False
|
||||
|
||||
return self._keys[i].startswith(prefix)
|
||||
@@ -1,159 +0,0 @@
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from types import ModuleType
|
||||
|
||||
try:
|
||||
from collections.abc import Mapping
|
||||
except ImportError:
|
||||
from collections import Mapping
|
||||
|
||||
from pip._vendor.six import text_type, PY3
|
||||
|
||||
if PY3:
|
||||
import xml.etree.ElementTree as default_etree
|
||||
else:
|
||||
try:
|
||||
import xml.etree.cElementTree as default_etree
|
||||
except ImportError:
|
||||
import xml.etree.ElementTree as default_etree
|
||||
|
||||
|
||||
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
|
||||
"surrogatePairToCodepoint", "moduleFactoryFactory",
|
||||
"supports_lone_surrogates"]
|
||||
|
||||
|
||||
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
|
||||
# caught by the below test. In general this would be any platform
|
||||
# using UTF-16 as its encoding of unicode strings, such as
|
||||
# Jython. This is because UTF-16 itself is based on the use of such
|
||||
# surrogates, and there is no mechanism to further escape such
|
||||
# escapes.
|
||||
try:
|
||||
_x = eval('"\\uD800"') # pylint:disable=eval-used
|
||||
if not isinstance(_x, text_type):
|
||||
# We need this with u"" because of http://bugs.jython.org/issue2039
|
||||
_x = eval('u"\\uD800"') # pylint:disable=eval-used
|
||||
assert isinstance(_x, text_type)
|
||||
except Exception:
|
||||
supports_lone_surrogates = False
|
||||
else:
|
||||
supports_lone_surrogates = True
|
||||
|
||||
|
||||
class MethodDispatcher(dict):
|
||||
"""Dict with 2 special properties:
|
||||
|
||||
On initiation, keys that are lists, sets or tuples are converted to
|
||||
multiple keys so accessing any one of the items in the original
|
||||
list-like object returns the matching value
|
||||
|
||||
md = MethodDispatcher({("foo", "bar"):"baz"})
|
||||
md["foo"] == "baz"
|
||||
|
||||
A default value which can be set through the default attribute.
|
||||
"""
|
||||
|
||||
def __init__(self, items=()):
|
||||
_dictEntries = []
|
||||
for name, value in items:
|
||||
if isinstance(name, (list, tuple, frozenset, set)):
|
||||
for item in name:
|
||||
_dictEntries.append((item, value))
|
||||
else:
|
||||
_dictEntries.append((name, value))
|
||||
dict.__init__(self, _dictEntries)
|
||||
assert len(self) == len(_dictEntries)
|
||||
self.default = None
|
||||
|
||||
def __getitem__(self, key):
|
||||
return dict.get(self, key, self.default)
|
||||
|
||||
def __get__(self, instance, owner=None):
|
||||
return BoundMethodDispatcher(instance, self)
|
||||
|
||||
|
||||
class BoundMethodDispatcher(Mapping):
|
||||
"""Wraps a MethodDispatcher, binding its return values to `instance`"""
|
||||
def __init__(self, instance, dispatcher):
|
||||
self.instance = instance
|
||||
self.dispatcher = dispatcher
|
||||
|
||||
def __getitem__(self, key):
|
||||
# see https://docs.python.org/3/reference/datamodel.html#object.__get__
|
||||
# on a function, __get__ is used to bind a function to an instance as a bound method
|
||||
return self.dispatcher[key].__get__(self.instance)
|
||||
|
||||
def get(self, key, default):
|
||||
if key in self.dispatcher:
|
||||
return self[key]
|
||||
else:
|
||||
return default
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.dispatcher)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dispatcher)
|
||||
|
||||
def __contains__(self, key):
|
||||
return key in self.dispatcher
|
||||
|
||||
|
||||
# Some utility functions to deal with weirdness around UCS2 vs UCS4
|
||||
# python builds
|
||||
|
||||
def isSurrogatePair(data):
|
||||
return (len(data) == 2 and
|
||||
ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
|
||||
ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
|
||||
|
||||
|
||||
def surrogatePairToCodepoint(data):
|
||||
char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
|
||||
(ord(data[1]) - 0xDC00))
|
||||
return char_val
|
||||
|
||||
# Module Factory Factory (no, this isn't Java, I know)
|
||||
# Here to stop this being duplicated all over the place.
|
||||
|
||||
|
||||
def moduleFactoryFactory(factory):
|
||||
moduleCache = {}
|
||||
|
||||
def moduleFactory(baseModule, *args, **kwargs):
|
||||
if isinstance(ModuleType.__name__, type("")):
|
||||
name = "_%s_factory" % baseModule.__name__
|
||||
else:
|
||||
name = b"_%s_factory" % baseModule.__name__
|
||||
|
||||
kwargs_tuple = tuple(kwargs.items())
|
||||
|
||||
try:
|
||||
return moduleCache[name][args][kwargs_tuple]
|
||||
except KeyError:
|
||||
mod = ModuleType(name)
|
||||
objs = factory(baseModule, *args, **kwargs)
|
||||
mod.__dict__.update(objs)
|
||||
if "name" not in moduleCache:
|
||||
moduleCache[name] = {}
|
||||
if "args" not in moduleCache[name]:
|
||||
moduleCache[name][args] = {}
|
||||
if "kwargs" not in moduleCache[name][args]:
|
||||
moduleCache[name][args][kwargs_tuple] = {}
|
||||
moduleCache[name][args][kwargs_tuple] = mod
|
||||
return mod
|
||||
|
||||
return moduleFactory
|
||||
|
||||
|
||||
def memoize(func):
|
||||
cache = {}
|
||||
|
||||
def wrapped(*args, **kwargs):
|
||||
key = (tuple(args), tuple(kwargs.items()))
|
||||
if key not in cache:
|
||||
cache[key] = func(*args, **kwargs)
|
||||
return cache[key]
|
||||
|
||||
return wrapped
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,29 +0,0 @@
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from . import base
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
|
||||
def _attr_key(attr):
|
||||
"""Return an appropriate key for an attribute for sorting
|
||||
|
||||
Attributes have a namespace that can be either ``None`` or a string. We
|
||||
can't compare the two because they're different types, so we convert
|
||||
``None`` to an empty string first.
|
||||
|
||||
"""
|
||||
return (attr[0][0] or ''), attr[0][1]
|
||||
|
||||
|
||||
class Filter(base.Filter):
|
||||
"""Alphabetizes attributes for elements"""
|
||||
def __iter__(self):
|
||||
for token in base.Filter.__iter__(self):
|
||||
if token["type"] in ("StartTag", "EmptyTag"):
|
||||
attrs = OrderedDict()
|
||||
for name, value in sorted(token["data"].items(),
|
||||
key=_attr_key):
|
||||
attrs[name] = value
|
||||
token["data"] = attrs
|
||||
yield token
|
||||
@@ -1,12 +0,0 @@
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
|
||||
class Filter(object):
|
||||
def __init__(self, source):
|
||||
self.source = source
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.source)
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self.source, name)
|
||||
@@ -1,73 +0,0 @@
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from . import base
|
||||
|
||||
|
||||
class Filter(base.Filter):
|
||||
"""Injects ``<meta charset=ENCODING>`` tag into head of document"""
|
||||
def __init__(self, source, encoding):
|
||||
"""Creates a Filter
|
||||
|
||||
:arg source: the source token stream
|
||||
|
||||
:arg encoding: the encoding to set
|
||||
|
||||
"""
|
||||
base.Filter.__init__(self, source)
|
||||
self.encoding = encoding
|
||||
|
||||
def __iter__(self):
|
||||
state = "pre_head"
|
||||
meta_found = (self.encoding is None)
|
||||
pending = []
|
||||
|
||||
for token in base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type == "StartTag":
|
||||
if token["name"].lower() == "head":
|
||||
state = "in_head"
|
||||
|
||||
elif type == "EmptyTag":
|
||||
if token["name"].lower() == "meta":
|
||||
# replace charset with actual encoding
|
||||
has_http_equiv_content_type = False
|
||||
for (namespace, name), value in token["data"].items():
|
||||
if namespace is not None:
|
||||
continue
|
||||
elif name.lower() == 'charset':
|
||||
token["data"][(namespace, name)] = self.encoding
|
||||
meta_found = True
|
||||
break
|
||||
elif name == 'http-equiv' and value.lower() == 'content-type':
|
||||
has_http_equiv_content_type = True
|
||||
else:
|
||||
if has_http_equiv_content_type and (None, "content") in token["data"]:
|
||||
token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
|
||||
meta_found = True
|
||||
|
||||
elif token["name"].lower() == "head" and not meta_found:
|
||||
# insert meta into empty head
|
||||
yield {"type": "StartTag", "name": "head",
|
||||
"data": token["data"]}
|
||||
yield {"type": "EmptyTag", "name": "meta",
|
||||
"data": {(None, "charset"): self.encoding}}
|
||||
yield {"type": "EndTag", "name": "head"}
|
||||
meta_found = True
|
||||
continue
|
||||
|
||||
elif type == "EndTag":
|
||||
if token["name"].lower() == "head" and pending:
|
||||
# insert meta into head (if necessary) and flush pending queue
|
||||
yield pending.pop(0)
|
||||
if not meta_found:
|
||||
yield {"type": "EmptyTag", "name": "meta",
|
||||
"data": {(None, "charset"): self.encoding}}
|
||||
while pending:
|
||||
yield pending.pop(0)
|
||||
meta_found = True
|
||||
state = "post_head"
|
||||
|
||||
if state == "in_head":
|
||||
pending.append(token)
|
||||
else:
|
||||
yield token
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user