This commit is contained in:
Ambulance Clerc
2023-06-01 08:59:37 +02:00
parent 1fe8228d1b
commit 796746d175
346 changed files with 18799 additions and 44645 deletions

View File

@@ -28,54 +28,62 @@
import logging
import re
from typing import Optional, Union
from .enums import ProbingState
from .enums import LanguageFilter, ProbingState
INTERNATIONAL_WORDS_PATTERN = re.compile(
b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
)
class CharSetProber(object):
class CharSetProber:
SHORTCUT_THRESHOLD = 0.95
def __init__(self, lang_filter=None):
self._state = None
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
self._state = ProbingState.DETECTING
self.active = True
self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__)
def reset(self):
def reset(self) -> None:
self._state = ProbingState.DETECTING
@property
def charset_name(self):
def charset_name(self) -> Optional[str]:
return None
def feed(self, buf):
pass
@property
def language(self) -> Optional[str]:
raise NotImplementedError
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
raise NotImplementedError
@property
def state(self):
def state(self) -> ProbingState:
return self._state
def get_confidence(self):
def get_confidence(self) -> float:
return 0.0
@staticmethod
def filter_high_byte_only(buf):
buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
buf = re.sub(b"([\x00-\x7F])+", b" ", buf)
return buf
@staticmethod
def filter_international_words(buf):
def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
"""
We define three types of bytes:
alphabet: english alphabets [a-zA-Z]
international: international characters [\x80-\xFF]
marker: everything else [^a-zA-Z\x80-\xFF]
The input buffer can be thought to contain a series of words delimited
by markers. This function works to filter all words that contain at
least one international character. All contiguous sequences of markers
are replaced by a single space ascii character.
This filter applies to all scripts which do not use English characters.
"""
filtered = bytearray()
@@ -83,8 +91,7 @@ class CharSetProber(object):
# This regex expression filters out only words that have at-least one
# international character. The word may include one marker character at
# the end.
words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
buf)
words = INTERNATIONAL_WORDS_PATTERN.findall(buf)
for word in words:
filtered.extend(word[:-1])
@@ -94,20 +101,17 @@ class CharSetProber(object):
# similarly across all languages and may thus have similar
# frequencies).
last_char = word[-1:]
if not last_char.isalpha() and last_char < b'\x80':
last_char = b' '
if not last_char.isalpha() and last_char < b"\x80":
last_char = b" "
filtered.extend(last_char)
return filtered
@staticmethod
def filter_with_english_letters(buf):
def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes:
"""
Returns a copy of ``buf`` that retains only the sequences of English
alphabet and high byte characters that are not between <> characters.
Also retains English alphabet and high byte characters immediately
before occurrences of >.
This filter can be applied to all scripts which contain both English
characters and extended ASCII characters, but is currently only used by
``Latin1Prober``.
@@ -115,26 +119,24 @@ class CharSetProber(object):
filtered = bytearray()
in_tag = False
prev = 0
buf = memoryview(buf).cast("c")
for curr in range(len(buf)):
# Slice here to get bytes instead of an int with Python 3
buf_char = buf[curr:curr + 1]
# Check if we're coming out of or entering an HTML tag
if buf_char == b'>':
for curr, buf_char in enumerate(buf):
# Check if we're coming out of or entering an XML tag
# https://github.com/python/typeshed/issues/8182
if buf_char == b">": # type: ignore[comparison-overlap]
prev = curr + 1
in_tag = False
elif buf_char == b'<':
in_tag = True
# If current character is not extended-ASCII and not alphabetic...
if buf_char < b'\x80' and not buf_char.isalpha():
# ...and we're not in a tag
# https://github.com/python/typeshed/issues/8182
elif buf_char == b"<": # type: ignore[comparison-overlap]
if curr > prev and not in_tag:
# Keep everything after last non-extended-ASCII,
# non-alphabetic character
filtered.extend(buf[prev:curr])
# Output a space to delimit stretch we kept
filtered.extend(b' ')
prev = curr + 1
filtered.extend(b" ")
in_tag = True
# If we're not in a tag...
if not in_tag: