Change venv

This commit is contained in:
Ambulance Clerc
2023-05-31 08:31:22 +02:00
parent fb6f579089
commit fdbb52c96f
466 changed files with 25899 additions and 64721 deletions

View File

@@ -25,8 +25,11 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import Optional, Union
from .charsetprober import CharSetProber
from .enums import ProbingState
from .sbcharsetprober import SingleByteCharSetProber
# This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers
@@ -125,18 +128,20 @@ from .enums import ProbingState
# model probers scores. The answer is returned in the form of the name of the
# charset identified, either "windows-1255" or "ISO-8859-8".
class HebrewProber(CharSetProber):
SPACE = 0x20
# windows-1255 / ISO-8859-8 code points of interest
FINAL_KAF = 0xea
NORMAL_KAF = 0xeb
FINAL_MEM = 0xed
NORMAL_MEM = 0xee
FINAL_NUN = 0xef
NORMAL_NUN = 0xf0
FINAL_PE = 0xf3
NORMAL_PE = 0xf4
FINAL_TSADI = 0xf5
NORMAL_TSADI = 0xf6
FINAL_KAF = 0xEA
NORMAL_KAF = 0xEB
FINAL_MEM = 0xED
NORMAL_MEM = 0xEE
FINAL_NUN = 0xEF
NORMAL_NUN = 0xF0
FINAL_PE = 0xF3
NORMAL_PE = 0xF4
FINAL_TSADI = 0xF5
NORMAL_TSADI = 0xF6
# Minimum Visual vs Logical final letter score difference.
# If the difference is below this, don't rely solely on the final letter score
@@ -151,35 +156,44 @@ class HebrewProber(CharSetProber):
VISUAL_HEBREW_NAME = "ISO-8859-8"
LOGICAL_HEBREW_NAME = "windows-1255"
def __init__(self) -> None:
    """Set up the final-letter scores, look-behind bytes and prober slots.

    The two model probers start out as ``None``; they are supplied later
    through ``set_model_probers``.
    """
    super().__init__()
    self._final_char_logical_score = 0
    self._final_char_visual_score = 0
    # Look-behind state is kept as int byte values so it can be compared
    # directly with the ints produced by iterating over a bytes buffer.
    self._prev = self.SPACE
    self._before_prev = self.SPACE
    self._logical_prober: Optional[SingleByteCharSetProber] = None
    self._visual_prober: Optional[SingleByteCharSetProber] = None
    self.reset()
def reset(self) -> None:
    """Clear the accumulated final-letter scores and the look-behind bytes."""
    self._final_char_logical_score = 0
    self._final_char_visual_score = 0
    # _prev and _before_prev hold the last two bytes seen in the previous
    # buffer. They are initialised to the space byte in order to simulate
    # a word delimiter at the beginning of the data.
    self._prev = self.SPACE
    self._before_prev = self.SPACE
# These probers are owned by the group prober.
def set_model_probers(
    self,
    logical_prober: SingleByteCharSetProber,
    visual_prober: SingleByteCharSetProber,
) -> None:
    """Attach the logical and visual Hebrew model probers.

    Their confidences and states are consulted by ``charset_name`` and
    ``state``; this prober only stores the references.
    """
    self._logical_prober = logical_prober
    self._visual_prober = visual_prober
def is_final(self, c: int) -> bool:
    """Return True when byte value *c* is one of the five final-form letters.

    The final forms checked are Kaf, Mem, Nun, Pe and Tsadi, using the
    code points defined as class constants.
    """
    return c in (
        self.FINAL_KAF,
        self.FINAL_MEM,
        self.FINAL_NUN,
        self.FINAL_PE,
        self.FINAL_TSADI,
    )
def is_non_final(self, c):
def is_non_final(self, c: int) -> bool:
# The normal Tsadi is not a good Non-Final letter due to words like
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
# apostrophe is converted to a space in FilterWithoutEnglishLetters
@@ -190,10 +204,9 @@ class HebrewProber(CharSetProber):
# for example legally end with a Non-Final Pe or Kaf. However, the
# benefit of these letters as Non-Final letters outweighs the damage
# since these words are quite rare.
return c in [self.NORMAL_KAF, self.NORMAL_MEM,
self.NORMAL_NUN, self.NORMAL_PE]
return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE]
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
# Final letter analysis for logical-visual decision.
# Look for evidence that the received buffer is either logical Hebrew
# or visual Hebrew.
@@ -227,9 +240,9 @@ class HebrewProber(CharSetProber):
byte_str = self.filter_high_byte_only(byte_str)
for cur in byte_str:
if cur == ' ':
if cur == self.SPACE:
# We stand on a space - a word just ended
if self._before_prev != ' ':
if self._before_prev != self.SPACE:
# next-to-last char was not a space so self._prev is not a
# 1 letter word
if self.is_final(self._prev):
@@ -241,8 +254,11 @@ class HebrewProber(CharSetProber):
self._final_char_visual_score += 1
else:
# Not standing on a space
if ((self._before_prev == ' ') and
(self.is_final(self._prev)) and (cur != ' ')):
if (
(self._before_prev == self.SPACE)
and (self.is_final(self._prev))
and (cur != self.SPACE)
):
# case (3) [-2:space][-1:final letter][cur:not space]
self._final_char_visual_score += 1
self._before_prev = self._prev
@@ -253,7 +269,10 @@ class HebrewProber(CharSetProber):
return ProbingState.DETECTING
@property
def charset_name(self):
def charset_name(self) -> str:
assert self._logical_prober is not None
assert self._visual_prober is not None
# Make the decision: is it Logical or Visual?
# If the final letter score distance is dominant enough, rely on it.
finalsub = self._final_char_logical_score - self._final_char_visual_score
@@ -263,8 +282,9 @@ class HebrewProber(CharSetProber):
return self.VISUAL_HEBREW_NAME
# It's not dominant enough, try to rely on the model scores instead.
modelsub = (self._logical_prober.get_confidence()
- self._visual_prober.get_confidence())
modelsub = (
self._logical_prober.get_confidence() - self._visual_prober.get_confidence()
)
if modelsub > self.MIN_MODEL_DISTANCE:
return self.LOGICAL_HEBREW_NAME
if modelsub < -self.MIN_MODEL_DISTANCE:
@@ -280,13 +300,17 @@ class HebrewProber(CharSetProber):
return self.LOGICAL_HEBREW_NAME
@property
def language(self) -> str:
    """Name of the language this prober is associated with."""
    return "Hebrew"
@property
def state(self) -> ProbingState:
    """Report NOT_ME only once both model probers have reported NOT_ME.

    Both model probers must have been attached beforehand; the assertions
    below fail otherwise.
    """
    assert self._logical_prober is not None
    assert self._visual_prober is not None
    # Remain active as long as any of the model probers are active.
    if (self._logical_prober.state == ProbingState.NOT_ME) and (
        self._visual_prober.state == ProbingState.NOT_ME
    ):
        return ProbingState.NOT_ME
    return ProbingState.DETECTING