Change venv
This commit is contained in:
@@ -25,45 +25,46 @@
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import ProbingState, MachineState
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .mbcssm import UTF8_SM_MODEL
|
||||
from typing import Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .enums import MachineState, ProbingState
|
||||
from .mbcssm import UTF8_SM_MODEL
|
||||
|
||||
|
||||
class UTF8Prober(CharSetProber):
|
||||
ONE_CHAR_PROB = 0.5
|
||||
|
||||
def __init__(self):
|
||||
super(UTF8Prober, self).__init__()
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
|
||||
self._num_mb_chars = None
|
||||
self._num_mb_chars = 0
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
super(UTF8Prober, self).reset()
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
self.coding_sm.reset()
|
||||
self._num_mb_chars = 0
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "utf-8"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return ""
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
for c in byte_str:
|
||||
coding_state = self.coding_sm.next_state(c)
|
||||
if coding_state == MachineState.ERROR:
|
||||
self._state = ProbingState.NOT_ME
|
||||
break
|
||||
elif coding_state == MachineState.ITS_ME:
|
||||
if coding_state == MachineState.ITS_ME:
|
||||
self._state = ProbingState.FOUND_IT
|
||||
break
|
||||
elif coding_state == MachineState.START:
|
||||
if coding_state == MachineState.START:
|
||||
if self.coding_sm.get_current_charlen() >= 2:
|
||||
self._num_mb_chars += 1
|
||||
|
||||
@@ -73,10 +74,9 @@ class UTF8Prober(CharSetProber):
|
||||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
unlike = 0.99
|
||||
if self._num_mb_chars < 6:
|
||||
unlike *= self.ONE_CHAR_PROB ** self._num_mb_chars
|
||||
unlike *= self.ONE_CHAR_PROB**self._num_mb_chars
|
||||
return 1.0 - unlike
|
||||
else:
|
||||
return unlike
|
||||
return unlike
|
||||
|
Reference in New Issue
Block a user