Change venv
This commit is contained in:
@@ -1,19 +1,17 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Metadata about languages used by our model training code for our
|
||||
SingleByteCharSetProbers. Could be used for other things in the future.
|
||||
|
||||
This code is based on the language metadata from the uchardet project.
|
||||
"""
|
||||
from __future__ import absolute_import, print_function
|
||||
|
||||
from string import ascii_letters
|
||||
from typing import List, Optional
|
||||
|
||||
# TODO: Add Ukrainian (KOI8-U)
|
||||
|
||||
|
||||
# TODO: Add Ukrainian (KOI8-U)
|
||||
|
||||
class Language(object):
|
||||
class Language:
|
||||
"""Metadata about a language useful for training models
|
||||
|
||||
:ivar name: The human name for the language, in English.
|
||||
@@ -33,9 +31,17 @@ class Language(object):
|
||||
Wikipedia for training data.
|
||||
:type wiki_start_pages: list of str
|
||||
"""
|
||||
def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
|
||||
alphabet=None, wiki_start_pages=None):
|
||||
super(Language, self).__init__()
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
name: Optional[str] = None,
|
||||
iso_code: Optional[str] = None,
|
||||
use_ascii: bool = True,
|
||||
charsets: Optional[List[str]] = None,
|
||||
alphabet: Optional[str] = None,
|
||||
wiki_start_pages: Optional[List[str]] = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.name = name
|
||||
self.iso_code = iso_code
|
||||
self.use_ascii = use_ascii
|
||||
@@ -46,265 +52,301 @@ class Language(object):
|
||||
else:
|
||||
alphabet = ascii_letters
|
||||
elif not alphabet:
|
||||
raise ValueError('Must supply alphabet if use_ascii is False')
|
||||
self.alphabet = ''.join(sorted(set(alphabet))) if alphabet else None
|
||||
raise ValueError("Must supply alphabet if use_ascii is False")
|
||||
self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None
|
||||
self.wiki_start_pages = wiki_start_pages
|
||||
|
||||
def __repr__(self):
|
||||
return '{}({})'.format(self.__class__.__name__,
|
||||
', '.join('{}={!r}'.format(k, v)
|
||||
for k, v in self.__dict__.items()
|
||||
if not k.startswith('_')))
|
||||
def __repr__(self) -> str:
|
||||
param_str = ", ".join(
|
||||
f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
|
||||
)
|
||||
return f"{self.__class__.__name__}({param_str})"
|
||||
|
||||
|
||||
LANGUAGES = {'Arabic': Language(name='Arabic',
|
||||
iso_code='ar',
|
||||
use_ascii=False,
|
||||
# We only support encodings that use isolated
|
||||
# forms, because the current recommendation is
|
||||
# that the rendering system handles presentation
|
||||
# forms. This means we purposefully skip IBM864.
|
||||
charsets=['ISO-8859-6', 'WINDOWS-1256',
|
||||
'CP720', 'CP864'],
|
||||
alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
|
||||
wiki_start_pages=[u'الصفحة_الرئيسية']),
|
||||
'Belarusian': Language(name='Belarusian',
|
||||
iso_code='be',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'IBM866', 'MacCyrillic'],
|
||||
alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
|
||||
u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
|
||||
wiki_start_pages=[u'Галоўная_старонка']),
|
||||
'Bulgarian': Language(name='Bulgarian',
|
||||
iso_code='bg',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'IBM855'],
|
||||
alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
|
||||
u'абвгдежзийклмнопрстуфхцчшщъьюя'),
|
||||
wiki_start_pages=[u'Начална_страница']),
|
||||
'Czech': Language(name='Czech',
|
||||
iso_code='cz',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
|
||||
wiki_start_pages=[u'Hlavní_strana']),
|
||||
'Danish': Language(name='Danish',
|
||||
iso_code='da',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'æøåÆØÅ',
|
||||
wiki_start_pages=[u'Forside']),
|
||||
'German': Language(name='German',
|
||||
iso_code='de',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
||||
alphabet=u'äöüßÄÖÜ',
|
||||
wiki_start_pages=[u'Wikipedia:Hauptseite']),
|
||||
'Greek': Language(name='Greek',
|
||||
iso_code='el',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-7', 'WINDOWS-1253'],
|
||||
alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
|
||||
u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
|
||||
wiki_start_pages=[u'Πύλη:Κύρια']),
|
||||
'English': Language(name='English',
|
||||
iso_code='en',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
||||
wiki_start_pages=[u'Main_Page']),
|
||||
'Esperanto': Language(name='Esperanto',
|
||||
iso_code='eo',
|
||||
# Q, W, X, and Y not used at all
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-3'],
|
||||
alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
|
||||
u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
|
||||
wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']),
|
||||
'Spanish': Language(name='Spanish',
|
||||
iso_code='es',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
|
||||
wiki_start_pages=[u'Wikipedia:Portada']),
|
||||
'Estonian': Language(name='Estonian',
|
||||
iso_code='et',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-4', 'ISO-8859-13',
|
||||
'WINDOWS-1257'],
|
||||
# C, F, Š, Q, W, X, Y, Z, Ž are only for
|
||||
# loanwords
|
||||
alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
|
||||
u'abdeghijklmnoprstuvõäöü'),
|
||||
wiki_start_pages=[u'Esileht']),
|
||||
'Finnish': Language(name='Finnish',
|
||||
iso_code='fi',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'ÅÄÖŠŽåäöšž',
|
||||
wiki_start_pages=[u'Wikipedia:Etusivu']),
|
||||
'French': Language(name='French',
|
||||
iso_code='fr',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
|
||||
wiki_start_pages=[u'Wikipédia:Accueil_principal',
|
||||
u'Bœuf (animal)']),
|
||||
'Hebrew': Language(name='Hebrew',
|
||||
iso_code='he',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-8', 'WINDOWS-1255'],
|
||||
alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
|
||||
wiki_start_pages=[u'עמוד_ראשי']),
|
||||
'Croatian': Language(name='Croatian',
|
||||
iso_code='hr',
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
|
||||
u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
|
||||
wiki_start_pages=[u'Glavna_stranica']),
|
||||
'Hungarian': Language(name='Hungarian',
|
||||
iso_code='hu',
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
|
||||
u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
|
||||
wiki_start_pages=[u'Kezdőlap']),
|
||||
'Italian': Language(name='Italian',
|
||||
iso_code='it',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
|
||||
wiki_start_pages=[u'Pagina_principale']),
|
||||
'Lithuanian': Language(name='Lithuanian',
|
||||
iso_code='lt',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-13', 'WINDOWS-1257',
|
||||
'ISO-8859-4'],
|
||||
# Q, W, and X not used at all
|
||||
alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
|
||||
u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
|
||||
wiki_start_pages=[u'Pagrindinis_puslapis']),
|
||||
'Latvian': Language(name='Latvian',
|
||||
iso_code='lv',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-13', 'WINDOWS-1257',
|
||||
'ISO-8859-4'],
|
||||
# Q, W, X, Y are only for loanwords
|
||||
alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
|
||||
u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
|
||||
wiki_start_pages=[u'Sākumlapa']),
|
||||
'Macedonian': Language(name='Macedonian',
|
||||
iso_code='mk',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'MacCyrillic', 'IBM855'],
|
||||
alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
|
||||
u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
|
||||
wiki_start_pages=[u'Главна_страница']),
|
||||
'Dutch': Language(name='Dutch',
|
||||
iso_code='nl',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
||||
wiki_start_pages=[u'Hoofdpagina']),
|
||||
'Polish': Language(name='Polish',
|
||||
iso_code='pl',
|
||||
# Q and X are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
|
||||
u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
|
||||
wiki_start_pages=[u'Wikipedia:Strona_główna']),
|
||||
'Portuguese': Language(name='Portuguese',
|
||||
iso_code='pt',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||
'WINDOWS-1252'],
|
||||
alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
|
||||
wiki_start_pages=[u'Wikipédia:Página_principal']),
|
||||
'Romanian': Language(name='Romanian',
|
||||
iso_code='ro',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=u'ăâîșțĂÂÎȘȚ',
|
||||
wiki_start_pages=[u'Pagina_principală']),
|
||||
'Russian': Language(name='Russian',
|
||||
iso_code='ru',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'KOI8-R', 'MacCyrillic', 'IBM866',
|
||||
'IBM855'],
|
||||
alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
|
||||
u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
|
||||
wiki_start_pages=[u'Заглавная_страница']),
|
||||
'Slovak': Language(name='Slovak',
|
||||
iso_code='sk',
|
||||
use_ascii=True,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
|
||||
wiki_start_pages=[u'Hlavná_stránka']),
|
||||
'Slovene': Language(name='Slovene',
|
||||
iso_code='sl',
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||
alphabet=(u'abcčdefghijklmnoprsštuvzž'
|
||||
u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
|
||||
wiki_start_pages=[u'Glavna_stran']),
|
||||
# Serbian can be written in both Latin and Cyrillic, but there's no
|
||||
# simple way to get the Latin alphabet pages from Wikipedia through
|
||||
# the API, so for now we just support Cyrillic.
|
||||
'Serbian': Language(name='Serbian',
|
||||
iso_code='sr',
|
||||
alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
|
||||
u'абвгдђежзијклљмнњопрстћуфхцчџш'),
|
||||
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||
'MacCyrillic', 'IBM855'],
|
||||
wiki_start_pages=[u'Главна_страна']),
|
||||
'Thai': Language(name='Thai',
|
||||
iso_code='th',
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
|
||||
alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
|
||||
wiki_start_pages=[u'หน้าหลัก']),
|
||||
'Turkish': Language(name='Turkish',
|
||||
iso_code='tr',
|
||||
# Q, W, and X are not used by Turkish
|
||||
use_ascii=False,
|
||||
charsets=['ISO-8859-3', 'ISO-8859-9',
|
||||
'WINDOWS-1254'],
|
||||
alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
|
||||
u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
|
||||
wiki_start_pages=[u'Ana_Sayfa']),
|
||||
'Vietnamese': Language(name='Vietnamese',
|
||||
iso_code='vi',
|
||||
use_ascii=False,
|
||||
# Windows-1258 is the only common 8-bit
|
||||
# Vietnamese encoding supported by Python.
|
||||
# From Wikipedia:
|
||||
# For systems that lack support for Unicode,
|
||||
# dozens of 8-bit Vietnamese code pages are
|
||||
# available.[1] The most common are VISCII
|
||||
# (TCVN 5712:1993), VPS, and Windows-1258.[3]
|
||||
# Where ASCII is required, such as when
|
||||
# ensuring readability in plain text e-mail,
|
||||
# Vietnamese letters are often encoded
|
||||
# according to Vietnamese Quoted-Readable
|
||||
# (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
|
||||
# though usage of either variable-width
|
||||
# scheme has declined dramatically following
|
||||
# the adoption of Unicode on the World Wide
|
||||
# Web.
|
||||
charsets=['WINDOWS-1258'],
|
||||
alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
|
||||
u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
|
||||
wiki_start_pages=[u'Chữ_Quốc_ngữ']),
|
||||
}
|
||||
LANGUAGES = {
|
||||
"Arabic": Language(
|
||||
name="Arabic",
|
||||
iso_code="ar",
|
||||
use_ascii=False,
|
||||
# We only support encodings that use isolated
|
||||
# forms, because the current recommendation is
|
||||
# that the rendering system handles presentation
|
||||
# forms. This means we purposefully skip IBM864.
|
||||
charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
|
||||
alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
|
||||
wiki_start_pages=["الصفحة_الرئيسية"],
|
||||
),
|
||||
"Belarusian": Language(
|
||||
name="Belarusian",
|
||||
iso_code="be",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
|
||||
alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
|
||||
wiki_start_pages=["Галоўная_старонка"],
|
||||
),
|
||||
"Bulgarian": Language(
|
||||
name="Bulgarian",
|
||||
iso_code="bg",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
|
||||
alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
|
||||
wiki_start_pages=["Начална_страница"],
|
||||
),
|
||||
"Czech": Language(
|
||||
name="Czech",
|
||||
iso_code="cz",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||
alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
|
||||
wiki_start_pages=["Hlavní_strana"],
|
||||
),
|
||||
"Danish": Language(
|
||||
name="Danish",
|
||||
iso_code="da",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="æøåÆØÅ",
|
||||
wiki_start_pages=["Forside"],
|
||||
),
|
||||
"German": Language(
|
||||
name="German",
|
||||
iso_code="de",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="äöüßẞÄÖÜ",
|
||||
wiki_start_pages=["Wikipedia:Hauptseite"],
|
||||
),
|
||||
"Greek": Language(
|
||||
name="Greek",
|
||||
iso_code="el",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-7", "WINDOWS-1253"],
|
||||
alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
|
||||
wiki_start_pages=["Πύλη:Κύρια"],
|
||||
),
|
||||
"English": Language(
|
||||
name="English",
|
||||
iso_code="en",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
|
||||
wiki_start_pages=["Main_Page"],
|
||||
),
|
||||
"Esperanto": Language(
|
||||
name="Esperanto",
|
||||
iso_code="eo",
|
||||
# Q, W, X, and Y not used at all
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-3"],
|
||||
alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
|
||||
wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
|
||||
),
|
||||
"Spanish": Language(
|
||||
name="Spanish",
|
||||
iso_code="es",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
|
||||
wiki_start_pages=["Wikipedia:Portada"],
|
||||
),
|
||||
"Estonian": Language(
|
||||
name="Estonian",
|
||||
iso_code="et",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
|
||||
# C, F, Š, Q, W, X, Y, Z, Ž are only for
|
||||
# loanwords
|
||||
alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
|
||||
wiki_start_pages=["Esileht"],
|
||||
),
|
||||
"Finnish": Language(
|
||||
name="Finnish",
|
||||
iso_code="fi",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="ÅÄÖŠŽåäöšž",
|
||||
wiki_start_pages=["Wikipedia:Etusivu"],
|
||||
),
|
||||
"French": Language(
|
||||
name="French",
|
||||
iso_code="fr",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
|
||||
wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
|
||||
),
|
||||
"Hebrew": Language(
|
||||
name="Hebrew",
|
||||
iso_code="he",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-8", "WINDOWS-1255"],
|
||||
alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
|
||||
wiki_start_pages=["עמוד_ראשי"],
|
||||
),
|
||||
"Croatian": Language(
|
||||
name="Croatian",
|
||||
iso_code="hr",
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||
alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
|
||||
wiki_start_pages=["Glavna_stranica"],
|
||||
),
|
||||
"Hungarian": Language(
|
||||
name="Hungarian",
|
||||
iso_code="hu",
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||
alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
|
||||
wiki_start_pages=["Kezdőlap"],
|
||||
),
|
||||
"Italian": Language(
|
||||
name="Italian",
|
||||
iso_code="it",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="ÀÈÉÌÒÓÙàèéìòóù",
|
||||
wiki_start_pages=["Pagina_principale"],
|
||||
),
|
||||
"Lithuanian": Language(
|
||||
name="Lithuanian",
|
||||
iso_code="lt",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
|
||||
# Q, W, and X not used at all
|
||||
alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
|
||||
wiki_start_pages=["Pagrindinis_puslapis"],
|
||||
),
|
||||
"Latvian": Language(
|
||||
name="Latvian",
|
||||
iso_code="lv",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
|
||||
# Q, W, X, Y are only for loanwords
|
||||
alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
|
||||
wiki_start_pages=["Sākumlapa"],
|
||||
),
|
||||
"Macedonian": Language(
|
||||
name="Macedonian",
|
||||
iso_code="mk",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
|
||||
alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
|
||||
wiki_start_pages=["Главна_страница"],
|
||||
),
|
||||
"Dutch": Language(
|
||||
name="Dutch",
|
||||
iso_code="nl",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
|
||||
wiki_start_pages=["Hoofdpagina"],
|
||||
),
|
||||
"Polish": Language(
|
||||
name="Polish",
|
||||
iso_code="pl",
|
||||
# Q and X are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||
alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
|
||||
wiki_start_pages=["Wikipedia:Strona_główna"],
|
||||
),
|
||||
"Portuguese": Language(
|
||||
name="Portuguese",
|
||||
iso_code="pt",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
|
||||
wiki_start_pages=["Wikipédia:Página_principal"],
|
||||
),
|
||||
"Romanian": Language(
|
||||
name="Romanian",
|
||||
iso_code="ro",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||
alphabet="ăâîșțĂÂÎȘȚ",
|
||||
wiki_start_pages=["Pagina_principală"],
|
||||
),
|
||||
"Russian": Language(
|
||||
name="Russian",
|
||||
iso_code="ru",
|
||||
use_ascii=False,
|
||||
charsets=[
|
||||
"ISO-8859-5",
|
||||
"WINDOWS-1251",
|
||||
"KOI8-R",
|
||||
"MacCyrillic",
|
||||
"IBM866",
|
||||
"IBM855",
|
||||
],
|
||||
alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
|
||||
wiki_start_pages=["Заглавная_страница"],
|
||||
),
|
||||
"Slovak": Language(
|
||||
name="Slovak",
|
||||
iso_code="sk",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||
alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
|
||||
wiki_start_pages=["Hlavná_stránka"],
|
||||
),
|
||||
"Slovene": Language(
|
||||
name="Slovene",
|
||||
iso_code="sl",
|
||||
# Q, W, X, Y are only used for foreign words.
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-2", "WINDOWS-1250"],
|
||||
alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
|
||||
wiki_start_pages=["Glavna_stran"],
|
||||
),
|
||||
# Serbian can be written in both Latin and Cyrillic, but there's no
|
||||
# simple way to get the Latin alphabet pages from Wikipedia through
|
||||
# the API, so for now we just support Cyrillic.
|
||||
"Serbian": Language(
|
||||
name="Serbian",
|
||||
iso_code="sr",
|
||||
alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
|
||||
charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
|
||||
wiki_start_pages=["Главна_страна"],
|
||||
),
|
||||
"Thai": Language(
|
||||
name="Thai",
|
||||
iso_code="th",
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-11", "TIS-620", "CP874"],
|
||||
alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
|
||||
wiki_start_pages=["หน้าหลัก"],
|
||||
),
|
||||
"Turkish": Language(
|
||||
name="Turkish",
|
||||
iso_code="tr",
|
||||
# Q, W, and X are not used by Turkish
|
||||
use_ascii=False,
|
||||
charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
|
||||
alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
|
||||
wiki_start_pages=["Ana_Sayfa"],
|
||||
),
|
||||
"Vietnamese": Language(
|
||||
name="Vietnamese",
|
||||
iso_code="vi",
|
||||
use_ascii=False,
|
||||
# Windows-1258 is the only common 8-bit
|
||||
# Vietnamese encoding supported by Python.
|
||||
# From Wikipedia:
|
||||
# For systems that lack support for Unicode,
|
||||
# dozens of 8-bit Vietnamese code pages are
|
||||
# available.[1] The most common are VISCII
|
||||
# (TCVN 5712:1993), VPS, and Windows-1258.[3]
|
||||
# Where ASCII is required, such as when
|
||||
# ensuring readability in plain text e-mail,
|
||||
# Vietnamese letters are often encoded
|
||||
# according to Vietnamese Quoted-Readable
|
||||
# (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
|
||||
# though usage of either variable-width
|
||||
# scheme has declined dramatically following
|
||||
# the adoption of Unicode on the World Wide
|
||||
# Web.
|
||||
charsets=["WINDOWS-1258"],
|
||||
alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
|
||||
wiki_start_pages=["Chữ_Quốc_ngữ"],
|
||||
),
|
||||
}
|
||||
|
Reference in New Issue
Block a user