Deploy site
This commit is contained in:
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,352 @@
|
||||
"""
|
||||
Metadata about languages used by our model training code for our
|
||||
SingleByteCharSetProbers. Could be used for other things in the future.
|
||||
|
||||
This code is based on the language metadata from the uchardet project.
|
||||
"""
|
||||
|
||||
from string import ascii_letters
|
||||
from typing import List, Optional
|
||||
|
||||
# TODO: Add Ukrainian (KOI8-U)
|
||||
|
||||
|
||||
class Language:
    """Metadata about a language useful for training models.

    :ivar name: The human name for the language, in English.
    :type name: str
    :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
                    or use another catalog as a last resort.
    :type iso_code: str
    :ivar use_ascii: Whether or not ASCII letters should be included in trained
                     models.
    :type use_ascii: bool
    :ivar charsets: The charsets we want to support and create data for.
    :type charsets: list of str
    :ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
                    `True`, you only need to add those not in the ASCII set.
                    Stored as a string of unique characters sorted by code
                    point.
    :type alphabet: str
    :ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
                            Wikipedia for training data.
    :type wiki_start_pages: list of str
    """

    def __init__(
        self,
        name: Optional[str] = None,
        iso_code: Optional[str] = None,
        use_ascii: bool = True,
        charsets: Optional[List[str]] = None,
        alphabet: Optional[str] = None,
        wiki_start_pages: Optional[List[str]] = None,
    ) -> None:
        """Store the metadata and normalise the alphabet.

        :raises ValueError: if `use_ascii` is false and no (or an empty)
                            `alphabet` is supplied.
        """
        super().__init__()
        self.name = name
        self.iso_code = iso_code
        self.use_ascii = use_ascii
        self.charsets = charsets
        if self.use_ascii:
            # ASCII letters are always part of the alphabet in this mode, so
            # callers only have to list the non-ASCII characters (if any).
            alphabet = (alphabet or "") + ascii_letters
        elif not alphabet:
            raise ValueError("Must supply alphabet if use_ascii is False")
        # At this point the alphabet is guaranteed to be non-empty; collapse
        # duplicates and keep a deterministic (code point) ordering.
        self.alphabet = "".join(sorted(set(alphabet)))
        self.wiki_start_pages = wiki_start_pages

    def __repr__(self) -> str:
        """Return ``ClassName(attr=value, ...)`` built from public attributes."""
        param_str = ", ".join(
            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
        )
        return f"{self.__class__.__name__}({param_str})"
|
||||
|
||||
|
||||
# Registry of the languages we have metadata for, keyed by the English
# language name (the same value that is passed as ``name``).
LANGUAGES = {
    "Arabic": Language(
        name="Arabic",
        iso_code="ar",
        use_ascii=False,
        # We only support encodings that use isolated
        # forms, because the current recommendation is
        # that the rendering system handles presentation
        # forms. This means we purposefully skip IBM864.
        # NOTE(review): "CP864" is still listed below, and that appears to be
        # the same code page as IBM864 -- confirm which one is intended.
        charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
        alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
        wiki_start_pages=["الصفحة_الرئيسية"],
    ),
    "Belarusian": Language(
        name="Belarusian",
        iso_code="be",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
        alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
        wiki_start_pages=["Галоўная_старонка"],
    ),
    "Bulgarian": Language(
        name="Bulgarian",
        iso_code="bg",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
        alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
        wiki_start_pages=["Начална_страница"],
    ),
    "Czech": Language(
        name="Czech",
        # ISO 639-1 assigns "cs" to Czech; "cz" is not a language code.
        iso_code="cs",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
        wiki_start_pages=["Hlavní_strana"],
    ),
    "Danish": Language(
        name="Danish",
        iso_code="da",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="æøåÆØÅ",
        wiki_start_pages=["Forside"],
    ),
    "German": Language(
        name="German",
        iso_code="de",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="äöüßẞÄÖÜ",
        wiki_start_pages=["Wikipedia:Hauptseite"],
    ),
    "Greek": Language(
        name="Greek",
        iso_code="el",
        use_ascii=False,
        charsets=["ISO-8859-7", "WINDOWS-1253"],
        alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
        wiki_start_pages=["Πύλη:Κύρια"],
    ),
    "English": Language(
        name="English",
        iso_code="en",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
        wiki_start_pages=["Main_Page"],
    ),
    "Esperanto": Language(
        name="Esperanto",
        iso_code="eo",
        # Q, W, X, and Y not used at all
        use_ascii=False,
        charsets=["ISO-8859-3"],
        alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
        wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
    ),
    "Spanish": Language(
        name="Spanish",
        iso_code="es",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
        wiki_start_pages=["Wikipedia:Portada"],
    ),
    "Estonian": Language(
        name="Estonian",
        iso_code="et",
        use_ascii=False,
        charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
        # C, F, Š, Q, W, X, Y, Z, Ž are only for
        # loanwords
        alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
        wiki_start_pages=["Esileht"],
    ),
    "Finnish": Language(
        name="Finnish",
        iso_code="fi",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÅÄÖŠŽåäöšž",
        wiki_start_pages=["Wikipedia:Etusivu"],
    ),
    "French": Language(
        name="French",
        iso_code="fr",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
        wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
    ),
    "Hebrew": Language(
        name="Hebrew",
        iso_code="he",
        use_ascii=False,
        charsets=["ISO-8859-8", "WINDOWS-1255"],
        alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
        wiki_start_pages=["עמוד_ראשי"],
    ),
    "Croatian": Language(
        name="Croatian",
        iso_code="hr",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stranica"],
    ),
    "Hungarian": Language(
        name="Hungarian",
        iso_code="hu",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
        wiki_start_pages=["Kezdőlap"],
    ),
    "Italian": Language(
        name="Italian",
        iso_code="it",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÀÈÉÌÒÓÙàèéìòóù",
        wiki_start_pages=["Pagina_principale"],
    ),
    "Lithuanian": Language(
        name="Lithuanian",
        iso_code="lt",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, and X not used at all
        alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
        wiki_start_pages=["Pagrindinis_puslapis"],
    ),
    "Latvian": Language(
        name="Latvian",
        iso_code="lv",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, X, Y are only for loanwords
        alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
        wiki_start_pages=["Sākumlapa"],
    ),
    "Macedonian": Language(
        name="Macedonian",
        iso_code="mk",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
        wiki_start_pages=["Главна_страница"],
    ),
    "Dutch": Language(
        name="Dutch",
        iso_code="nl",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
        wiki_start_pages=["Hoofdpagina"],
    ),
    "Polish": Language(
        name="Polish",
        iso_code="pl",
        # Q and X are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
        wiki_start_pages=["Wikipedia:Strona_główna"],
    ),
    "Portuguese": Language(
        name="Portuguese",
        iso_code="pt",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
        wiki_start_pages=["Wikipédia:Página_principal"],
    ),
    "Romanian": Language(
        name="Romanian",
        iso_code="ro",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="ăâîșțĂÂÎȘȚ",
        wiki_start_pages=["Pagina_principală"],
    ),
    "Russian": Language(
        name="Russian",
        iso_code="ru",
        use_ascii=False,
        charsets=[
            "ISO-8859-5",
            "WINDOWS-1251",
            "KOI8-R",
            "MacCyrillic",
            "IBM866",
            "IBM855",
        ],
        alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
        wiki_start_pages=["Заглавная_страница"],
    ),
    "Slovak": Language(
        name="Slovak",
        iso_code="sk",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
        wiki_start_pages=["Hlavná_stránka"],
    ),
    "Slovene": Language(
        name="Slovene",
        iso_code="sl",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stran"],
    ),
    # Serbian can be written in both Latin and Cyrillic, but there's no
    # simple way to get the Latin alphabet pages from Wikipedia through
    # the API, so for now we just support Cyrillic.
    # NOTE(review): use_ascii is not passed here, so it takes the default
    # (True) and ASCII letters are added to this alphabet, unlike the other
    # Cyrillic entries in this table -- confirm whether that is intended.
    "Serbian": Language(
        name="Serbian",
        iso_code="sr",
        alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        wiki_start_pages=["Главна_страна"],
    ),
    "Thai": Language(
        name="Thai",
        iso_code="th",
        use_ascii=False,
        charsets=["ISO-8859-11", "TIS-620", "CP874"],
        alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
        wiki_start_pages=["หน้าหลัก"],
    ),
    "Turkish": Language(
        name="Turkish",
        iso_code="tr",
        # Q, W, and X are not used by Turkish
        use_ascii=False,
        charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
        alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
        wiki_start_pages=["Ana_Sayfa"],
    ),
    "Vietnamese": Language(
        name="Vietnamese",
        iso_code="vi",
        use_ascii=False,
        # Windows-1258 is the only common 8-bit
        # Vietnamese encoding supported by Python.
        # From Wikipedia:
        # For systems that lack support for Unicode,
        # dozens of 8-bit Vietnamese code pages are
        # available.[1] The most common are VISCII
        # (TCVN 5712:1993), VPS, and Windows-1258.[3]
        # Where ASCII is required, such as when
        # ensuring readability in plain text e-mail,
        # Vietnamese letters are often encoded
        # according to Vietnamese Quoted-Readable
        # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
        # though usage of either variable-width
        # scheme has declined dramatically following
        # the adoption of Unicode on the World Wide
        # Web.
        charsets=["WINDOWS-1258"],
        alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
        wiki_start_pages=["Chữ_Quốc_ngữ"],
    ),
}
|
Reference in New Issue
Block a user