Language tools/checkutftable
Appearance
Note that this program is licensed under the GNU GPL, not under the GNU FDL. If you contribute to this program, please add your name to the copyright notice.
I made this module to check whether a text on the Serbian Wikipedia is written in Cyrillic or in Latin script. However, I realized that it can be useful for other languages, too. Characters that do not belong strictly to any script are described as 'not important'. If a character is not recognized by the unicodedata module, it is described as 'not known'.
#!/usr/bin/python
#
# Copyright (C) 2006 Milos Rancic
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# Text of GNU GPL license can be found at http://www.gnu.org/licenses/gnu.html

import io
import sys
import unicodedata

# Bookkeeping counters that are not script groups.
_SPECIAL_KEYS = (
    'all',   # every character seen
    'ni',    # not important: named, but not in one of the groups below
    'nnnn',  # not known: unicodedata has no name for the character
)

# Script groups.  Each key is the first four characters of the Unicode
# character name (trailing spaces are significant, e.g. 'CJK ' and 'LAO ').
_SCRIPT_KEYS = (
    'ARAB',  # ARABIC
    'ARME',  # ARMENIAN
    'BENG',  # BENGALI
    'BOPO',  # BOPOMOFO
    'BRAI',  # BRAILLE PATTERN
    'BUHI',  # BUHID
    'CANA',  # CANADIAN
    'CHER',  # CHEROKEE
    'CJK ',  # CJK
    'COMB',  # COMBINING
    'COPT',  # COPTIC
    'CYRI',  # CYRILLIC
    'DESE',  # DESERET
    'DEVA',  # DEVANAGARI
    'ETHI',  # ETHIOPIC
    'EXTE',  # EXTENDED ARABIC
    'GEOR',  # GEORGIAN
    'GOTH',  # GOTHIC
    'GREE',  # GREEK
    'GUJA',  # GUJARATI
    'GURM',  # GURMUKHI
    'HANG',  # HANGUL
    'HANU',  # HANUNOO
    'HEBR',  # HEBREW
    'HIRA',  # HIRAGANA
    'IDEO',  # IDEOGRAPHIC
    'KANG',  # KANGXI RADICAL
    'KANN',  # KANNADA
    'KATA',  # KATAKANA
    'KHME',  # KHMER
    'LAO ',  # LAO
    'LATI',  # LATIN
    'MALA',  # MALAYALAM
    'MODI',  # MODIFIER LETTER
    'MONG',  # MONGOLIAN
    'MYAN',  # MYANMAR
    'OGHA',  # OGHAM
    'OLD ',  # OLD ITALIC
    'ORIY',  # ORIYA
    'PHIL',  # PHILIPPINE
    'RUNI',  # RUNIC
    'SINH',  # SINHALA
    'SYRI',  # SYRIAC
    'TAGA',  # TAGALOG
    'TAGB',  # TAGBANWA
    'TAMI',  # TAMIL
    'TELU',  # TELUGU
    'THAA',  # THAANA
    'THAI',  # THAI
    'TIBE',  # TIBETAN
    'YI R',  # YI RADICAL
    'YI S',  # YI SYLLABLE
)


# engine
def check(f):
    """Count the characters of a UTF-8 text file per script group.

    f is the path of the file to read.  The file is decoded as UTF-8, so a
    UnicodeDecodeError is raised for invalid input and an IOError/OSError
    if the file cannot be opened.

    Returns a dict mapping every key of _SPECIAL_KEYS and _SCRIPT_KEYS to a
    count.  'all' is the total number of characters; each character is also
    counted under exactly one other key.
    """
    # io.open behaves the same on Python 2 and 3 and, used as a context
    # manager, closes the handle even if decoding fails.
    with io.open(f, encoding='utf-8') as handle:
        text = handle.read()

    numchar = {}
    for key in _SPECIAL_KEYS + _SCRIPT_KEYS:
        numchar[key] = 0

    for symbol in text:
        # Characters without a Unicode name fall back to the 'nnnn' key.
        name = unicodedata.name(symbol, 'nnnn')
        if len(name) < 4:
            name += ' '
        char = name[0:4]
        if char not in numchar:
            char = 'ni'
        numchar[char] += 1
        numchar['all'] += 1
    return numchar


# the list of existing character groups
def describe(d):
    """Return the share of every group that occurs at least once.

    d is a counter dict as produced by check().  The result maps each key
    with a positive count (except 'all') to the list [key, share], where
    share is that count divided by d['all'].
    """
    out = {}
    for n in d:
        if d[n] > 0 and n != 'all':
            out[n] = [n, float(d[n]) / float(d['all'])]
    return out


# what group of characters has absolute majority, what relative
def decide(d):
    """Find the dominant group of a counter dict as produced by check().

    Returns ['a', key, share] when one group holds more than half of all
    characters (absolute majority), otherwise ['r', key, share] for the
    group with the largest share (relative majority).  If no group has a
    positive count, e.g. for an empty text, ['r', '', 0] is returned.
    """
    izbor = ''   # group with an absolute majority, if there is one
    czbor = ''   # group with the largest share so far
    perc = {}
    ctrl = 0     # largest share so far
    for n in d:
        if d[n] > 0 and n != 'all':
            perc[n] = float(d[n]) / float(d['all'])
            if perc[n] > ctrl:
                ctrl = perc[n]
                czbor = n
            if perc[n] > 0.5:
                izbor = n
    if izbor == '':
        return ['r', czbor, ctrl]
    return ['a', izbor, perc[izbor]]


# for Serbian (Wikipedia): what is the script of the text
def analyzecyr(d):
    """Decide between Cyrillic and Latin script, ignoring everything else.

    d is a counter dict as produced by check(); only its 'CYRI' and 'LATI'
    entries are read (a missing entry counts as 0) and d is not modified.

    Returns [kind, key, share], where share is relative to the sum of the
    Cyrillic and Latin counts and kind is 'a' (absolute majority), 'r'
    (relative) or 'n' (no decision).  ['n', 'n', 0] is returned when the
    text contains neither Cyrillic nor Latin characters.
    """
    cyrillic = d.get('CYRI', 0)
    latin = d.get('LATI', 0)
    cyrlat = cyrillic + latin
    if cyrlat == 0:
        # Nothing to compare; also avoids dividing by zero.
        return ['n', 'n', 0]

    perc = {
        'CYRI': float(cyrillic) / float(cyrlat),
        'LATI': float(latin) / float(cyrlat),
    }
    # The Cyrillic tests come first, so a Cyrillic share above 0.3 wins
    # even when Latin has the majority.
    # NOTE(review): because the two shares sum to 1, the last two Latin/'n'
    # branches look unreachable -- confirm the thresholds are as intended.
    if perc['CYRI'] > 0.5:
        ra = 'a'
        izbor = 'CYRI'
    elif perc['CYRI'] > 0.3:
        ra = 'r'
        izbor = 'CYRI'
    elif perc['LATI'] > 0.5:
        ra = 'a'
        izbor = 'LATI'
    elif perc['LATI'] > 0.3:
        ra = 'r'
        izbor = 'LATI'
    else:
        ra = 'n'
        izbor = 'n'
        perc['n'] = 0
    return [ra, izbor, perc[izbor]]


def _main(argv):
    """Analyze the file named by argv[1] and print the three reports."""
    if len(argv) < 2:
        # No file given: do nothing.
        return
    dictionary = check(argv[1])
    # Single-argument print(...) works on both Python 2 and Python 3.
    print(describe(dictionary))
    print(decide(dictionary))
    print(analyzecyr(dictionary))


# if the program is started from the command line
if __name__ == '__main__':
    _main(sys.argv)