Jump to content

Language tools/checkutftable

From Meta, a Wikimedia project coordination wiki

Note that this program is licensed under the GNU GPL, not the GNU FDL. If you contribute to this program, please add your name to the copyright notice.

I made this module to check whether a text on the Serbian Wikipedia is written in Cyrillic or in Latin script. However, I realized that it can be useful for other languages, too. Characters which do not belong strictly to any script are described as 'not important'. If a character is not recognized by the unicodedata module, it is described as 'not known'.

#!/usr/bin/python
#
# Copyright (C) 2006 Milos Rancic
# 
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
# 
# Text of GNU GPL license can be found at http://www.gnu.org/licenses/gnu.html

import sys
import unicodedata

# engine
def check(f):
	"""Count the characters of a UTF-8 encoded file per script group.

	Every character is classified by the first four characters of its
	Unicode name (as reported by ``unicodedata.name``), e.g. 'LATI' for
	"LATIN SMALL LETTER A" or 'CYRI' for "CYRILLIC CAPITAL LETTER SHA".

	f -- path of the file to analyse.

	Returns a dictionary of counters.  Besides one entry for each known
	script prefix (present even when its count is 0) it contains:
	  'all'  -- total number of characters in the file,
	  'ni'   -- "not important": characters whose name prefix is not one
	            of the known script prefixes (spaces, digits, ...),
	  'nnnn' -- "not known": characters for which unicodedata has no name
	            (this includes control characters such as newlines).

	Raises IOError/OSError if the file cannot be opened and
	UnicodeDecodeError if its content is not valid UTF-8.
	"""
	# Four-character name prefixes that identify a script group.
	scripts = (
		'ARAB',  # ARABIC
		'ARME',  # ARMENIAN
		'BENG',  # BENGALI
		'BOPO',  # BOPOMOFO
		'BRAI',  # BRAILLE PATTERN
		'BUHI',  # BUHID
		'CANA',  # CANADIAN
		'CHER',  # CHEROKEE
		'CJK ',  # CJK
		'COMB',  # COMBINING
		'COPT',  # COPTIC
		'CYRI',  # CYRILLIC
		'DESE',  # DESERET
		'DEVA',  # DEVANAGARI
		'ETHI',  # ETHIOPIC
		'EXTE',  # EXTENDED ARABIC
		'GEOR',  # GEORGIAN
		'GOTH',  # GOTHIC
		'GREE',  # GREEK
		'GUJA',  # GUJARATI
		'GURM',  # GURMUKHI
		'HANG',  # HANGUL
		'HANU',  # HANUNOO
		'HEBR',  # HEBREW
		'HIRA',  # HIRAGANA
		'IDEO',  # IDEOGRAPHIC
		'KANG',  # KANGXI RADICAL
		'KANN',  # KANNADA
		'KATA',  # KATAKANA
		'KHME',  # KHMER
		'LAO ',  # LAO
		'LATI',  # LATIN
		'MALA',  # MALAYALAM
		'MODI',  # MODIFIER LETTER
		'MONG',  # MONGOLIAN
		'MYAN',  # MYANMAR
		'OGHA',  # OGHAM
		'OLD ',  # OLD ITALIC
		'ORIY',  # ORIYA
		'PHIL',  # PHILIPPINE
		'RUNI',  # RUNIC
		'SINH',  # SINHALA
		'SYRI',  # SYRIAC
		'TAGA',  # TAGALOG
		'TAGB',  # TAGBANWA
		'TAMI',  # TAMIL
		'TELU',  # TELUGU
		'THAA',  # THAANA
		'THAI',  # THAI
		'TIBE',  # TIBETAN
		'YI R',  # YI RADICAL
		'YI S',  # YI SYLLABLE
	)

	# Read the raw bytes and decode them explicitly; the with-block closes
	# the file handle even if reading or decoding fails.
	with open(f, 'rb') as source:
		text = source.read().decode('utf-8')

	numchar = dict.fromkeys(('all', 'ni', 'nnnn') + scripts, 0)

	for symbol in text:
		# Characters without a Unicode name fall back to 'nnnn'; names
		# shorter than four characters are padded so the key is 4 wide.
		group = unicodedata.name(symbol, 'nnnn')[:4].ljust(4)
		if group not in numchar:
			group = 'ni'
		numchar[group] += 1
		numchar['all'] += 1
	return numchar

# the list of existing character groups
def describe(d):
	"""List the character groups that actually occur, with their share.

	d -- dictionary of counters as produced by check(); d['all'] holds
	     the total number of characters.

	Returns a dictionary that maps each group with a positive count
	(the 'all' entry excluded) to the list [group, share], where share
	is the group's count divided by d['all'].
	"""
	present = {}
	for group, count in d.items():
		# The total itself and groups that never occurred are not listed.
		if group == 'all' or not (count > 0):
			continue
		present[group] = [group, float(count) / float(d['all'])]
	return present

# what group of characters has absolute majority, what relative
def decide(d):
	"""Find the dominant character group of a text.

	d -- dictionary of counters as produced by check(); d['all'] holds
	     the total number of characters.

	Returns a list [kind, group, share]:
	  kind  -- 'a' if the group holds an absolute majority (more than
	           half of all characters), 'r' if it only holds a relative
	           majority (the largest share),
	  group -- key of the winning group,
	  share -- the group's count divided by d['all'].

	Every entry except 'all' competes, including 'ni' and 'nnnn'.  When
	several groups share the largest value, the first one met while
	iterating over d wins.  If no group has a positive count (for
	example for an empty file) the result is ['r', '', 0].
	"""
	majority = ''
	# Initialised up front so that input without any positive group does
	# not leave the relative winner undefined.
	leader = ''
	leader_share = 0
	shares = {}
	for group, count in d.items():
		if group == 'all' or not (count > 0):
			continue
		shares[group] = float(count) / float(d['all'])
		if shares[group] > leader_share:
			leader_share = shares[group]
			leader = group
		if shares[group] > 0.5:
			majority = group
	if majority == '':
		return ['r', leader, leader_share]
	return ['a', majority, shares[majority]]

# for Serbian (Wikipedia): what is the script of the text
def analyzecyr(d):
	"""Decide whether a text is written in Cyrillic or in Latin script.

	Only the 'CYRI' and 'LATI' counters are considered; the shares are
	computed relative to their sum, not to the total character count.

	d -- dictionary of counters as produced by check().  It is not
	     modified.

	Returns a list [kind, script, share]:
	  ['a', 'CYRI', share] -- Cyrillic share is above 0.5,
	  ['r', 'CYRI', share] -- Cyrillic share is above 0.3 (this test is
	                          made before Latin is looked at, so it wins
	                          even when Latin has the larger share),
	  ['a', 'LATI', share] -- otherwise, Latin share is above 0.5,
	  ['n', 'n', 0]        -- neither script could be determined, which
	                          covers a text without any Cyrillic or
	                          Latin character and a dictionary that
	                          lacks those counters.
	"""
	cyrillic = d.get('CYRI', 0)
	latin = d.get('LATI', 0)
	letters = cyrillic + latin
	# Nothing to compare: avoid dividing by zero and report "neither".
	if not (letters > 0):
		return ['n', 'n', 0]

	perc = {
		'CYRI': float(cyrillic) / float(letters),
		'LATI': float(latin) / float(letters),
	}
	if perc['CYRI'] > 0.5:
		ra, izbor = 'a', 'CYRI'
	elif perc['CYRI'] > 0.3:
		ra, izbor = 'r', 'CYRI'
	elif perc['LATI'] > 0.5:
		ra, izbor = 'a', 'LATI'
	# With non-negative counts the two shares add up to 1, so the two
	# branches below cannot be reached; they only guard against
	# inconsistent input.
	elif perc['LATI'] > 0.3:
		ra, izbor = 'r', 'LATI'
	else:
		return ['n', 'n', 0]
	return [ra, izbor, perc[izbor]]

# if the program is started from the command line
if __name__ == '__main__':
	# Run the analysis only when a file name was given on the command
	# line; without an argument the script silently does nothing.  The
	# argument is tested explicitly instead of catching IndexError around
	# the whole run, so an IndexError raised inside the analysis is no
	# longer hidden.
	if len(sys.argv) > 1:
		dictionary = check(sys.argv[1])
		# Single-argument print calls behave the same on Python 2 and 3.
		print(describe(dictionary))
		print(decide(dictionary))
		print(analyzecyr(dictionary))