Jump to content

Module:Sandbox/AbstractWikipedia/Phonotactics/en

From Meta, a Wikimedia project coordination wiki
Module documentation

This is the English-phonotactics module of the Abstract Wikipedia Scribunto-based NLG prototype. For other phonotactics modules see the main page.

The English implementation scans the lexemes for the indefinite article a (identified by its lemma and part-of-speech). If such a lexeme is found, the following lexeme (ignoring spacing and empty forms) is inspected, and if it starts with a vowel, the article's form is replaced by the form an. Note that in the current implementation, a simple list of regular expressions is used to determine whether a form starts with a vowel. Ideally, this information should be stored and fetched from Wikidata for each lexeme.


-- Table of functions exported by this module.
local p = {}

-- Heuristic test for whether a form begins with a vowel sound.
-- Lexemes should really carry phonetic annotations; until they do, the
-- lower-cased text is checked against a short list of Lua patterns anchored
-- at the start of the string:
--   "[aio]"   an initial a, i or o
--   "e[^u]"   an initial e followed by any character other than u
--             (a bare "e" therefore does not match)
--   "un[^i]"  an initial "un" followed by any character other than i
-- @param text  string to inspect
-- @return true if one of the patterns matches, false otherwise
local function startsWithVowel ( text ) 
	local vowelOnsets = { "[aio]", "e[^u]", "un[^i]" } -- could be more exact
	local lowered = mw.getLanguage("en"):lc(text)
	for i = 1, #vowelOnsets do
		if string.match(lowered, "^"..vowelOnsets[i]) ~= nil then
			return true
		end
	end
	return false
end

-- Returns the first lexeme after position `index` that is neither a spacing
-- lexeme nor one whose string form is empty.
-- @param lexemes  sequence of lexeme objects; each one is read for its `pos`
--                 field and converted to text with tostring()
-- @param index    position after which the search starts
-- @return the first matching lexeme, or nil if there is none
local function followingLexeme ( lexemes, index )
	for i=index+1, #lexemes do
		local candidate = lexemes[i]
		-- tostring() has to wrap the lexeme alone: wrapping the whole
		-- comparison produces the string "true" or "false", which is always
		-- truthy, so empty forms would never be skipped.
		if candidate.pos ~= 'spacing' and tostring(candidate) ~= '' then
			return candidate
		end
	end
	return nil
end

-- Applies the English phonotactic rule (a -> an) to a list of lexemes.
-- Every lexeme whose `pos` is 'article' and whose `lemma` is 'a' is examined:
-- if the lexeme returned by followingLexeme() for that position starts with a
-- vowel according to startsWithVowel(), the article's replaceByForm('an') is
-- called, a message is written with mw.log, and the article's own log() is
-- called. The lexemes are modified through their own methods; nothing is
-- returned.
-- @param lexemes  sequence of lexeme objects exposing `pos`, `lemma`,
--                 `replaceByForm` and `log`
function p.applyPhonotactics ( lexemes ) 
	for index, lexeme in ipairs(lexemes) do
		-- There is a single phonotactic rule in English: a->an
		if (lexeme.pos == 'article' and lexeme.lemma == 'a') then
			local nextLexeme = followingLexeme(lexemes, index)
			-- An article with nothing after it is left unchanged. Checking for
			-- nil explicitly avoids passing the literal string "nil" (the
			-- result of tostring(nil)) to the vowel heuristic.
			if nextLexeme ~= nil and startsWithVowel(tostring(nextLexeme)) then
				-- Clear all forms and add 'an'
				lexeme.replaceByForm('an')
				mw.log("Phonotactics module modified indefinite article")
				lexeme.log()
			end
		end
	end
end

-- Hand the table of exported functions back to whoever loads this module.
return p