Jump to content

Module:Sandbox/AbstractWikipedia/Phonotactics/he

From Meta, a Wikimedia project coordination wiki
Module documentation

This is the Hebrew-phonotactics module of the Abstract Wikipedia Scribunto-based NLG prototype. For other phonotactics modules see the main page.

The Hebrew implementation takes care of certain orthographic and phonotactic alternations happening after Hebrew proclitics. It scans the list of lexemes and if a proclitic, identified by its lemma, is found, the following lexemes may be altered in the following ways:

  • Spaces following proclitics are removed.
  • The definite article (identified by its part-of-speech) is removed following certain proclitics.
  • If the proclitic is followed by a number written in digits, a hyphen is inserted between them, in accordance with Hebrew orthographic conventions.
  • If a proclitic is followed by a word starting with the letter Vav, that letter is doubled, in accordance with the Hebrew spelling rules for unvocalized text.


-- Module table: functions attached to it are what this module returns.
local p = {}

-- Set of the one-letter Hebrew proclitics recognised by this module.
local PROCLITICS = {
	['מ'] = true,
	['ש'] = true,
	['ה'] = true,
	['כ'] = true,
	['ל'] = true,
	['ב'] = true,
	['ו'] = true,
}

--- Tell whether a lexeme is one of the Hebrew one-letter proclitics.
-- The lexeme is compared by its string form (as produced by tostring)
-- against the letters מ, ש, ה, כ, ל, ב and ו.
-- @param lexeme any value; nil (e.g. "no previous lexeme yet") yields false
-- @return true if the string form is exactly one proclitic letter, else false
local function isProclitic ( lexeme )
	if lexeme == nil then
		return false
	end
	-- `text` must be local: a bare assignment would create or clobber a global.
	local text = tostring(lexeme)
	return PROCLITICS[text] == true
end

--- Apply Hebrew orthographic adjustments to lexemes that follow a proclitic.
-- Walks `lexemes` in order while remembering the most recent lexeme that is
-- neither spacing nor empty. When that remembered lexeme is a proclitic (as
-- decided by isProclitic), the current lexeme is rewritten in place through
-- its replaceByForm function:
--   * spacing is replaced by the empty string;
--   * an article is replaced by the empty string after ב, ל or כ;
--   * a form starting with a digit gets a leading hyphen;
--   * a form starting with a single ו gets that letter doubled, unless the
--     preceding proclitic's lemma is itself ו.
-- @param lexemes sequence of lexeme objects; each is read through tostring,
--        its `pos` field and its `replaceByForm` function (the `lemma` field
--        of the preceding lexeme is read as well)
-- @return nothing; the lexemes are modified in place
function p.applyPhonotactics ( lexemes )
	mw.log("Hebrew phonotactics")
	local last_lexeme = nil
	for _, lexeme in ipairs(lexemes) do
		if isProclitic(last_lexeme) then
			local text = tostring(lexeme)
			-- tostring guards the log line against a lexeme without a pos
			mw.log("After proclitic: '"..text.."' ("..tostring(lexeme.pos)..")")
			if lexeme.pos == 'spacing' then
				-- Remove spacing after proclitics
				lexeme.replaceByForm('')
			elseif lexeme.pos == 'article' then
				-- Omit article after certain prepositions. The preposition is
				-- the preceding proclitic; isProclitic matched it by its
				-- string form, so the same form is compared here.
				local prep = tostring(last_lexeme)
				if (prep == 'ב' or prep == 'ל' or prep == 'כ') then
					lexeme.replaceByForm('')
				end
			elseif text:match("^%d") then
				-- Add hyphen between proclitic and numbers
				lexeme.replaceByForm('-'..text)
			elseif last_lexeme.lemma ~= 'ו' and text:match("^ו") and not text:match("^וו") then
				-- Double a word-initial Vav that is not already doubled
				lexeme.replaceByForm('ו'..text)
			end
		end
		-- Spacing and emptied lexemes do not count as "the previous word",
		-- so the proclitic stays remembered across them.
		if lexeme.pos ~= 'spacing' and tostring(lexeme) ~= '' then
			last_lexeme = lexeme
		end
	end
end

-- Hand the module table back to whatever loads this module.
return p