Jump to content

Module:Sandbox/AbstractWikipedia/GrammaticalFeatures

From Meta, a Wikimedia project coordination wiki
Module documentation

This module is part of the Abstract Wikipedia Scribunto-based NLG prototype. It specifies a mapping from Wikidata grammatical features to an internal representation of features (each associated with a category), and also provides an ordering of the features, which the sortForms function of the lexemes module uses when sorting lexeme forms in lexicographic order.


local p = {}

-- features_map: Wikidata grammatical-feature Q-id -> internal feature set.
-- Every value is a table of `category = "feature"` pairs. A single Wikidata
-- item may expand to several pairs (e.g. gender plus number) or, in rare
-- cases, to none at all.
-- The category is part of the mapping because the same feature value (for
-- instance "singular") is used under more than one category (number vs.
-- possessive_number).
-- Names are in English for convenience (the Relations module uses the same
-- names); this can be revisited.
p.features_map = {
	-- Gender
	["Q499327"] = { gender = "masculine" },
	["Q1775415"] = { gender = "feminine" },
	["Q1775461"] = { gender = "neuter" },
	["Q1305037"] = { gender = "common" },

	-- Number
	["Q110786"] = { number = "singular" },
	["Q146786"] = { number = "plural" },

	-- Combined gender + number
	["Q47088290"] = { gender = "masculine", number = "singular" },
	["Q47088292"] = { gender = "masculine", number = "plural" },
	["Q47088293"] = { gender = "feminine", number = "singular" },
	["Q47088295"] = { gender = "feminine", number = "plural" },

	-- Case
	["Q131105"] = { case = "nominative" },
	["Q146078"] = { case = "accusative" },
	["Q146233"] = { case = "genitive" },
	["Q145599"] = { case = "dative" },

	-- Person (optionally combined with number)
	["Q51929074"] = { person = "third" },
	["Q51929049"] = { person = "second" },
	["Q21714344"] = { person = "first" },
	["Q51929447"] = { person = "third", number = "singular" }, -- 3rd person sing.
	["Q51929218"] = { person = "first", number = "singular" }, -- 1st person sing.

	-- Possessive features
	["Q69761633"] = { possessive_gender = "masculine" },
	["Q69761768"] = { possessive_gender = "feminine" },
	["Q71469738"] = { }, -- poss. masc. or fem. mapped to no feature
	["Q71470909"] = { possessive_person = "third" },
	["Q71470837"] = { possessive_person = "second" },
	["Q71470598"] = { possessive_person = "first" },
	["Q78191294"] = { possessive_number = "singular" },
	["Q78191289"] = { possessive_number = "plural" },

	-- Mood
	["Q682111"] = { mood = "indicative" },
	["Q179230"] = { mood = "infinitive" },
	["Q473746"] = { mood = "subjunctive" },
	["Q22716"] = { mood = "imperative" },

	-- Mood + tense
	["Q56682909"] = { mood = "indicative", tense = "present" }, -- present indicative
	["Q3910936"] = { mood = "indicative", tense = "present" }, -- simple present

	-- Tense (optionally combined with aspect)
	["Q442485"] = { tense = "past", aspect = "perfective" }, -- preterite
	["Q1392475"] = { tense = "past", aspect = "perfective" }, -- simple past
	["Q1994301"] = { tense = "past" }, -- past tense
	["Q12547192"] = { tense = "past", aspect = "imperfective" }, -- past imperfect

	-- Participles
	["Q1230649"] = { mood = "participle", tense = "past" }, -- English past participle
	["Q12717679"] = { mood = "participle", tense = "past" }, -- past participle
	["Q10345583"] = { mood = "participle", tense = "present" }, -- present participle

	-- Miscellaneous
	["Q126473"] = { contraction = "contraction" }, -- Indicates contracted forms
	["Q53997851"] = { definiteness = "definite" },

	-- Expand as needed
}

-- categories_map: Wikidata Q-id of a lexical category -> English name of the
-- part of speech. English names are a convenience (the Relations module uses
-- them as well); this can be revisited.
p.categories_map = {
	["Q1084"] = "noun",
	["Q24905"] = "verb",
	["Q34698"] = "adjective",
	["Q103184"] = "article",
	["Q4833830"] = "preposition",
}

-- Canonical order of the forms, to be used by the lexeme module's sortForms
-- function. Forms are sorted lexicographically: the categories are visited in
-- the order given below, and within each category the features are ranked by
-- the numbers given below (unmentioned features are considered greatest).
-- Each feature listed for a category carries its own distinct rank, so two
-- different listed features never compare as equal; a shared rank would leave
-- their relative order up to the (unstable) sort.
-- NOTE: the field keeps its historical spelling ("cannonical") because code
-- outside this module may look it up under that name.
p.cannonical_order = {
	{ category = "person", third = 1, second = 2, first = 3 },
	{ category = "mood", indicative = 1, infinitive = 2, imperative = 3, subjunctive = 4 },
	{ category = "tense", present = 1, past = 2 },
	{ category = "number", singular = 1, plural = 2 },
	{ category = "gender", masculine = 1, feminine = 2, neuter = 3, common = 4 },
	{ category = "case", nominative = 1, accusative = 2, genitive = 3, dative = 4 },
}

-- Return the module table built above to the code that loads this module.
return p