-- Module:Ancient Greek

-- Module export table; public functions are attached below and returned at the end.
local p = {}

-- Combining diacritics used by Ancient Greek orthography, as single
-- precomposed-free (NFD) combining characters.
local macron = mw.ustring.char(0x304)
local breve = mw.ustring.char(0x306)
local rough = mw.ustring.char(0x314)
local smooth = mw.ustring.char(0x313)
local diaeresis = mw.ustring.char(0x308)
local acute = mw.ustring.char(0x301)
local grave = mw.ustring.char(0x300)
local circumflex = mw.ustring.char(0x342)
local Latin_circumflex = mw.ustring.char(0x302)
local subscript = mw.ustring.char(0x345)
-- Pattern: macron, then an optional diaeresis, then a Latin circumflex.
local macron_circumflex = macron .. diaeresis .. '?' .. Latin_circumflex

-- Velar consonants: a γ immediately before one of these is a nasal
-- and is transliterated "n".
local is_velar = {
	['κ'] = true,
	['γ'] = true,
	['χ'] = true,
	['ξ'] = true,
}

-- Lua pattern matching one whole UTF-8 encoded character (1–4 bytes).
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"
-- Lua pattern matching a two-byte character in the Greek and Coptic range,
-- excluding first line of Greek and Coptic block: ͰͱͲͳʹ͵Ͷͷͺͻͼͽ;Ϳ
local basic_Greek = "[\206-\207][\128-\191]"

-- Maps each character to a table describing its class (vowel, consonant,
-- diacritic, ...). Populated by add_info below; given a default via
-- setmetatable further down so unknown characters never yield nil.
local info = {}

-- The tables are shared among different characters so that they can be checked
-- for equality if needed, and to use less space. Each must remain a distinct
-- object: code below compares them by identity (e.g. curr_info == upsilon).
local vowel = { vowel = true, diacritic_seat = true }
local iota = { vowel = true, diacritic_seat = true, offglide = true }
local upsilon = { vowel = true, diacritic_seat = true, offglide = true }
-- Technically rho is only a seat for rough or smooth breathing.
local rho = { consonant = true, diacritic_seat = true }
local consonant = { consonant = true }
local diacritic = { diacritic = true }
-- Needed for equality comparisons.
local breathing = { diacritic = true }

-- Register every character of `characters` — either a string of letters or
-- an array of strings — in `info`, pointing at the shared class table `t`.
local function add_info(characters, t)
	if type(characters) == "string" then
		for ch in string.gmatch(characters, UTF8_char) do
			info[ch] = t
		end
	else
		for _, ch in ipairs(characters) do
			info[ch] = t
		end
	end
end

-- Classify all the characters the module knows about. Order matters:
-- Ρρ are first registered as consonants, then overridden with the rho
-- table (which additionally allows breathing marks to sit on them).
add_info({ macron, breve, diaeresis, acute, grave, circumflex, subscript }, diacritic)
add_info({ rough, smooth }, breathing)
add_info("ΑΕΗΟΩαεηοω", vowel)
add_info("Ιι", iota)
add_info("Υυ", upsilon)
add_info("ΒΓΔΖΘΚΛΜΝΞΠΡΣΤΦΧΨϜϘϺϷͶϠβγδζθκλμνξπρσςτφχψϝϙϻϸͷϡ", consonant)
add_info("Ρρ", rho)

-- Characters without an entry share this empty table, so lookups such as
-- info[ch].vowel are safe for any input character.
-- Fix: the original read `__index = function return not_recognized end`,
-- which is a syntax error — the function's parameter list `()` was missing.
local not_recognized = {}
setmetatable(info, {
	__index = function()
		return not_recognized
	end,
})

-- Wrap a string in curly double quotation marks, for error messages.
local function quote(str)
	return string.format("“%s”", str)
end

-- Base Greek → Latin correspondences shared by both romanization systems.
-- Keys absent here are resolved through the system-specific table installed
-- as this table's __index in p.transliterate.
local correspondences = {
	-- Vowels
	["α"] = "a",
	["ε"] = "e",
	["η"] = "e" .. macron,
	["ι"] = "i",
	["ο"] = "o",
	["υ"] = "u",
	["ω"] = "o" .. macron,

	-- Consonants
	["β"] = "b",
	["γ"] = "g",
	["δ"] = "d",
	["ζ"] = "z",
	["θ"] = "th",
	["κ"] = "k",
	["λ"] = "l",
	["μ"] = "m",
	["ν"] = "n",
	["ξ"] = "x",
	["π"] = "p",
	["ρ"] = "r",
	["σ"] = "s",
	["ς"] = "s",
	["τ"] = "t",
	["φ"] = "ph",
	["ψ"] = "ps",

	-- Archaic letters
	["ϝ"] = "w",
	["ϻ"] = "ś",
	["ϙ"] = "q",
	["ϡ"] = "š",
	["ͷ"] = "v",

	-- Diacritics
	[smooth] = '',
	[rough] = '', -- h is added below in the `transliterate` function.
	[breve] = '',
}

-- ALA-LC romanization overrides: χ → "ch", and all accents and length
-- marks are dropped.
local ALA_LC = {
	["χ"] = "ch",
	[acute] = '',
	[grave] = '',
	[circumflex] = '',
	[subscript] = '',
	[diaeresis] = '',
	[macron] = '',
}

-- Wiktionary romanization overrides: χ → "kh", Greek circumflex becomes a
-- Latin combining circumflex, and iota subscript is written out as "i".
local Wiktionary_transliteration = {
	["χ"] = "kh",
	[circumflex] = Latin_circumflex,
	[subscript] = 'i',
}

-- Install `index_metamethod` as the __index metamethod of `t`, creating a
-- metatable for `t` if it does not already have one (other metamethods on
-- an existing metatable are left untouched).
local function add_index_metamethod(t, index_metamethod)
	local mt = getmetatable(t)
	if mt == nil then
		mt = {}
		setmetatable(t, mt)
	end
	mt.__index = index_metamethod
end

--[=[
	This breaks a word into meaningful "tokens", which are individual letters
	or diphthongs with their diacritics.
	Used by Module:grc-accent and Module:grc-pronunciation.
	Raises an error when a diacritic sits on a character that cannot bear it.
	Fixes over the original:
	* the "rho with a non-breathing accent" branch used `return
	  string.format(...)`, handing the error message back as the function
	  result, so callers would then iterate over a string; it now raises
	  with error() like the sibling branch;
	* the dead local `vowel_info` (built but never read or returned) has
	  been removed.
--]=]
local function tokenize(text)
	local tokens = {}
	local token_i = 1
	local prev, prev_info = nil, {}
	for character in string.gmatch(mw.ustring.toNFD(text), UTF8_char) do
		local curr_info = info[character]
		if curr_info.vowel then
			-- Split vowels between tokens if not a diphthong: start a new
			-- token unless this is an offglide (ι, υ) following a vowel —
			-- but even then split υυ → υ, υ and ιυ → ι, υ.
			if prev and (not (curr_info.offglide and prev_info.vowel)
					or prev_info.offglide and curr_info == upsilon) then
				token_i = token_i + 1
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		elseif curr_info.diacritic then
			tokens[token_i] = (tokens[token_i] or "") .. character
			if prev_info.vowel or prev_info.diacritic then
				if character == diaeresis then
					-- Current token is vowel, vowel, possibly other
					-- diacritics, and a diaeresis. Split the current token
					-- into two: the first letter, then the second letter
					-- plus any diacritics.
					local previous_vowel, vowel_with_diaeresis = string.match(
						tokens[token_i],
						"^(" .. basic_Greek .. ")(" .. basic_Greek .. ".+)")
					if previous_vowel then
						tokens[token_i] = previous_vowel
						tokens[token_i + 1] = vowel_with_diaeresis
						token_i = token_i + 1
					end
				end
			elseif prev_info == rho then
				-- Rho only accepts rough or smooth breathing.
				if curr_info ~= breathing then
					error(string.format(
						"The character %s cannot have the accent %s on it.",
						prev, "◌" .. character))
				end
			else
				error("The character " .. quote(prev)
					.. " cannot have a diacritic on it.")
			end
		elseif curr_info == rho then
			-- Start a new token unless this ρ follows a breathing sitting on
			-- a ρ already in the current token (i.e. continue a ρῥ cluster).
			if prev and not (prev_info == breathing
					and info[string.match(tokens[token_i], "^" .. basic_Greek)] == rho) then
				token_i = token_i + 1
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		else
			-- Consonants and unrecognized characters each begin a new token.
			if prev then
				token_i = token_i + 1
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		end
		prev = character
		prev_info = curr_info
	end
	return tokens
end

--[=[
	Transliterate Ancient Greek `text` into the Latin alphabet.
	`system` chooses the romanization: "ALA-LC", or anything else (normally
	"Wiktionary") for the Wiktionary scheme.
	Reconstruction note: in the mangled one-line original, the comment about
	semicolon replacement swallowed the two normalization gsub statements and
	`local tokens = tokenize(text)`, which left `tokens` nil; the comment is
	now properly closed and those statements restored as code.
--]=]
function p.transliterate(text, system)
	-- Route lookups missing from the base table to the chosen system's table.
	add_index_metamethod(correspondences,
		system == "ALA-LC" and ALA_LC or Wiktionary_transliteration)
	-- A lone rough breathing transliterates as "h".
	if text == '῾' then
		return 'h'
	end
	text = mw.ustring.toNFD(text)
	--[[
		Replace semicolon or Greek question mark with regular question mark,
		except after an ASCII alphanumeric character (to avoid converting
		semicolons in HTML entities).
	--]]
	text = mw.ustring.gsub(text, "([^A-Za-z0-9])[;" .. mw.ustring.char(0x37E) .. "]", "%1?")
	-- Handle the middle dot. It is equivalent to semicolon or colon, but
	-- semicolon is probably more common.
	text = text:gsub("·", ";")
	local tokens = tokenize(text)

	-- Now read the tokens.
	local output = {}
	for i, token in pairs(tokens) do
		-- Substitute each character in the token for its transliteration.
		local translit = string.gsub(mw.ustring.lower(token), UTF8_char, correspondences)
		if token == 'γ' and is_velar[tokens[i + 1]] then
			-- γ before a velar is a nasal and should be "n".
			translit = 'n'
		elseif token == 'ρ' and tokens[i - 1] == 'ρ' then
			-- Second ρ of a ρρ pair should be "rh".
			translit = 'rh'
		elseif system == "Wiktionary" and mw.ustring.find(token, '^[αΑ].*' .. subscript .. '$') then
			-- Add macron to ᾳ.
			translit = mw.ustring.gsub(translit, '([aA])', '%1' .. macron)
		end
		if token:find(rough) then
			if mw.ustring.find(token, '[Ρρ]') then
				translit = translit .. 'h'
			else -- vowel
				translit = 'h' .. translit
			end
		end
		if system == "ALA-LC" and mw.ustring.find(token, '^[υΥ][^ιΙ]*$') then
			-- υ outside a diphthong is romanized "y" in ALA-LC.
			translit = translit:gsub('u', 'y'):gsub('U', 'Y')
		end
		-- Remove macron from a vowel that has a circumflex.
		if mw.ustring.find(translit, macron_circumflex) then
			translit = translit:gsub(macron, '')
		end
		-- Capitalize first character of transliteration.
		if token ~= mw.ustring.lower(token) then
			translit = mw.ustring.gsub(translit, "^.", mw.ustring.upper)
		end
		table.insert(output, translit)
	end
	return table.concat(output)
end

-- Invocable entry point ({{#invoke:...|translit|text|system=...}}).
-- Validates |system= and returns the transliteration wrapped in
-- wikitext italics.
-- Fixes: `frame:getParent.args[1]` indexed the method object instead of
-- calling it (now `frame:getParent().args[1]`), and the return expression
-- `' ''' .. transliteration .. ''' '` was malformed Lua — reconstructed as
-- the wikitext-italic wrapper.
function p.translit(frame)
	local text = frame.args[1] or frame:getParent().args[1]
	local system = frame.args.system
	if system == nil or system == "" then
		system = "Wiktionary"
	elseif not (system == "ALA-LC" or system == "Wiktionary") then
		error('Transliteration system in |system= not recognized; choose between "ALA-LC" and "Wiktionary"')
	end
	local transliteration = p.transliterate(text, system)
	return "''" .. transliteration .. "''"
end

-- Invocable entry point returning the bare transliteration (no wikitext
-- formatting), in the default (Wiktionary) system.
-- Fix: `frame:getParent.args[1]` indexed the method object instead of
-- calling it; getParent is a method and must be invoked.
function p.bare_translit(frame)
	return p.transliterate(frame.args[1] or frame:getParent().args[1])
end

-- Export the module's public functions (transliterate, translit, bare_translit).
return p