-- Module:Sandbox/Erutuon/Unicode

local p = {}

local Unicode_data = require "Module:Unicode data/sandbox"

-- Raises an error whose message is built with string.format.
--
-- Two call shapes are accepted:
--   errorf(level, fmt, ...)  -- level is a number; the error is raised with
--                            -- stack level `level + 1` so that the level is
--                            -- counted from errorf's caller.
--   errorf(fmt, ...)         -- no level given; the error is raised with
--                            -- stack level 2 (errorf's caller).
-- Never returns.
local function errorf(level, ...)
	if type(level) == "number" then
		return error(string.format(...), level + 1)
	else
		-- level is actually the format string.
		return error(string.format(level, ...), 2)
	end
end

-- Formats the arguments with string.format and passes the resulting message
-- to mw.log.
function mw.logf(...)
	local message = string.format(...)
	return mw.log(message)
end

-- Simple string buffer. Strings are stored at indices 1..n and the running
-- count is kept in the field `n`.
local output_mt = {}
output_mt.__index = output_mt

-- Appends str to the buffer.
function output_mt:insert(str)
	self.n = self.n + 1
	self[self.n] = str
end

-- also in Module:Unicode data/documentation functions
-- Appends the result of string.format(...) to the buffer.
function output_mt:insert_format(...)
	self:insert(string.format(...))
end

-- output:join(sep) is table.concat(output, sep).
output_mt.join = table.concat

-- Creates a new, empty buffer.
local function Output()
	return setmetatable({ n = 0 }, output_mt)
end

-- Character-class pattern: a bracketed set made of UTF-8 encoded ranges
-- followed by a few individual punctuation characters. The codepoints in the
-- comments are decoded from the byte escapes next to them.
local Latn_pattern = "[" .. table.concat({
	"\n\32-\127",                      -- newline, then U+0020-U+007F
	"\194\160-\194\172",               -- U+00A0-U+00AC
	"\195\128-\195\191",               -- U+00C0-U+00FF
	"\196\128-\197\191",               -- U+0100-U+017F
	"\198\128-\201\143",               -- U+0180-U+024F
	"\225\184\128-\225\187\191",       -- U+1E00-U+1EFF
	"\226\177\160-\226\177\191",       -- U+2C60-U+2C7F
	"\234\156\160-\234\159\191",       -- U+A720-U+A7FF
	"\234\172\176-\234\173\175",       -- U+AB30-U+AB6F
	"\239\172\128-\239\172\134",       -- U+FB00-U+FB06
	"\239\188\129-\239\188\188",       -- U+FF01-U+FF3C
	"–",
	"—",
	"«", "»",
}) .. "]"

local get_codepoint = mw.ustring.codepoint

-- Returns a string containing every character from `start` to `ending`
-- inclusive, in codepoint order. Returns nil when `ending` has a lower
-- codepoint than `start`.
local function expand_range(start, ending)
	local lower = get_codepoint(start)
	local higher = get_codepoint(ending)
	if higher < lower then
		return nil
	end

	local chars = {}
	for codepoint = lower, higher do
		chars[codepoint - lower + 1] = mw.ustring.char(codepoint)
	end
	return table.concat(chars)
end

local fun = require "Module:Fun" local m_table = require "Module:TableTools"

-- Metatable for counter tables: reading a missing key stores 0 under that key
-- and returns 0. The metatable is its own metatable, so calling
-- script_to_count_mt() returns a fresh counter table.
local script_to_count_mt = {}

function script_to_count_mt.__index(self, key)
	self[key] = 0
	return 0
end

function script_to_count_mt.__call(self, ...)
	return setmetatable({}, self)
end

setmetatable(script_to_count_mt, script_to_count_mt)

-- Uses an iterator (such as mw.ustring.gcodepoint) that generates a codepoint
-- each time it is called with an optional state and another value.
--
-- Looks up the script of every generated codepoint with
-- Unicode_data.lookup_script, counts the codepoints per script, and returns
-- a string of "script (count)" items joined by ", ", ordered by descending
-- count.
local function show_scripts(iterator, state, value)
	-- Call the metatable to get a fresh counter table; using the metatable
	-- itself would share counts between calls and mix in its metamethods.
	local script_to_count = script_to_count_mt()
	for codepoint in iterator, state, value do
		local script = Unicode_data.lookup_script(codepoint)
		script_to_count[script] = script_to_count[script] + 1
	end

	return table.concat(
		fun.mapIter(
			function (count, script)
				return ("%s (%d)"):format(script, count)
			end,
			m_table.sortedPairs(
				script_to_count,
				function (script1, script2)
					return script_to_count[script1] > script_to_count[script2]
				end)),
		", ")
end

-- Consumes a codepoint iterator (iterator, state, value) and returns a table
-- that maps each script returned by Unicode_data.lookup_script to a set of
-- codepoints, stored as { [codepoint] = true }.
local function get_chars_in_scripts(iterator, state, value)
	local script_to_char_set = {}
	for codepoint in iterator, state, value do
		local script = Unicode_data.lookup_script(codepoint)
		local char_set = script_to_char_set[script]
		if not char_set then
			char_set = {}
			script_to_char_set[script] = char_set
		end
		char_set[codepoint] = true
	end
	return script_to_char_set
end

-- Renders a script -> codepoint-set map as text.
--
-- script_to_char_set: table as returned by get_chars_in_scripts.
-- format: format string taking the script and its characters
--         (defaults to "%s: %s").
-- separator: string placed between the per-script items (defaults to "\n").
--
-- The characters of each script are concatenated in sortedPairs order and
-- passed through mw.text.nowiki before being formatted.
local function print_char_set_map(script_to_char_set, format, separator)
	format = format or "%s: %s"
	separator = separator or "\n"

	local function codepoint_to_char(_, codepoint)
		return mw.ustring.char(codepoint)
	end

	local function describe_script(char_set, script)
		local char_list = fun.mapIter(
			codepoint_to_char,
			m_table.sortedPairs(char_set))
		local chars = mw.text.nowiki(table.concat(char_list))
		return format:format(script, chars)
	end

	return table.concat(
		fun.mapIter(describe_script, m_table.sortedPairs(script_to_char_set)),
		separator)
end

-- Expands Latn_pattern into the literal characters it covers and returns a
-- list item containing those characters followed by the script counts
-- produced by show_scripts. `frame` is not used.
function p.show(frame)
	-- Two UTF-8-encoded characters separated by hyphen-minus.
	local range_pattern =
		"([%z\1-\127\194-\244][\128-\191]*)%-([%z\1-\127\194-\244][\128-\191]*)"

	-- Strip the surrounding brackets, then replace each "a-b" range with
	-- every character from a to b. expand_range returns nil for a reversed
	-- range, in which case gsub keeps the original text.
	local pattern_body = Latn_pattern:gsub("%[(.-)%]", "%1")
	local expanded_pattern = pattern_body:gsub(range_pattern, expand_range)

	-- Remove initial "\n " to avoid creating unwanted pre element.
	local trimmed = expanded_pattern:gsub("^%s*", "")

	return ('* %s %s'):format(
		trimmed,
		show_scripts(mw.ustring.gcodepoint(expanded_pattern)))
end

-- Reads the block name from args[arg] and returns the value of
-- Unicode_data.get_block_info for it.
--
-- Raises an error (via errorf) when the argument is missing or when
-- get_block_info returns a false value for the name.
local function get_block_info_from_arg(args, arg)
	-- Index with `arg` rather than a hard-coded 1 so the parameter named in
	-- the error message is the one actually read.
	local block_name = args[arg]
		or errorf("Parameter %s is required", tostring(arg))
	local block_info = Unicode_data.get_block_info(block_name)
		or errorf("The block '%s' could not be found", block_name)
	return block_info
end

-- Returns args[arg] unchanged when it is nil or false; otherwise returns the
-- result of passing it to Module:Yesno.
local function get_boolean_from_arg(args, arg)
	local value = args[arg]
	if not value then
		return value
	end
	-- Parentheses keep the result to a single value.
	return (require "Module:Yesno" (value))
end

-- Entry point: lists the scripts found in one Unicode block.
-- frame.args[1]: block name (required).
-- frame.args[2]: if it evaluates to true, the result is prefixed with
--                block_info[3] and ": ".
function p.scripts_in_block(frame)
	local args = frame.args
	local block_info = get_block_info_from_arg(args, 1)
	local show_block_name = get_boolean_from_arg(args, 2)

	local script_list = show_scripts(fun.range(block_info[1], block_info[2]))
	if not show_block_name then
		return script_list
	end
	return ("%s: %s"):format(block_info[3], script_list)
end

-- Returns a wikilink for a Unicode block name.
--
-- A name containing a space is linked directly; a name without a space is
-- linked to "<name> (Unicode block)" and displayed as the bare name.
--
-- NOTE(review): the link markup had been stripped from both format strings
-- (both were "%s", and the second call passed an unused extra argument).
-- The wikilink syntax and the " (Unicode block)" disambiguator were
-- reconstructed from the function name and argument counts - confirm against
-- the intended link targets.
local function link_block_name(block_name)
	if block_name:find " " then
		return ("[[%s]]"):format(block_name)
	else
		return ("[[%s (Unicode block)|%s]]"):format(block_name, block_name)
	end
end

-- Entry point: builds a wikitable with one row per Unicode block, giving the
-- block's codepoint range and, for each script in the block, a sample of its
-- characters and the number of codepoints.
--
-- frame.args[1]: hexadecimal codepoint; blocks starting below it are skipped
--                (defaults to 0).
-- frame.args[2]: hexadecimal codepoint; processing stops at the first block
--                starting above it (defaults to 0x4000).
--
-- NOTE(review): the wikitable markup and both format strings had been
-- stripped from this function. The table header and row format below were
-- rebuilt from the remaining fragments ("class="wikitable"", the caption,
-- "! block !! codepoints !! scripts", "U+%04X–U+%04X"); the per-script format
-- was rebuilt to match its four arguments. Confirm the exact markup.
function p.scripts_in_blocks(frame)
	local output = Output()

	local start = frame.args[1] and tonumber(frame.args[1], 16) or 0
	local ending = frame.args[2] and tonumber(frame.args[2], 16) or 0x4000

	local script_data = mw.loadData "Module:Unicode data/scripts"
	local singles = script_data.singles
	local ranges = script_data.ranges

	-- Removes every key stored directly in the table; the methods live in the
	-- metatable's __index table and are unaffected.
	local function clear (self)
		for _, key in ipairs(m_table.keysToList(self, false)) do
			self[key] = nil
		end
	end

	-- Script code -> number of codepoints in the current block.
	local counts = {}
	setmetatable(counts, {
		__index = {
			increment = function(self, script_code, amount)
				self[script_code] = (self[script_code] or 0) + (amount or 1)
			end,
			clear = clear,
		}
	})

	-- Script code -> array of sample codepoints in the current block, with
	-- the array length kept in the field `n`.
	local codepoints_per_script = {}
	setmetatable(codepoints_per_script, {
		__index = {
			add = function(self, script_code, codepoint)
				self[script_code] = self[script_code] or { n = 0 }
				local sample = self[script_code]
				-- Stop collecting after 0x20 entries, and never collect
				-- codepoints in U+0000-U+001F or U+0080-U+009F.
				if sample.n <= 0x20
						and not (codepoint <= 0x9F and (codepoint >= 0x80
						or codepoint <= 0x1F)) then
					if sample.n == 0x20 then
						-- Sample is full: finish it with three periods.
						local period = ("."):byte()
						for _ = 1, 3 do
							sample.n = sample.n + 1
							sample[sample.n] = period
						end
					else
						if script_code == "Zinh" then -- probably combining character
							-- Precede it with U+25CC as a base character.
							sample.n = sample.n + 1
							sample[sample.n] = 0x25CC
						end
						sample.n = sample.n + 1
						sample[sample.n] = codepoint
					end
				end
			end,
			clear = clear,
		}
	})

	output:insert [[
{| class="wikitable"
|+ Scripts in each Unicode block
! block !! codepoints !! scripts
]]

	local row_format = [[
|-
| %s
| U+%04X–U+%04X
| %s
]]

	-- Arguments: script alias, script code, sample characters, count.
	local script_format = '<abbr title="%s">%s</abbr> (%s %d)'

	-- ipairs rather than pairs: the early `break` below only makes sense if
	-- blocks are visited in ascending order, which pairs does not guarantee.
	for _, block in ipairs(mw.loadData "Module:Unicode data/blocks") do
		local codepoint = block[1]
		if codepoint > ending then
			break
		end
		if codepoint >= start then
			while codepoint <= block[2] do
				local script = singles[codepoint]
				local count
				if script then -- Codepoint is in "singles" map.
					counts:increment(script)
					codepoints_per_script:add(script, codepoint)
					codepoint = codepoint + 1
					count = 1 -- for potential future use
				else
					local range, index = Unicode_data.binary_range_search(codepoint, ranges)
					if range then -- Codepoint is in "ranges" array.
						count = 0
						script = range[3]
						while codepoint <= range[2] and codepoint <= block[2] do
							count = count + 1
							codepoints_per_script:add(script, codepoint)
							codepoint = codepoint + 1
						end
						counts:increment(script, count)
					else -- Codepoint doesn't have data; it's Zzzz.
						-- Get range immediately above codepoint. The nil
						-- checks cover a codepoint above the last range.
						while ranges[index] and ranges[index][2] < codepoint do
							index = index + 1
						end
						count = 0
						script = "Zzzz"
						local next_range = ranges[index]
						while (not next_range or codepoint < next_range[1])
								and codepoint <= block[2]
								and not singles[codepoint] do
							count = count + 1
							codepoint = codepoint + 1
						end
						counts:increment(script, count)
					end
				end
			end

			output:insert_format(
				row_format,
				link_block_name(block[3]), block[1], block[2],
				table.concat(
					-- mapIter, as in show_scripts, because sortedPairs
					-- returns an iterator.
					fun.mapIter(
						function (count, script)
							local sample = codepoints_per_script[script]
							return script_format:format(
								-- Fall back to the code if there is no alias.
								script_data.aliases[script] or script,
								script,
								sample
									and mw.text.nowiki(mw.ustring.char(unpack(sample)))
									or "",
								count)
						end,
						m_table.sortedPairs(
							counts,
							function (script1, script2)
								return counts[script1] > counts[script2]
							end)),
					", "))
		end
		-- mw.logObject(codepoints_per_script, block[3])

		counts:clear()
		codepoints_per_script:clear()
	end

	output:insert "|}"

	return output:join()
end

-- Entry point: lists the characters of one Unicode block grouped by script.
-- frame.args[1]: block name (required).
-- frame.args[2]: if it evaluates to true, the result is prefixed with
--                block_info[3] and ": ".
function p.chars_in_scripts_in_block(frame)
	local args = frame.args
	local block_info = get_block_info_from_arg(args, 1)
	local show_block_name = get_boolean_from_arg(args, 2)

	local chars_by_script = get_chars_in_scripts(
		fun.range(block_info[1], block_info[2]))
	local script_char_set_map = print_char_set_map(chars_by_script)

	if not show_block_name then
		return script_char_set_map
	end
	return ("%s: %s"):format(block_info[3], script_char_set_map)
end

-- Entry point: scans the wikitext of a page for {{lang|code|...}} and
-- {{lang-code|...}} templates and returns the distinct language codes joined
-- by ", ".
--
-- frame.args[1]: page name (defaults to "English language").
--
-- Logs a message and returns nothing when the title object cannot be created
-- or the page has no content.
function p.search_for_language_codes(frame)
	local page_name = frame.args[1] or "English language"

	local success, title_object = pcall(mw.title.new, page_name)
	if not (success and title_object) then
		mw.logf("Could not make title object for '%s'.", page_name)
		return
	end

	-- getContent returns nil when the page does not exist.
	local content = title_object:getContent()
	if not content then
		mw.logf("Could not get content of '%s'.", page_name)
		return
	end

	local language_codes = {}
	for lang_template in content:gmatch "{{lang[^}]+" do
		local template_name = lang_template:match("{{([^|}]+)")
		local language_code
		if template_name == "lang" then
			-- {{lang|code|...}}: the code is the first parameter.
			language_code = lang_template:match "{{lang|([^|}]+)"
		elseif template_name:find "^lang-" then
			-- {{lang-code|...}}: the code is part of the template name.
			language_code = lang_template:match "{{lang-([^|}]+)"
		end
		if language_code then
			language_codes[language_code] = true
		end
	end

	return table.concat(m_table.keysToList(language_codes), ", ")
end

return p