Module:scripts/findBestScript

Documentation for this module may be created at Module:scripts/findBestScript/doc
return function (export, text, lang, scripts, forceDetect)
	-- Ensure that "Hant", "Hans" and "Hani" are moved to the end of the list (in that order, if present), as they are a special-case.
	local oldScripts, Hant, Hans, Hani, finalCheck = scripts
	scripts = {}
	for _, script in ipairs(oldScripts) do
		if script._code == "Hant" then
			Hant = script
		elseif script._code == "Hans" then
			Hans = script
		elseif script._code == "Hani" then
			Hani = script
		else
			table.insert(scripts, script)
		end
	end
	if Hant then table.insert(scripts, Hant); finalCheck = true end
	if Hans then table.insert(scripts, Hans); finalCheck = true end
	if Hani then table.insert(scripts, Hani) end
	
	--[=[
		Remove any HTML entities; catfix function in [[Module:utilities]]
		adds tagging to a no-break space (&nbsp;), which contains Latin characters;
		hence Latin was returned as the script if "Latn" is one of the language's scripts.
	]=]
	text = string.gsub(text, "&[a-zA-Z0-9]+;", "")
	
	-- Try to match every script against the text,
	-- and return the one with the most matching characters.
	local bestcount, bestscript = 0
	
	-- Remove any spacing or punctuation characters, and get resultant length.
	-- Counting instances of UTF-8 character pattern is faster than mw.ustring.len.
	local reducedText = mw.ustring.gsub(text, "[%s%p]+", "")
	local _, length = string.gsub(reducedText, "[\1-\127\194-\244][\128-\191]*", "")
	
	-- If the length is 0 then we're probably dealing with a punctuation character, so only remove spacing characters, in case it is script-specific.
	if length == 0 then
		reducedText = mw.ustring.gsub(text, "[%s]+", "")
		_, length = string.gsub(reducedText, "[\1-\127\194-\244][\128-\191]*", "")
		
		if length == 0 then
			return export.getByCode("None")
		end
	end
	
	for i, script in ipairs(scripts) do
		local count = script:countCharacters(reducedText)
		
		-- Special case for "Hant", "Hans" and "Hani", which are returned if they match at least one character, under the assumption that (1) traditional and simplified characters will not be mixed if a language uses both scripts, and (2) any terms using Han characters with another script (e.g. Latin) will still need a Han code (not counting those which use Jpan or Kore). This is for efficiency, due to the special checks required for "Hant" and "Hans", and to prevent "Hani" from overriding either, as it will always match with at least as many characters, while characters used in both will only match with "Hani".
		if count >= length or ((script._code == "Hant" or script._code == "Hans" or script._code == "Hani") and count > 0) then
			return script
		elseif count > bestcount then
			bestcount = count
			bestscript = script
		end
	end
	
	-- Secondary check for languages that have "Hant" or "Hans" but not "Hani", but which still have multiple scripts (e.g. Macau Pidgin Portuguese): characters which are not exclusively traditional or simplified will not be found by the main check, so a separate "Hani" check is necessary to see if Han characters are present at all. If successful, return "Hant" or "Hans" as applicable.
	if finalCheck then
		for _, script in ipairs(scripts) do
			if script._code == "Hant" or script._code == "Hans" then
				if require("Module:scripts").getByCode("Hani"):countCharacters(reducedText) > 0 then return script end
			end
		end
	end
	
	if bestscript then
		return bestscript
	end
	
	-- No matching script was found, so return "None".
	return export.getByCode("None")
end