You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
206 lines
36 KiB
206 lines
36 KiB
|
2 years ago
|
-- "UTF8" by phanxaddons and pastamancer_wow (https://www.wowace.com/projects/utf8)
|
||
|
|
|
||
|
|
-- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
|
||
|
|
--
|
||
|
|
-- Provides UTF-8 aware string functions implemented in pure lua:
|
||
|
|
-- * string.utf8len(s)
|
||
|
|
-- * string.utf8sub(s, i, j)
|
||
|
|
-- * string.utf8reverse(s)
|
||
|
|
--
|
||
|
|
-- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these
|
||
|
|
-- additional functions are available:
|
||
|
|
-- * string.utf8upper(s)
|
||
|
|
-- * string.utf8lower(s)
|
||
|
|
--
|
||
|
|
-- All functions behave as their non UTF-8 aware counterparts with the exception
|
||
|
|
-- that UTF-8 characters are used instead of bytes for all units.
|
||
|
|
|
||
|
|
--[[
|
||
|
|
Copyright (c) 2006-2007, Kyle Smith
|
||
|
|
All rights reserved.
|
||
|
|
|
||
|
|
Redistribution and use in source and binary forms, with or without
|
||
|
|
modification, are permitted provided that the following conditions are met:
|
||
|
|
|
||
|
|
* Redistributions of source code must retain the above copyright notice,
|
||
|
|
this list of conditions and the following disclaimer.
|
||
|
|
* Redistributions in binary form must reproduce the above copyright
|
||
|
|
notice, this list of conditions and the following disclaimer in the
|
||
|
|
documentation and/or other materials provided with the distribution.
|
||
|
|
* Neither the name of the author nor the names of its contributors may be
|
||
|
|
used to endorse or promote products derived from this software without
|
||
|
|
specific prior written permission.
|
||
|
|
|
||
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||
|
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||
|
|
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||
|
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||
|
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||
|
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||
|
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
|
--]]
|
||
|
|
|
||
|
|
-- ABNF from RFC 3629
|
||
|
|
--
|
||
|
|
-- UTF8-octets = *( UTF8-char )
|
||
|
|
-- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
|
||
|
|
-- UTF8-1 = %x00-7F
|
||
|
|
-- UTF8-2 = %xC2-DF UTF8-tail
|
||
|
|
-- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
|
||
|
|
-- %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
|
||
|
|
-- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
|
||
|
|
-- %xF4 %x80-8F 2( UTF8-tail )
|
||
|
|
-- UTF8-tail = %x80-BF
|
||
|
|
--
|
||
|
|
|
||
|
|
local strbyte, strlen, strsub, type = string.byte, string.len, string.sub, type
|
||
|
|
|
||
|
|
local utf8_lc_uc = { ["a"] = "A", ["b"] = "B", ["c"] = "C", ["d"] = "D", ["e"] = "E", ["f"] = "F", ["g"] = "G", ["h"] = "H", ["i"] = "I", ["j"] = "J", ["k"] = "K", ["l"] = "L", ["m"] = "M", ["n"] = "N", ["o"] = "O", ["p"] = "P", ["q"] = "Q", ["r"] = "R", ["s"] = "S", ["t"] = "T", ["u"] = "U", ["v"] = "V", ["w"] = "W", ["x"] = "X", ["y"] = "Y", ["z"] = "Z", ["µ"] = "Μ", ["à"] = "À", ["á"] = "Á", ["â"] = "Â", ["ã"] = "Ã", ["ä"] = "Ä", ["å"] = "Å", ["æ"] = "Æ", ["ç"] = "Ç", ["è"] = "È", ["é"] = "É", ["ê"] = "Ê", ["ë"] = "Ë", ["ì"] = "Ì", ["í"] = "Í", ["î"] = "Î", ["ï"] = "Ï", ["ð"] = "Ð", ["ñ"] = "Ñ", ["ò"] = "Ò", ["ó"] = "Ó", ["ô"] = "Ô", ["õ"] = "Õ", ["ö"] = "Ö", ["ø"] = "Ø", ["ù"] = "Ù", ["ú"] = "Ú", ["û"] = "Û", ["ü"] = "Ü", ["ý"] = "Ý", ["þ"] = "Þ", ["ÿ"] = "Ÿ", ["ā"] = "Ā", ["ă"] = "Ă", ["ą"] = "Ą", ["ć"] = "Ć", ["ĉ"] = "Ĉ", ["ċ"] = "Ċ", ["č"] = "Č", ["ď"] = "Ď", ["đ"] = "Đ", ["ē"] = "Ē", ["ĕ"] = "Ĕ", ["ė"] = "Ė", ["ę"] = "Ę", ["ě"] = "Ě", ["ĝ"] = "Ĝ", ["ğ"] = "Ğ", ["ġ"] = "Ġ", ["ģ"] = "Ģ", ["ĥ"] = "Ĥ", ["ħ"] = "Ħ", ["ĩ"] = "Ĩ", ["ī"] = "Ī", ["ĭ"] = "Ĭ", ["į"] = "Į", ["ı"] = "I", ["ij"] = "IJ", ["ĵ"] = "Ĵ", ["ķ"] = "Ķ", ["ĺ"] = "Ĺ", ["ļ"] = "Ļ", ["ľ"] = "Ľ", ["ŀ"] = "Ŀ", ["ł"] = "Ł", ["ń"] = "Ń", ["ņ"] = "Ņ", ["ň"] = "Ň", ["ŋ"] = "Ŋ", ["ō"] = "Ō", ["ŏ"] = "Ŏ", ["ő"] = "Ő", ["œ"] = "Œ", ["ŕ"] = "Ŕ", ["ŗ"] = "Ŗ", ["ř"] = "Ř", ["ś"] = "Ś", ["ŝ"] = "Ŝ", ["ş"] = "Ş", ["š"] = "Š", ["ţ"] = "Ţ", ["ť"] = "Ť", ["ŧ"] = "Ŧ", ["ũ"] = "Ũ", ["ū"] = "Ū", ["ŭ"] = "Ŭ", ["ů"] = "Ů", ["ű"] = "Ű", ["ų"] = "Ų", ["ŵ"] = "Ŵ", ["ŷ"] = "Ŷ", ["ź"] = "Ź", ["ż"] = "Ż", ["ž"] = "Ž", ["ſ"] = "S", ["ƀ"] = "Ƀ", ["ƃ"] = "Ƃ", ["ƅ"] = "Ƅ", ["ƈ"] = "Ƈ", ["ƌ"] = "Ƌ", ["ƒ"] = "Ƒ", ["ƕ"] = "Ƕ", ["ƙ"] = "Ƙ", ["ƚ"] = "Ƚ", ["ƞ"] = "Ƞ", ["ơ"] = "Ơ", ["ƣ"] = "Ƣ", ["ƥ"] = "Ƥ", ["ƨ"] = "Ƨ", ["ƭ"] = "Ƭ", ["ư"] = "Ư", ["ƴ"] = "Ƴ", ["ƶ"] = "Ƶ", ["ƹ"] = "Ƹ", ["ƽ"] = "Ƽ", ["ƿ"] = "Ƿ", ["Dž"] = "DŽ", ["dž"] = "DŽ", ["Lj"] = "LJ", ["lj"] = "LJ", ["Nj"] = "NJ", ["nj"] = "NJ", ["ǎ"] = "Ǎ", ["ǐ"] = "Ǐ", ["ǒ"] = "Ǒ", 
["ǔ"] = "Ǔ", ["ǖ"] = "Ǖ", ["ǘ"] = "Ǘ", ["ǚ"] = "Ǚ", ["ǜ"] = "Ǜ", ["ǝ"] = "Ǝ", ["ǟ"] = "Ǟ", ["ǡ"] = "Ǡ", ["ǣ"] = "Ǣ", ["ǥ"] = "Ǥ", ["ǧ"] = "Ǧ", ["ǩ"] = "Ǩ", ["ǫ"] = "Ǫ", ["ǭ"] = "Ǭ", ["ǯ"] = "Ǯ", ["Dz"] = "DZ", ["dz"] = "DZ", ["ǵ"] = "Ǵ", ["ǹ"] = "Ǹ", ["ǻ"] = "Ǻ", ["ǽ"] = "Ǽ", ["ǿ"] = "Ǿ", ["ȁ"] = "Ȁ", ["ȃ"] = "Ȃ", ["ȅ"] = "Ȅ", ["ȇ"] = "Ȇ", ["ȉ"] = "Ȉ", ["ȋ"] = "Ȋ", ["ȍ"] = "Ȍ", ["ȏ"] = "Ȏ", ["ȑ"] = "Ȑ", ["ȓ"] = "Ȓ", ["ȕ"] = "Ȕ", ["ȗ"] = "Ȗ", ["ș"] = "Ș", ["ț"] = "Ț", ["ȝ"] = "Ȝ", ["ȟ"] = "Ȟ", ["ȣ"] = "Ȣ", ["ȥ"] = "Ȥ", ["ȧ"] = "Ȧ", ["ȩ"] = "Ȩ", ["ȫ"] = "Ȫ", ["ȭ"] = "Ȭ", ["ȯ"] = "Ȯ", ["ȱ"] = "Ȱ", ["ȳ"] = "Ȳ", ["ȼ"] = "Ȼ", ["ɂ"] = "Ɂ", ["ɇ"] = "Ɇ", ["ɉ"] = "Ɉ", ["ɋ"] = "Ɋ", ["ɍ"] = "Ɍ", ["ɏ"] = "Ɏ", ["ɓ"] = "Ɓ", ["ɔ"] = "Ɔ", ["ɖ"] = "Ɖ", ["ɗ"] = "Ɗ", ["ə"] = "Ə", ["ɛ"] = "Ɛ", ["ɠ"] = "Ɠ", ["ɣ"] = "Ɣ", ["ɨ"] = "Ɨ", ["ɩ"] = "Ɩ", ["ɫ"] = "Ɫ", ["ɯ"] = "Ɯ", ["ɲ"] = "Ɲ", ["ɵ"] = "Ɵ", ["ɽ"] = "Ɽ", ["ʀ"] = "Ʀ", ["ʃ"] = "Ʃ", ["ʈ"] = "Ʈ", ["ʉ"] = "Ʉ", ["ʊ"] = "Ʊ", ["ʋ"] = "Ʋ", ["ʌ"] = "Ʌ", ["ʒ"] = "Ʒ", ["ͅ"] = "Ι", ["ͻ"] = "Ͻ", ["ͼ"] = "Ͼ", ["ͽ"] = "Ͽ", ["ά"] = "Ά", ["έ"] = "Έ", ["ή"] = "Ή", ["ί"] = "Ί", ["α"] = "Α", ["β"] = "Β", ["γ"] = "Γ", ["δ"] = "Δ", ["ε"] = "Ε", ["ζ"] = "Ζ", ["η"] = "Η", ["θ"] = "Θ", ["ι"] = "Ι", ["κ"] = "Κ", ["λ"] = "Λ", ["μ"] = "Μ", ["ν"] = "Ν", ["ξ"] = "Ξ", ["ο"] = "Ο", ["π"] = "Π", ["ρ"] = "Ρ", ["ς"] = "Σ", ["σ"] = "Σ", ["τ"] = "Τ", ["υ"] = "Υ", ["φ"] = "Φ", ["χ"] = "Χ", ["ψ"] = "Ψ", ["ω"] = "Ω", ["ϊ"] = "Ϊ", ["ϋ"] = "Ϋ", ["ό"] = "Ό", ["ύ"] = "Ύ", ["ώ"] = "Ώ", ["ϐ"] = "Β", ["ϑ"] = "Θ", ["ϕ"] = "Φ", ["ϖ"] = "Π", ["ϙ"] = "Ϙ", ["ϛ"] = "Ϛ", ["ϝ"] = "Ϝ", ["ϟ"] = "Ϟ", ["ϡ"] = "Ϡ", [
|
||
|
|
|
||
|
|
local utf8_uc_lc = { ["A"] = "a", ["B"] = "b", ["C"] = "c", ["D"] = "d", ["E"] = "e", ["F"] = "f", ["G"] = "g", ["H"] = "h", ["I"] = "i", ["J"] = "j", ["K"] = "k", ["L"] = "l", ["M"] = "m", ["N"] = "n", ["O"] = "o", ["P"] = "p", ["Q"] = "q", ["R"] = "r", ["S"] = "s", ["T"] = "t", ["U"] = "u", ["V"] = "v", ["W"] = "w", ["X"] = "x", ["Y"] = "y", ["Z"] = "z", ["À"] = "à", ["Á"] = "á", ["Â"] = "â", ["Ã"] = "ã", ["Ä"] = "ä", ["Å"] = "å", ["Æ"] = "æ", ["Ç"] = "ç", ["È"] = "è", ["É"] = "é", ["Ê"] = "ê", ["Ë"] = "ë", ["Ì"] = "ì", ["Í"] = "í", ["Î"] = "î", ["Ï"] = "ï", ["Ð"] = "ð", ["Ñ"] = "ñ", ["Ò"] = "ò", ["Ó"] = "ó", ["Ô"] = "ô", ["Õ"] = "õ", ["Ö"] = "ö", ["Ø"] = "ø", ["Ù"] = "ù", ["Ú"] = "ú", ["Û"] = "û", ["Ü"] = "ü", ["Ý"] = "ý", ["Þ"] = "þ", ["Ā"] = "ā", ["Ă"] = "ă", ["Ą"] = "ą", ["Ć"] = "ć", ["Ĉ"] = "ĉ", ["Ċ"] = "ċ", ["Č"] = "č", ["Ď"] = "ď", ["Đ"] = "đ", ["Ē"] = "ē", ["Ĕ"] = "ĕ", ["Ė"] = "ė", ["Ę"] = "ę", ["Ě"] = "ě", ["Ĝ"] = "ĝ", ["Ğ"] = "ğ", ["Ġ"] = "ġ", ["Ģ"] = "ģ", ["Ĥ"] = "ĥ", ["Ħ"] = "ħ", ["Ĩ"] = "ĩ", ["Ī"] = "ī", ["Ĭ"] = "ĭ", ["Į"] = "į", ["İ"] = "i", ["IJ"] = "ij", ["Ĵ"] = "ĵ", ["Ķ"] = "ķ", ["Ĺ"] = "ĺ", ["Ļ"] = "ļ", ["Ľ"] = "ľ", ["Ŀ"] = "ŀ", ["Ł"] = "ł", ["Ń"] = "ń", ["Ņ"] = "ņ", ["Ň"] = "ň", ["Ŋ"] = "ŋ", ["Ō"] = "ō", ["Ŏ"] = "ŏ", ["Ő"] = "ő", ["Œ"] = "œ", ["Ŕ"] = "ŕ", ["Ŗ"] = "ŗ", ["Ř"] = "ř", ["Ś"] = "ś", ["Ŝ"] = "ŝ", ["Ş"] = "ş", ["Š"] = "š", ["Ţ"] = "ţ", ["Ť"] = "ť", ["Ŧ"] = "ŧ", ["Ũ"] = "ũ", ["Ū"] = "ū", ["Ŭ"] = "ŭ", ["Ů"] = "ů", ["Ű"] = "ű", ["Ų"] = "ų", ["Ŵ"] = "ŵ", ["Ŷ"] = "ŷ", ["Ÿ"] = "ÿ", ["Ź"] = "ź", ["Ż"] = "ż", ["Ž"] = "ž", ["Ɓ"] = "ɓ", ["Ƃ"] = "ƃ", ["Ƅ"] = "ƅ", ["Ɔ"] = "ɔ", ["Ƈ"] = "ƈ", ["Ɖ"] = "ɖ", ["Ɗ"] = "ɗ", ["Ƌ"] = "ƌ", ["Ǝ"] = "ǝ", ["Ə"] = "ə", ["Ɛ"] = "ɛ", ["Ƒ"] = "ƒ", ["Ɠ"] = "ɠ", ["Ɣ"] = "ɣ", ["Ɩ"] = "ɩ", ["Ɨ"] = "ɨ", ["Ƙ"] = "ƙ", ["Ɯ"] = "ɯ", ["Ɲ"] = "ɲ", ["Ɵ"] = "ɵ", ["Ơ"] = "ơ", ["Ƣ"] = "ƣ", ["Ƥ"] = "ƥ", ["Ʀ"] = "ʀ", ["Ƨ"] = "ƨ", ["Ʃ"] = "ʃ", ["Ƭ"] = "ƭ", ["Ʈ"] = "ʈ", ["Ư"] = "ư", ["Ʊ"] = "ʊ", ["Ʋ"] = "ʋ", ["Ƴ"] = "ƴ", ["Ƶ"] = "ƶ", 
["Ʒ"] = "ʒ", ["Ƹ"] = "ƹ", ["Ƽ"] = "ƽ", ["DŽ"] = "dž", ["Dž"] = "dž", ["LJ"] = "lj", ["Lj"] = "lj", ["NJ"] = "nj", ["Nj"] = "nj", ["Ǎ"] = "ǎ", ["Ǐ"] = "ǐ", ["Ǒ"] = "ǒ", ["Ǔ"] = "ǔ", ["Ǖ"] = "ǖ", ["Ǘ"] = "ǘ", ["Ǚ"] = "ǚ", ["Ǜ"] = "ǜ", ["Ǟ"] = "ǟ", ["Ǡ"] = "ǡ", ["Ǣ"] = "ǣ", ["Ǥ"] = "ǥ", ["Ǧ"] = "ǧ", ["Ǩ"] = "ǩ", ["Ǫ"] = "ǫ", ["Ǭ"] = "ǭ", ["Ǯ"] = "ǯ", ["DZ"] = "dz", ["Dz"] = "dz", ["Ǵ"] = "ǵ", ["Ƕ"] = "ƕ", ["Ƿ"] = "ƿ", ["Ǹ"] = "ǹ", ["Ǻ"] = "ǻ", ["Ǽ"] = "ǽ", ["Ǿ"] = "ǿ", ["Ȁ"] = "ȁ", ["Ȃ"] = "ȃ", ["Ȅ"] = "ȅ", ["Ȇ"] = "ȇ", ["Ȉ"] = "ȉ", ["Ȋ"] = "ȋ", ["Ȍ"] = "ȍ", ["Ȏ"] = "ȏ", ["Ȑ"] = "ȑ", ["Ȓ"] = "ȓ", ["Ȕ"] = "ȕ", ["Ȗ"] = "ȗ", ["Ș"] = "ș", ["Ț"] = "ț", ["Ȝ"] = "ȝ", ["Ȟ"] = "ȟ", ["Ƞ"] = "ƞ", ["Ȣ"] = "ȣ", ["Ȥ"] = "ȥ", ["Ȧ"] = "ȧ", ["Ȩ"] = "ȩ", ["Ȫ"] = "ȫ", ["Ȭ"] = "ȭ", ["Ȯ"] = "ȯ", ["Ȱ"] = "ȱ", ["Ȳ"] = "ȳ", ["Ⱥ"] = "ⱥ", ["Ȼ"] = "ȼ", ["Ƚ"] = "ƚ", ["Ⱦ"] = "ⱦ", ["Ɂ"] = "ɂ", ["Ƀ"] = "ƀ", ["Ʉ"] = "ʉ", ["Ʌ"] = "ʌ", ["Ɇ"] = "ɇ", ["Ɉ"] = "ɉ", ["Ɋ"] = "ɋ", ["Ɍ"] = "ɍ", ["Ɏ"] = "ɏ", ["Ά"] = "ά", ["Έ"] = "έ", ["Ή"] = "ή", ["Ί"] = "ί", ["Ό"] = "ό", ["Ύ"] = "ύ", ["Ώ"] = "ώ", ["Α"] = "α", ["Β"] = "β", ["Γ"] = "γ", ["Δ"] = "δ", ["Ε"] = "ε", ["Ζ"] = "ζ", ["Η"] = "η", ["Θ"] = "θ", ["Ι"] = "ι", ["Κ"] = "κ", ["Λ"] = "λ", ["Μ"] = "μ", ["Ν"] = "ν", ["Ξ"] = "ξ", ["Ο"] = "ο", ["Π"] = "π", ["Ρ"] = "ρ", ["Σ"] = "σ", ["Τ"] = "τ", ["Υ"] = "υ", ["Φ"] = "φ", ["Χ"] = "χ", ["Ψ"] = "ψ", ["Ω"] = "ω", ["Ϊ"] = "ϊ", ["Ϋ"] = "ϋ", ["Ϙ"] = "ϙ", ["Ϛ"] = "ϛ", ["Ϝ"] = "ϝ", ["Ϟ"] = "ϟ", ["Ϡ"] = "ϡ", ["Ϣ"] = "ϣ", ["Ϥ"] = "ϥ", ["Ϧ"] = "ϧ", ["Ϩ"] = "ϩ", ["Ϫ"] = "ϫ", ["Ϭ"] = "ϭ", ["Ϯ"] = "ϯ", ["ϴ"] = "θ", ["Ϸ"] = "ϸ", ["Ϲ"] = "ϲ", ["Ϻ"] = "ϻ",
|
||
|
|
|
||
|
|
--- Returns the number of bytes used by the UTF-8 character at byte i in s.
-- Also doubles as a UTF-8 character validator: the lead byte and every
-- continuation byte are checked against the RFC 3629 ABNF, so overlong
-- encodings, UTF-16 surrogates (ED A0..BF xx) and code points above U+10FFFF
-- are rejected.
-- @tparam string s the string to inspect
-- @tparam[opt=1] number i byte index of the first byte of the character
-- @treturn number the length of the character in bytes (1 to 4)
-- Raises an error if an argument has the wrong type, if i does not address a
-- byte of s, if the string ends in the middle of a character, or if the bytes
-- at i do not form a valid UTF-8 character.
local function utf8charbytes(s, i)
	-- argument defaults
	i = i or 1

	-- argument checking
	if type(s) ~= "string" then
		error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
	end
	if type(i) ~= "number" then
		error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
	end

	local c = strbyte(s, i)

	-- strbyte returns nothing when i is outside the string; report that
	-- explicitly instead of failing on a nil comparison below
	if not c then
		error("bad argument #2 to 'utf8charbytes' (index out of range)")
	end

	-- determine bytes needed for character, based on RFC 3629
	-- validate byte 1
	if c >= 0 and c <= 127 then
		-- UTF8-1 (%x00-7F, which includes the NUL byte)
		return 1

	elseif c >= 194 and c <= 223 then
		-- UTF8-2
		local c2 = strbyte(s, i + 1)

		if not c2 then
			error("UTF-8 string terminated early")
		end

		-- validate byte 2
		if c2 < 128 or c2 > 191 then
			error("Invalid UTF-8 character")
		end

		return 2

	elseif c >= 224 and c <= 239 then
		-- UTF8-3
		local c2 = strbyte(s, i + 1)
		local c3 = strbyte(s, i + 2)

		if not c2 or not c3 then
			error("UTF-8 string terminated early")
		end

		-- validate byte 2
		if c == 224 and (c2 < 160 or c2 > 191) then
			-- E0 must be followed by A0-BF (anything lower is overlong)
			error("Invalid UTF-8 character")
		elseif c == 237 and (c2 < 128 or c2 > 159) then
			-- ED must be followed by 80-9F (A0-BF would be a surrogate)
			error("Invalid UTF-8 character")
		elseif c2 < 128 or c2 > 191 then
			error("Invalid UTF-8 character")
		end

		-- validate byte 3
		if c3 < 128 or c3 > 191 then
			error("Invalid UTF-8 character")
		end

		return 3

	elseif c >= 240 and c <= 244 then
		-- UTF8-4
		local c2 = strbyte(s, i + 1)
		local c3 = strbyte(s, i + 2)
		local c4 = strbyte(s, i + 3)

		if not c2 or not c3 or not c4 then
			error("UTF-8 string terminated early")
		end

		-- validate byte 2
		if c == 240 and (c2 < 144 or c2 > 191) then
			-- F0 must be followed by 90-BF (anything lower is overlong)
			error("Invalid UTF-8 character")
		elseif c == 244 and (c2 < 128 or c2 > 143) then
			-- F4 must be followed by 80-8F (anything higher is above U+10FFFF)
			error("Invalid UTF-8 character")
		elseif c2 < 128 or c2 > 191 then
			error("Invalid UTF-8 character")
		end

		-- validate byte 3
		if c3 < 128 or c3 > 191 then
			error("Invalid UTF-8 character")
		end

		-- validate byte 4
		if c4 < 128 or c4 > 191 then
			error("Invalid UTF-8 character")
		end

		return 4

	else
		-- stray continuation byte (80-BF) or a lead byte that can never occur
		-- in valid UTF-8 (C0, C1, F5-FF)
		error("Invalid UTF-8 character")
	end
end
|
||
|
|
|
||
|
|
--- Replaces UTF-8 characters based on a mapping table.
-- Walks s one UTF-8 character at a time (character lengths come from
-- utf8charbytes) and substitutes mapping[char] for every character that has a
-- truthy entry in the table; all other characters are copied through unchanged.
-- @tparam string s the string to convert
-- @tparam table mapping lookup table keyed by single UTF-8 characters
-- @treturn string the converted string
-- Raises an error if an argument has the wrong type; errors raised by
-- utf8charbytes for malformed UTF-8 propagate to the caller.
local function utf8replace(s, mapping)
	-- argument checking
	if type(s) ~= "string" then
		error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")")
	end
	if type(mapping) ~= "table" then
		error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")")
	end

	local pos = 1
	local bytes = strlen(s)

	-- collect the pieces and join them once at the end; appending to a string
	-- inside the loop would re-copy the whole result on every iteration
	local pieces = {}
	local count = 0

	while pos <= bytes do
		local charbytes = utf8charbytes(s, pos)
		local c = strsub(s, pos, pos + charbytes - 1)

		count = count + 1
		pieces[count] = mapping[c] or c

		pos = pos + charbytes
	end

	return table.concat(pieces)
end
|
||
|
|
|
||
|
|
--- UTF-8 aware counterpart of string.upper.
-- Every character of s that has an entry in the utf8_lc_uc table is replaced
-- by that entry; characters without an entry are left untouched.
-- @tparam string s the string to convert
-- @treturn string the converted string
local function utf8upper(s)
	local uppercased = utf8replace(s, utf8_lc_uc)
	return uppercased
end
|
||
|
|
|
||
|
|
--- UTF-8 aware counterpart of string.lower.
-- Every character of s that has an entry in the utf8_uc_lc table is replaced
-- by that entry; characters without an entry are left untouched.
-- @tparam string s the string to convert
-- @treturn string the converted string
local function utf8lower(s)
	local lowercased = utf8replace(s, utf8_uc_lc)
	return lowercased
end
|
||
|
|
|
||
|
|
-- Publish the case-conversion helpers on the addon's shared table, which is
-- received as the second value of the chunk's varargs.
local ns = select(2, ...) ---@class ns @The addon namespace.
local exports = {}
exports.utf8upper = utf8upper
exports.utf8lower = utf8lower
ns.utf8 = exports
|