Here's a function that takes a Unicode code point and returns the corresponding UTF-8 encoded character bytes. Example:
utf8.encode(0x265c) -- returns '♜'
Please feel free to edit or revert if it's not your style. It might not be relevant to Quickie, though it's a handy UTF-8 utility.
168 lines
4.4 KiB
Lua
168 lines
4.4 KiB
Lua
-- utf8.lua - Basic (and unsafe) utf8 string support in plain Lua - public domain
|
|
--
|
|
-- Written in 2013 by Matthias Richter (vrld@vrld.org)
|
|
--
|
|
-- This software is in the public domain. Where that dedication is not
|
|
-- recognized, you are granted a perpetual, irrevokable license to copy and
|
|
-- modify this file as you see fit. This software is distributed without any
|
|
-- warranty.
|
|
|
|
-- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
-- ALL FUNCTIONS ARE UNSAFE: THEY ASSUME VALID UTF8 INPUT
|
|
-- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
|
|
-- Generic for iterator.
|
|
--
|
|
-- Arguments:
|
|
-- s ... The utf8 string.
|
|
-- i ... Last byte of the previous codepoint.
|
|
--
|
|
-- Returns:
|
|
-- k ... Number of the *last* byte of the codepoint.
|
|
-- c ... The utf8 codepoint (character).
|
|
-- n ... Width/number of bytes of the codepoint.
|
|
local function iter(s, i)
|
|
if i >= #s then return end
|
|
local b, nbytes = s:byte(i+1,i+1), 1
|
|
|
|
-- determine width of the codepoint by counting the number of set bits in the first byte
|
|
-- warning: there is no validation of the following bytes!
|
|
if b >= 0xc0 and b <= 0xdf then nbytes = 2 -- 1100 0000 to 1101 1111
|
|
elseif b >= 0xe0 and b <= 0xef then nbytes = 3 -- 1110 0000 to 1110 1111
|
|
elseif b >= 0xf0 and b <= 0xf7 then nbytes = 4 -- 1111 0000 to 1111 0111
|
|
elseif b >= 0xf8 and b <= 0xfb then nbytes = 5 -- 1111 1000 to 1111 1011
|
|
elseif b >= 0xfc and b <= 0xfd then nbytes = 6 -- 1111 1100 to 1111 1101
|
|
elseif b < 0x00 or b > 0x7f then error(("Invalid codepoint: 0x%02x"):format(b))
|
|
end
|
|
return i+nbytes, s:sub(i+1,i+nbytes), nbytes
|
|
end
|
|
|
|
-- Shortcut to the generic for iterator.
|
|
--
|
|
-- Usage:
|
|
-- for k, c, n in chars(s) do
|
|
-- ...
|
|
-- end
|
|
--
|
|
-- Meaning of k, c, and n is the same as in iter(s, i).
|
|
local function chars(s)
|
|
return iter, s, 0
|
|
end
|
|
|
|
-- Get length in characters of an utf8 string.
|
|
--
|
|
-- Arguments:
|
|
-- s ... The utf8 string.
|
|
--
|
|
-- Returns:
|
|
-- n ... Number of utf8 characters in s.
|
|
local function len(s)
|
|
-- assumes sane utf8 string: count the number of bytes that is *not* 10xxxxxx
|
|
local _, c = s:gsub('[^\128-\191]', '')
|
|
return c
|
|
end
|
|
|
|
-- Get substring, same semantics as string.sub(s,i,j).
|
|
--
|
|
-- Arguments:
|
|
-- s ... The utf8 string.
|
|
-- i ... Starting position, may be negative.
|
|
-- j ... (optional) Ending position, may be negative.
|
|
--
|
|
-- Returns:
|
|
-- t ... The substring.
|
|
local function sub(s, i, j)
|
|
local l = len(s)
|
|
j = j or l
|
|
if i < 0 then i = l + i + 1 end
|
|
if j < 0 then j = l + j + 1 end
|
|
if j < i then return '' end
|
|
|
|
local k, t = 1, {}
|
|
for _, c in chars(s) do
|
|
if k >= i then t[#t+1] = c end
|
|
if k >= j then break end
|
|
k = k + 1
|
|
end
|
|
return table.concat(t)
|
|
end
|
|
|
|
-- Split utf8 string in two substrings
|
|
--
|
|
-- Arguments:
|
|
-- s ... The utf8 string.
|
|
-- i ... The position to split, may be negative.
|
|
--
|
|
-- Returns:
|
|
-- left ... Substring before i.
|
|
-- right ... Substring after i.
|
|
local function split(s, i)
|
|
local l = len(s)
|
|
if i < 0 then i = l + i + 1 end
|
|
|
|
local k, pos = 1, 0
|
|
for byte in chars(s) do
|
|
if k > i then break end
|
|
pos, k = byte, k + 1
|
|
end
|
|
return s:sub(1, pos), s:sub(pos+1, -1)
|
|
end
|
|
|
|
-- Reverses order of characters in an utf8 string.
|
|
--
|
|
-- Arguments:
|
|
-- s ... The utf8 string.
|
|
--
|
|
-- Returns:
|
|
-- t ... The revered string.
|
|
local function reverse(s)
|
|
local t = {}
|
|
for _, c in chars(s) do
|
|
table.insert(t, 1, c)
|
|
end
|
|
return table.concat(t)
|
|
end
|
|
|
|
-- Convert a Unicode code point to a UTF-8 byte sequence
|
|
-- Logic stolen from this page:
|
|
-- http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&id=iws-appendixa
|
|
--
|
|
-- Arguments:
|
|
-- Number representing the Unicode code point (e.g. 0x265c).
|
|
--
|
|
-- Returns:
|
|
-- UTF-8 encoded string of the given character.
|
|
-- Numbers out of range produce a blank string.
|
|
local function encode(code)
|
|
if code < 0 then
|
|
error('Code point must not be negative.')
|
|
elseif code <= 0x7f then
|
|
return string.char(code)
|
|
elseif code <= 0x7ff then
|
|
local c1 = code / 64 + 192
|
|
local c2 = code % 64 + 128
|
|
return string.char(c1, c2)
|
|
elseif code <= 0xffff then
|
|
local c1 = code / 4096 + 224
|
|
local c2 = code % 4096 / 64 + 128
|
|
local c3 = code % 64 + 128
|
|
return string.char(c1, c2, c3)
|
|
elseif code <= 0x10ffff then
|
|
local c1 = code / 262144 + 240
|
|
local c2 = code % 262144 / 4096 + 128
|
|
local c3 = code % 4096 / 64 + 128
|
|
local c4 = code % 64 + 128
|
|
return string.char(c1, c2, c3, c4)
|
|
end
|
|
return ''
|
|
end
|
|
|
|
return {
|
|
iter = iter,
|
|
chars = chars,
|
|
len = len,
|
|
sub = sub,
|
|
split = split,
|
|
reverse = reverse,
|
|
encode = encode
|
|
}
|