suit/utf8.lua

-- utf8.lua - Basic (and unsafe) utf8 string support in plain Lua - public domain
--
-- Written in 2013 by Matthias Richter (vrld@vrld.org)
--
-- This software is in the public domain. Where that dedication is not
-- recognized, you are granted a perpetual, irrevokable license to copy and
-- modify this file as you see fit. This software is distributed without any
-- warranty.

-- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
--  ALL FUNCTIONS ARE UNSAFE: THEY ASSUME VALID UTF8 INPUT
-- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

-- Generic for iterator: steps through a utf8 string one codepoint at a time.
--
-- Arguments:
--     s ... The utf8 string.
--     i ... Last byte of the previous codepoint (0 to start at the beginning).
--
-- Returns:
--     k ... Number of the *last* byte of the codepoint.
--     c ... The utf8 codepoint (character).
--     n ... Width/number of bytes of the codepoint.
--     Nothing is returned once i has reached the end of s.
--
-- Raises an error if the byte at position i+1 cannot start a codepoint
-- (a continuation byte 0x80-0xbf, or 0xfe/0xff).
local function iter(s, i)
	if i >= #s then return end

	local first = i + 1
	local lead = s:byte(first)
	local width = 1 -- plain ASCII unless the lead byte says otherwise

	-- The width of the codepoint is encoded in the high bits of its first byte.
	-- warning: the following (continuation) bytes are never validated!
	if lead >= 0xc0 and lead <= 0xfd then
		if     lead <= 0xdf then width = 2 -- 110x xxxx
		elseif lead <= 0xef then width = 3 -- 1110 xxxx
		elseif lead <= 0xf7 then width = 4 -- 1111 0xxx
		elseif lead <= 0xfb then width = 5 -- 1111 10xx
		else                     width = 6 -- 1111 110x
		end
	elseif lead > 0x7f then
		-- continuation byte (10xx xxxx) or 0xfe/0xff in lead position
		error(("Invalid codepoint: 0x%02x"):format(lead))
	end

	local last = i + width
	return last, s:sub(first, last), width
end

-- Shortcut to the generic for iterator: returns the iterator function, the
-- string as invariant state, and the initial control value.
--
-- Usage:
--    for k, c, n in chars(s) do
--        ...
--    end
--
--    Meaning of k, c, and n is the same as in iter(s, i).
local function chars(s)
	-- iter's control value is "last byte consumed", so 0 starts at byte 1
	local start = 0
	return iter, s, start
end

-- Get length in characters of an utf8 string.
--
-- Every codepoint has exactly one byte that is not a continuation byte
-- (10xxxxxx, i.e. 128-191), so counting those bytes counts the codepoints.
-- This assumes a sane utf8 string; nothing is validated.
--
-- Arguments:
--     s ... The utf8 string.
--
-- Returns:
--     n ... Number of utf8 characters in s.
local function len(s)
	local count = 0
	for _ in s:gmatch('[^\128-\191]') do
		count = count + 1
	end
	return count
end

-- Get substring, same semantics as string.sub(s,i,j), but with positions
-- counted in characters instead of bytes.
--
-- Arguments:
--     s ... The utf8 string.
--     i ... Starting position, may be negative (counts from the end).
--     j ... (optional) Ending position, may be negative (counts from the end).
--           Defaults to the last character.
--
-- Returns:
--     t ... The substring, or '' if the range selects no characters.
local function sub(s, i, j)
	local l = len(s)
	j = j or l
	-- translate negative positions into positions from the start
	if i < 0 then i = l + i + 1 end
	if j < 0 then j = l + j + 1 end
	-- Clamp the way string.sub does. Without this, a range that ends before
	-- the first character (e.g. sub(s, 0, 0) or sub('abc', -5, -4)) would
	-- wrongly yield the first character instead of ''.
	if i < 1 then i = 1 end
	if j > l then j = l end
	if j < i then return '' end

	-- k is the 1-based index of the character currently visited
	local k, t = 1, {}
	for _, c in chars(s) do
		if k >= i then t[#t+1] = c end
		if k >= j then break end
		k = k + 1
	end
	return table.concat(t)
end

-- Split utf8 string in two substrings after the i-th character.
--
-- Arguments:
--     s ... The utf8 string.
--     i ... Number of characters that go into the left part. May be negative,
--           in which case it is translated to len(s) + i + 1 (so -1 puts the
--           whole string into the left part).
--
-- Returns:
--     left  ... Substring holding the characters up to and including i.
--     right ... Substring holding the characters after i.
local function split(s, i)
	local total = len(s)
	if i < 0 then i = total + i + 1 end

	-- cut is the byte offset at which the left part ends (0 = empty left part)
	local cut, index = 0, 1
	for last_byte in chars(s) do
		if index > i then break end
		cut = last_byte
		index = index + 1
	end

	local head = s:sub(1, cut)
	local tail = s:sub(cut + 1)
	return head, tail
end

-- Reverses order of characters in an utf8 string.
--
-- Arguments:
--     s ... The utf8 string.
--
-- Returns:
--     t ... The reversed string.
local function reverse(s)
	-- Collect the characters in order first. Appending is O(1) per character,
	-- whereas inserting at the front shifts the whole array every time.
	local t = {}
	for _, c in chars(s) do
		t[#t+1] = c
	end

	-- reverse the array in place by swapping from both ends
	local lo, hi = 1, #t
	while lo < hi do
		t[lo], t[hi] = t[hi], t[lo]
		lo, hi = lo + 1, hi - 1
	end
	return table.concat(t)
end

-- Convert a Unicode code point to a UTF-8 byte sequence
-- Logic stolen from this page:
-- http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&id=iws-appendixa
--
-- Arguments:
--     Number representing the Unicode code point (e.g. 0x265c).
--
-- Returns:
--     UTF-8 encoded string of the given character.
--     Code points above 0x10ffff produce a blank string.
--
-- Raises an error if the code point is negative.
local function encode(code)
	-- Every quotient is floored explicitly: `/` yields a fractional number,
	-- which string.char rejects on Lua 5.3+ and may round (instead of
	-- truncate) on some Lua 5.1 builds, producing wrong bytes.
	local floor, char = math.floor, string.char

	if code < 0 then
		error('Code point must not be negative.')
	elseif code <= 0x7f then
		-- 0xxxxxxx
		return char(code)
	elseif code <= 0x7ff then
		-- 110xxxxx 10xxxxxx
		local c1 = floor(code / 64) + 192
		local c2 = code % 64 + 128
		return char(c1, c2)
	elseif code <= 0xffff then
		-- 1110xxxx 10xxxxxx 10xxxxxx
		local c1 = floor(code / 4096) + 224
		local c2 = floor(code % 4096 / 64) + 128
		local c3 = code % 64 + 128
		return char(c1, c2, c3)
	elseif code <= 0x10ffff then
		-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		local c1 = floor(code / 262144) + 240
		local c2 = floor(code % 262144 / 4096) + 128
		local c3 = floor(code % 4096 / 64) + 128
		local c4 = code % 64 + 128
		return char(c1, c2, c3, c4)
	end
	-- beyond the Unicode range
	return ''
end

-- Public interface of the module: every helper defined above, exported
-- under its own name.
local exports = {
	-- iteration
	iter    = iter,
	chars   = chars,
	-- measuring and slicing
	len     = len,
	sub     = sub,
	split   = split,
	-- transformation
	reverse = reverse,
	encode  = encode,
}

return exports