--[[---------------
Utf8 v0.4
-------------------
utf8 -> unicode ucs2 converter
How to use:
to convert:
ucs2_string = utf8.utf_to_uni(utf8_string)
to view a string in hex:
utf8.print_hex(str)
Under the MIT license.
Utf8 is a part of LuaBit Project(http://luaforge.net/projects/bit/).
copyright(c) 2007 hanzhao (abrash_han@hotmail.com)
--]]---------------
require 'hex'
require 'bit'
do
local BYTE_1_HEAD = hex.to_dec('0x00') -- 0### ####
local BYTE_2_HEAD = hex.to_dec('0xC0') -- 110# ####
local BYTE_3_HEAD = hex.to_dec('0xE0') -- 1110 ####
-- mask to get the head
local BYTE_1_MASK = hex.to_dec('0x80') -- 1### ####
local BYTE_2_MASK = hex.to_dec('0xE0') -- 111# ####
local BYTE_3_MASK = hex.to_dec('0xF0') -- 1111 ####
-- tail byte mask
local TAIL_MASK = hex.to_dec('0x3F') -- 10## ####
local mask_tbl = {
BYTE_3_MASK,
BYTE_2_MASK,
BYTE_1_MASK,
}
local head_tbl = {
BYTE_3_HEAD,
BYTE_2_HEAD,
BYTE_1_HEAD,
}
local len_tbl = {
[BYTE_1_HEAD] = 1,
[BYTE_2_HEAD] = 2,
[BYTE_3_HEAD] = 3,
}
local function utf_read_char(utf, start)
local head_byte = string.byte(utf, start)
--print('head byte ' .. hex.to_hex(head_byte))
for m = 1, table.getn(mask_tbl) do
local mask = mask_tbl[m]
-- head match
local head = bit.band(head_byte, mask)
--print('head ' .. hex.to_hex(head) .. ' ' .. hex.to_hex(mask))
if(head == head_tbl[m]) then
local len = len_tbl[head_tbl[m]]
--print('len ' .. len)
local tail_idx = start + len - 1
local char = 0
-- tail
for i = tail_idx, start + 1, -1 do
local tail_byte = string.byte(utf, i)
local byte = bit.band(tail_byte, TAIL_MASK)
--print('byte ' .. hex.to_hex(byte).. ' = ' .. hex.to_hex(tail_byte) .. '&'..hex.to_hex(TAIL_MASK))
if(tail_idx - i > 0) then
local sft = bit.blshift(byte, (tail_idx - i) * 6)
--print('shift ' .. hex.to_hex(sft) .. ' ' .. hex.to_hex(byte) .. ' ' .. ((tail_idx - i) * 6))
char = bit.bor(char, sft)
--print('char ' .. hex.to_hex(char))
else
char = byte
end
end -- tails
-- add head
local head_val = bit.band(head_byte, bit.bnot(mask))
--print('head val ' .. hex.to_hex(head_val))
head_val = bit.blshift(head_val, (len-1) * 6)
--print('head val ' .. hex.to_hex(head_val))
char = bit.bor(head_val, char)
--print('char ' .. hex.to_hex(char))
return char, len
end -- if head match
end -- for mask
error('not find proper head mask')
end
local function print_hex(str)
local cat = ''
for i=1, string.len(str) do
cat = cat .. ' ' .. hex.to_hex(string.byte(str, i))
end
print(cat)
end
local HI_MASK = hex.to_dec('0xF0')
local LO_MASK = hex.to_dec('0xFF')
local function char_to_str(char)
local hi, lo = bit.brshift(char, 8), bit.band(char, LO_MASK)
-- print(hex.to_hex(char)..' '..hex.to_hex(hi)..' ' .. hex.to_hex(lo))
if(hi == 0) then
return string.format('%c\0', lo)
elseif(lo == 0) then
return string.format('\0%c', hi)
else
return string.format('%c%c', lo, hi)
end
end
local function utf_to_uni(utf)
local n = string.len(utf)
local i = 1
local uni = ''
while(i <= n) do
--print('---')
char, len = utf_read_char(utf, i)
i = i + len
--print(string.len(char_to_str(char)))
uni = uni..char_to_str(char)
end
--print_hex(uni)
return uni
end
-- interface
utf8 = {
utf_to_uni = utf_to_uni,
print_hex = print_hex,
}
end
--[[
-- test
byte_3 = string.format('%c%c%c', hex.to_dec('0xE7'), hex.to_dec('0x83'), hex.to_dec('0xad'))
print(string.len(byte_3))
utf8.utf_to_uni(byte_3)
--]]
--[[
byte_2 = string.format('%c%c', hex.to_dec('0xC2'), hex.to_dec('0x9D'))
utf8.utf_to_uni(byte_2)
byte_1 = string.format('%c', hex.to_dec('0xB'))
utf8.utf_to_uni(byte_1)
--]]
--[[
test_mul = string.format(
'%c%c%c%c%c%c%c%c%c',
hex.to_dec('0xE8'),hex.to_dec('0xAF'), hex.to_dec('0xBA'),
hex.to_dec('0xE5'),hex.to_dec('0x9F'), hex.to_dec('0xBA'),
hex.to_dec('0xE4'),hex.to_dec('0xBA'), hex.to_dec('0x9A'))
utf8.print_hex(utf8.utf_to_uni(test_mul))
--]]