lotrointerface.com
Search Downloads

LoTROInterface SVN SequenceBars

[/] [trunk/] [Thurallor/] [Common/] [Utils/] [LibCompress/] [luabit-0.4/] [utf8.lua] - Rev 2

Compare with Previous | Blame | View Log

--[[---------------
Utf8 v0.4
-------------------
UTF-8 -> Unicode UCS-2 converter (handles 1- to 3-byte UTF-8 sequences;
each character is emitted as two bytes, low byte first)

How to use:
to convert:
 ucs2_string = utf8.utf_to_uni(utf8_string)
 
to view a string in hex:
 utf8.print_hex(str)

Under the MIT license.

Utf8 is a part of LuaBit Project(http://luaforge.net/projects/bit/).

copyright(c) 2007 hanzhao (abrash_han@hotmail.com)
--]]---------------

require 'hex'
require 'bit'

do
 -- Lead-byte signatures: the fixed high bits that identify a sequence length.
 local BYTE_1_HEAD = hex.to_dec('0x00') -- 0### #### (single byte)
 local BYTE_2_HEAD = hex.to_dec('0xC0') -- 110# #### (lead of a 2-byte sequence)
 local BYTE_3_HEAD = hex.to_dec('0xE0') -- 1110 #### (lead of a 3-byte sequence)

 -- Masks that isolate the signature bits of a lead byte.
 local BYTE_1_MASK = hex.to_dec('0x80') -- 1### ####
 local BYTE_2_MASK = hex.to_dec('0xE0') -- 111# ####
 local BYTE_3_MASK = hex.to_dec('0xF0') -- 1111 ####

 -- Mask keeping the six payload bits of a tail byte (tail bytes look like 10## ####).
 local TAIL_MASK = hex.to_dec('0x3F') -- 0011 1111

 -- Parallel lists: mask_tbl[k] is applied to the lead byte and the result is
 -- compared with head_tbl[k]. Entries run from the 3-byte form to the 1-byte form.
 local mask_tbl = { BYTE_3_MASK, BYTE_2_MASK, BYTE_1_MASK }
 local head_tbl = { BYTE_3_HEAD, BYTE_2_HEAD, BYTE_1_HEAD }

 -- Sequence length in bytes, keyed by lead-byte signature.
 local len_tbl = {}
 len_tbl[BYTE_1_HEAD] = 1
 len_tbl[BYTE_2_HEAD] = 2
 len_tbl[BYTE_3_HEAD] = 3

 -- Decode one UTF-8 sequence (1 to 3 bytes) from `utf` beginning at byte
 -- index `start`.
 --
 -- @param utf   string holding UTF-8 encoded text
 -- @param start 1-based byte index of the sequence's lead byte
 -- @return the decoded code point, and the number of bytes consumed
 --
 -- Raises an error when there is no byte at `start`, when the sequence runs
 -- past the end of the string, or when the lead byte matches none of the
 -- supported 1/2/3-byte signatures (e.g. a stray tail byte or a 4-byte lead).
 local function utf_read_char(utf, start)
  local head_byte = string.byte(utf, start)
  if(head_byte == nil) then
   error('no byte to read at position ' .. tostring(start))
  end

  for m = 1, #mask_tbl do
   local mask = mask_tbl[m]
   -- the lead byte's signature bits select the sequence length
   if(bit.band(head_byte, mask) == head_tbl[m]) then
    local len = len_tbl[head_tbl[m]]
    local tail_idx = start + len - 1
    if(tail_idx > string.len(utf)) then
     -- fail with a clear message instead of handing nil to the bit library
     error('truncated utf8 sequence at position ' .. tostring(start))
    end

    -- payload bits of the lead byte are the most significant bits
    local char = bit.band(head_byte, bit.bnot(mask))
    -- each tail byte contributes six further bits, most significant first
    for i = start + 1, tail_idx do
     local payload = bit.band(string.byte(utf, i), TAIL_MASK)
     char = bit.bor(bit.blshift(char, 6), payload)
    end

    return char, len
   end -- if head match
  end -- for mask
  error('not find proper head mask')
 end
 
 -- Print the bytes of `str` on one line; each byte is rendered with
 -- hex.to_hex and preceded by a single space.
 --
 -- @param str string whose bytes are to be displayed
 local function print_hex(str)
  -- collect the pieces and join once, rather than re-copying a growing
  -- string on every iteration
  local parts = {}
  for i = 1, string.len(str) do
   parts[i] = ' ' .. hex.to_hex(string.byte(str, i))
  end
  print(table.concat(parts))
 end

 -- mask selecting one byte (the low eight bits) of a value
 local LO_MASK = hex.to_dec('0xFF')
 
 -- Serialize a 16-bit code point as two bytes, low byte first.
 --
 -- @param char code point as a number (expected range 0 .. 0xFFFF)
 -- @return a 2-byte string: low byte followed by high byte
 --
 -- string.char is used instead of string.format('%c', ...) because on
 -- Lua 5.1 '%c' with the value 0 produces an empty string, which made the
 -- code point 0 come out as one byte instead of two.
 local function char_to_str(char)
  local lo = bit.band(char, LO_MASK)
  -- masked so the value always fits string.char's 0..255 range
  local hi = bit.band(bit.brshift(char, 8), LO_MASK)
  return string.char(lo, hi)
 end
 
 -- Convert a UTF-8 encoded string to UCS-2.
 --
 -- @param utf string holding UTF-8 text (1- to 3-byte sequences)
 -- @return string containing the output of char_to_str for every decoded
 --         character, concatenated in order; '' for empty input
 --
 -- Errors raised by utf_read_char for input it cannot decode propagate to
 -- the caller.
 local function utf_to_uni(utf)
  local n = string.len(utf)
  local i = 1
  -- gather the encoded units and join once at the end
  local units = {}
  local count = 0
  while(i <= n) do
   -- declared local so the decoder state does not leak into globals
   local char, len = utf_read_char(utf, i)
   i = i + len
   count = count + 1
   units[count] = char_to_str(char)
  end
  return table.concat(units)
 end

 -- interface
 -- The table is assigned to the non-local name `utf8` on purpose: this is the
 -- only thing the file publishes. Everything else declared in the enclosing
 -- do-block is local and therefore not reachable from outside it.
 utf8 = {
  utf_to_uni = utf_to_uni, -- UTF-8 string -> UCS-2 byte string
  print_hex = print_hex,   -- print the bytes of a string via hex.to_hex
 }

end

--[[
-- test
byte_3 = string.format('%c%c%c', hex.to_dec('0xE7'), hex.to_dec('0x83'), hex.to_dec('0xad'))
print(string.len(byte_3))
utf8.utf_to_uni(byte_3)
--]]
--[[
byte_2 = string.format('%c%c', hex.to_dec('0xC2'), hex.to_dec('0x9D'))
utf8.utf_to_uni(byte_2)

byte_1 = string.format('%c', hex.to_dec('0xB'))
utf8.utf_to_uni(byte_1)
--]]
--[[
test_mul = string.format(
'%c%c%c%c%c%c%c%c%c',
hex.to_dec('0xE8'),hex.to_dec('0xAF'), hex.to_dec('0xBA'),
hex.to_dec('0xE5'),hex.to_dec('0x9F'), hex.to_dec('0xBA'),
hex.to_dec('0xE4'),hex.to_dec('0xBA'), hex.to_dec('0x9A'))

utf8.print_hex(utf8.utf_to_uni(test_mul))
--]]

Compare with Previous | Blame


All times are GMT -5. The time now is 11:27 PM.


Our Network
EQInterface | EQ2Interface | Minion | WoWInterface | ESOUI | LoTROInterface | MMOUI | Swtorui