XenevaOS
Loading...
Searching...
No Matches
utf.h
Go to the documentation of this file.
1
32#ifndef __UTF_H__
33#define __UTF_H__
34
35#include <stdint.h>
36
/* Largest code point that fits in a sequence of the given length */
#define UTF8_1BYTE_CODEMAX		0x7F
#define UTF8_2BYTE_CODEMAX		0x7FF
#define UTF8_3BYTE_CODEMAX		0xFFFF
#define UTF8_4BYTE_CODEMAX		0x10FFFF

/* Bits of a byte that carry the length/continuation marker */
#define UTF8_1BYTE_MASK			0x80
#define UTF8_2BYTE_MASK			0xE0
#define UTF8_3BYTE_MASK			0xF0
#define UTF8_4BYTE_MASK			0xF8
#define UTF8_EXTBYTE_MASK		0xC0

/* Expected value of the marker bits selected by the masks above */
#define UTF8_1BYTE_PREFIX		0x00
#define UTF8_2BYTE_PREFIX		0xC0
#define UTF8_3BYTE_PREFIX		0xE0
#define UTF8_4BYTE_PREFIX		0xF0
#define UTF8_EXTBYTE_PREFIX		0x80

/* True when 'byte' is a continuation (trail) byte, i.e. 10xxxxxx */
#define UTF8_IS_EXTBYTE(byte) \
	(((byte) & UTF8_EXTBYTE_MASK) == UTF8_EXTBYTE_PREFIX)

/*
 * Sequence classifiers.  Each one is true when 'ptr' points at a lead byte
 * of the given length AND every following trail byte is a continuation
 * byte.  Trail bytes are tested one by one: OR-ing them together before
 * masking would let a byte in the range 0x00-0x3F (such as a string's NUL
 * terminator) slip through next to a genuine continuation byte.
 *
 * The && chain stops at the first failing byte, so no byte past the first
 * invalid one is read.  'ptr' is evaluated more than once; do not pass an
 * expression with side effects.
 */
#define UTF8_IS_1BYTE(ptr) \
	((((ptr)[0]) & UTF8_1BYTE_MASK) == UTF8_1BYTE_PREFIX)

#define UTF8_IS_2BYTE(ptr) \
	(((((ptr)[0]) & UTF8_2BYTE_MASK) == UTF8_2BYTE_PREFIX) && \
	UTF8_IS_EXTBYTE((ptr)[1]))

#define UTF8_IS_3BYTE(ptr) \
	(((((ptr)[0]) & UTF8_3BYTE_MASK) == UTF8_3BYTE_PREFIX) && \
	UTF8_IS_EXTBYTE((ptr)[1]) && \
	UTF8_IS_EXTBYTE((ptr)[2]))

#define UTF8_IS_4BYTE(ptr) \
	(((((ptr)[0]) & UTF8_4BYTE_MASK) == UTF8_4BYTE_PREFIX) && \
	UTF8_IS_EXTBYTE((ptr)[1]) && \
	UTF8_IS_EXTBYTE((ptr)[2]) && \
	UTF8_IS_EXTBYTE((ptr)[3]))
68
69static inline unsigned utf8CharToUnicode(const char *ptr, unsigned len)
70{
71 unsigned code = 0;
72
73 if ((len >= 1) && UTF8_IS_1BYTE(ptr))
74 {
75 code = (unsigned)(ptr[0] & (char)~UTF8_1BYTE_MASK);
76 }
77 else if ((len >= 2) && UTF8_IS_2BYTE(ptr))
78 {
79 code = (((unsigned)(ptr[0] & (char)~UTF8_2BYTE_MASK) << 6) |
80 (ptr[1] & (char)~UTF8_EXTBYTE_MASK));
81 }
82 else if ((len >= 3) && UTF8_IS_3BYTE(ptr))
83 {
84 code = (((unsigned)(ptr[0] & (char)~UTF8_3BYTE_MASK) << 12) |
85 ((unsigned)(ptr[1] & (char)~UTF8_EXTBYTE_MASK) << 6) |
86 (ptr[2] & (char)~UTF8_EXTBYTE_MASK));
87 }
88 else if ((len >= 4) && UTF8_IS_4BYTE(ptr))
89 {
90 code = (((unsigned)(ptr[0] & (char)~UTF8_4BYTE_MASK) << 18) |
91 ((unsigned)(ptr[1] & (char)~UTF8_EXTBYTE_MASK) << 12) |
92 ((unsigned)(ptr[2] & (char)~UTF8_EXTBYTE_MASK) << 6) |
93 (ptr[3] & (char)~UTF8_EXTBYTE_MASK));
94 }
95
96 return (code);
97}
98
99
100static inline unsigned utf8CodeWidth(unsigned code)
101{
102 if (code <= UTF8_1BYTE_CODEMAX)
103 return (1);
104 else if (code <= UTF8_2BYTE_CODEMAX)
105 return (2);
106 else if (code <= UTF8_3BYTE_CODEMAX)
107 return (3);
108 else if (code <= UTF8_4BYTE_CODEMAX)
109 return (4);
110 else
111 return (0);
112}
113
114
115static inline void unicodeToUtf8Char(unsigned code, unsigned char *ptr,
116 unsigned len)
117{
118 unsigned codeWidth = utf8CodeWidth(code);
119
120 if ((codeWidth == 1) && (len >= codeWidth))
121 {
122 ptr[0] = (code & UTF8_1BYTE_CODEMAX);
123 }
124 else if ((codeWidth == 2) && (len >= codeWidth))
125 {
126 ptr[0] = (UTF8_2BYTE_PREFIX | ((code & 0x07D0) >> 6));
127 ptr[1] = (UTF8_EXTBYTE_PREFIX | (code & 0x003F));
128 }
129 else if ((codeWidth == 3) && (len >= codeWidth))
130 {
131 ptr[0] = (UTF8_3BYTE_PREFIX | ((code & 0xF000) >> 12));
132 ptr[1] = (UTF8_EXTBYTE_PREFIX | ((code & 0x0FD0) >> 6));
133 ptr[2] = (UTF8_EXTBYTE_PREFIX | (code & 0x003F));
134 }
135 else if ((codeWidth == 4) && (len >= codeWidth))
136 {
137 ptr[0] = (UTF8_4BYTE_PREFIX | ((code & 0x001D0000) >> 18));
138 ptr[1] = (UTF8_EXTBYTE_PREFIX | ((code & 0x0003F000) >> 12));
139 ptr[2] = (UTF8_EXTBYTE_PREFIX | ((code & 0x00000FD0) >> 6));
140 ptr[3] = (UTF8_EXTBYTE_PREFIX | (code & 0x0000003F));
141 }
142}
143
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/*
 * Incremental UTF-8 decoder: feed one byte per call.
 *
 * state	decoder state, owned by the caller.  Start it at UTF8_ACCEPT.
 *		Values 2, 3 and 4 mean one, two or three continuation bytes are
 *		still expected.
 * codep	code point being built.  It is complete only when the call
 *		returns UTF8_ACCEPT.
 * byte	next input byte, 0x00-0xFF
 *
 * Returns the new value of *state:
 *   UTF8_ACCEPT  a full code point is available in *codep
 *   UTF8_REJECT  the input is malformed.  The state stays at UTF8_REJECT
 *                until the caller sets it back to UTF8_ACCEPT.
 *   other        more bytes are needed
 *
 * Overlong encodings, surrogates and values above 0x10FFFF are not
 * rejected.
 */
static inline uint32_t decode(uint32_t *state, uint32_t *codep, uint32_t byte)
{
	/* State entered after a first byte, indexed by its top five bits */
	static const uint8_t state_table[32] = {
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xxxxxxx */
		1, 1, 1, 1, 1, 1, 1, 1,                         /* 10xxxxxx */
		2, 2, 2, 2,                                     /* 110xxxxx */
		3, 3,                                           /* 1110xxxx */
		4,                                              /* 11110xxx */
		1                                               /* 11111xxx */
	};

	/* Payload bits of a first byte, indexed the same way */
	static const uint8_t mask_bytes[32] = {
		0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
		0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
		0x1F, 0x1F, 0x1F, 0x1F,
		0x0F, 0x0F,
		0x07,
		0x00
	};

	/* State after consuming one continuation byte */
	static const uint8_t next[5] = {
		0,
		1,
		0,
		2,
		3
	};

	/*
	 * Guard both table lookups: a byte above 0xFF would index past the
	 * 32-entry tables, and a state above 4 would index past next[].
	 */
	if ((byte > 0xFF) || (*state >= (sizeof(next) / sizeof(next[0])))) {
		*state = UTF8_REJECT;
		return *state;
	}

	if (*state == UTF8_ACCEPT) {
		/* First byte of a sequence */
		*codep = byte & mask_bytes[byte >> 3];
		*state = state_table[byte >> 3];
	}
	else if (*state != UTF8_REJECT) {
		/* Mid-sequence: only a 10xxxxxx byte may follow */
		if ((byte & 0xC0) != 0x80) {
			*state = UTF8_REJECT;
		}
		else {
			*codep = (byte & 0x3F) | (*codep << 6);
			*state = next[*state];
		}
	}

	return *state;
}
186
187#endif
unsigned int uint32_t
Definition acefiex.h:163
#define UTF8_IS_4BYTE(ptr)
Definition utf.h:65
#define UTF8_3BYTE_PREFIX
Definition utf.h:50
#define UTF8_2BYTE_PREFIX
Definition utf.h:49
#define UTF8_EXTBYTE_MASK
Definition utf.h:46
#define UTF8_4BYTE_PREFIX
Definition utf.h:51
#define UTF8_3BYTE_CODEMAX
Definition utf.h:39
#define UTF8_IS_2BYTE(ptr)
Definition utf.h:57
#define UTF8_1BYTE_CODEMAX
Definition utf.h:37
#define UTF8_ACCEPT
Definition utf.h:144
#define UTF8_IS_3BYTE(ptr)
Definition utf.h:61
#define UTF8_IS_1BYTE(ptr)
Definition utf.h:54
#define UTF8_4BYTE_CODEMAX
Definition utf.h:40
#define UTF8_EXTBYTE_PREFIX
Definition utf.h:52
#define UTF8_2BYTE_CODEMAX
Definition utf.h:38