added stb, more binaryout changes"
[henge/apc.git] / stb / tools / unicode.c
1 #define STB_DEFINE
2 #include "../stb.h"
3
4 // create unicode mappings
5 //
6 // Two kinds of mappings:
7 // map to a number
8 // map to a bit
9 //
10 // For mapping to a number, we use the following strategy:
11 //
12 // User supplies:
13 // 1. a table of numbers (for now we use uint16, so full Unicode table is 4MB)
14 // 2. a "don't care" value
15 // 3. define a 'fallback' value (typically 0)
16 // 4. define a fast-path range (typically 0..255 or 0..1023) [@TODO: automate detecting this]
17 //
18 // Code:
19 // 1. Determine range of *end* of unicode codepoints (U+10FFFF and down) which
20 // all have the same value (or don't care). If large enough, emit this as a
21 // special case in the code.
22 // 2. Repeat above, limited to at most U+FFFF.
23 // 3. Cluster the data into intervals of 8,16,32,64,128,256 numeric values.
24 // 3a. If all the values in an interval are fallback/dont-care, no further processing
25 // 3b. Find the "trimmed range" outside which all the values are the fallback or don't care
26 // 3c. Find the "special trimmed range" outside which all the values are some constant or don't care
27 // 4. Pack the clusters into continuous memory, and find previous instances of
28 // the cluster. Repeat for trimmed & special-trimmed. In the first case, find
29 // previous instances of the cluster (allow don't-care to match in either
30 // direction), both aligned and mis-aligned; in the latter, starting where
31 // things start or mis-aligned. Build an index table specifiying the
32 // location of each cluster (and its length). Allow an extra indirection here;
33 // the full-sized index can index a smaller table which has the actual offset
34 // (and lengths).
35 // 5. Associate with each packed continuous memory above the amount of memory
36 // required to store the data w/ smallest datatype (of uint8, uint16, uint32).
37 // Discard the continuous memory. Recurse on each index table, but avoid the
38 // smaller packing.
39 //
40 // For mapping to a bit, we pack the results for 8 characters into a byte, and then apply
41 // the above strategy. Note that there may be more optimal approaches with e.g. packing
42 // 8 different bits into a single structure, though, which we should explore eventually.
43
44
45 // currently we limit *indices* to being 2^16, and we pack them as
46 // index + end_trim*2^16 + start_trim*2^24; specials have to go in a separate table
47 typedef uint32 uval;
48 #define UVAL_DONT_CARE_DEFAULT 0xffffffff
49
50 typedef struct
51 {
52 uval *input;
53 uint32 dont_care;
54 uint32 fallback;
55 int fastpath;
56 int length;
57 int depth;
58 int has_sign;
59 int splittable;
60 int replace_fallback_with_codepoint;
61 size_t input_size;
62 size_t inherited_storage;
63 } table;
64
65 typedef struct
66 {
67 int split_log2;
68 table result; // index into not-returned table
69 int storage;
70 } output;
71
72 typedef struct
73 {
74 table t;
75 char **output_name;
76 } info;
77
78 typedef struct
79 {
80 size_t path;
81 size_t size;
82 } result;
83
84 typedef struct
85 {
86 uint8 trim_end;
87 uint8 trim_start;
88 uint8 special;
89 uint8 aligned;
90 uint8 indirect;
91
92 uint16 overhead; // add some forced overhead for each mode to avoid getting complex encoding when it doesn't save much
93
94 } mode_info;
95
96 mode_info modes[] =
97 {
98 { 0,0,0,0,0, 32, },
99 { 0,0,0,0,1, 100, },
100 { 0,0,0,1,0, 32, },
101 { 0,0,0,1,1, 100, },
102 { 0,0,1,0,1, 100, },
103 { 0,0,1,1,0, 32, },
104 { 0,0,1,1,1, 200, },
105 { 1,0,0,0,0, 100, },
106 { 1,0,0,0,1, 120, },
107 { 1,1,0,0,0, 100, },
108 { 1,1,0,0,1, 130, },
109 { 1,0,1,0,0, 130, },
110 { 1,0,1,0,1, 180, },
111 { 1,1,1,0,0, 180, },
112 { 1,1,1,0,1, 200, },
113 };
114
115 #define MODECOUNT (sizeof(modes)/sizeof(modes[0]))
116 #define CLUSTERSIZECOUNT 6 // 8,16, 32,64, 128,256
117
118 size_t size_for_max_number(uint32 number)
119 {
120 if (number == 0) return 0;
121 if (number < 256) return 1;
122 if (number < 256*256) return 2;
123 if (number < 256*256*256) return 3;
124 return 4;
125 }
126
127 size_t size_for_max_number_aligned(uint32 number)
128 {
129 size_t n = size_for_max_number(number);
130 return n == 3 ? 4 : n;
131 }
132
133 uval get_data(uval *data, int offset, uval *end)
134 {
135 if (data + offset >= end)
136 return 0;
137 else
138 return data[offset];
139 }
140
141 int safe_len(uval *data, int len, uval *end)
142 {
143 if (len > end - data)
144 return end - data;
145 return len;
146 }
147
148 uval tempdata[256];
149 int dirty=0;
150
151 size_t find_packed(uval **packed, uval *data, int len, int aligned, int fastpath, uval *end, int offset, int replace)
152 {
153 int packlen = stb_arr_len(*packed);
154 int i,p;
155
156 if (data+len > end || replace) {
157 int safelen = safe_len(data, len, end);
158 memset(tempdata, 0, dirty*sizeof(tempdata[0]));
159 memcpy(tempdata, data, safelen * sizeof(data[0]));
160 data = tempdata;
161 dirty = len;
162 }
163 if (replace) {
164 int i;
165 int safelen = safe_len(data, len, end);
166 for (i=0; i < safelen; ++i)
167 if (data[i] == 0)
168 data[i] = offset+i;
169 }
170
171 if (len <= 0)
172 return 0;
173 if (!fastpath) {
174 if (aligned) {
175 for (i=0; i < packlen; i += len)
176 if ((*packed)[i] == data[0] && 0==memcmp(&(*packed)[i], data, len * sizeof(uval)))
177 return i / len;
178 } else {
179 for (i=0; i < packlen-len+1; i += 1 )
180 if ((*packed)[i] == data[0] && 0==memcmp(&(*packed)[i], data, len * sizeof(uval)))
181 return i;
182 }
183 }
184 p = stb_arr_len(*packed);
185 for (i=0; i < len; ++i)
186 stb_arr_push(*packed, data[i]);
187 return p;
188 }
189
190 void output_table(char *name1, char *name2, uval *data, int length, int sign, char **names)
191 {
192 char temp[20];
193 uval maxv = 0;
194 int bytes, numlen, at_newline;
195 int linelen = 79; // @TODO: make table more readable by choosing a length that's a multiple?
196 int i,pos, do_split=0;
197 for (i=0; i < length; ++i)
198 if (sign)
199 maxv = stb_max(maxv, (uval)abs((int)data[i]));
200 else
201 maxv = stb_max(maxv, data[i]);
202 bytes = size_for_max_number_aligned(maxv);
203 sprintf(temp, "%d", maxv);
204 numlen=strlen(temp);
205 if (sign)
206 ++numlen;
207
208 if (bytes == 0)
209 return;
210
211 printf("uint%d %s%s[%d] = {\n", bytes*8, name1, name2, length);
212 at_newline = 1;
213 for (i=0; i < length; ++i) {
214 if (pos + numlen + 2 > linelen) {
215 printf("\n");
216 at_newline = 1;
217 pos = 0;
218 }
219 if (at_newline) {
220 printf(" ");
221 pos = 2;
222 at_newline = 0;
223 } else {
224 printf(" ");
225 ++pos;
226 }
227 printf("%*d,", numlen, data[i]);
228 pos += numlen+1;
229 }
230 if (!at_newline) printf("\n");
231 printf("};\n");
232 }
233
234 void output_table_with_trims(char *name1, char *name2, uval *data, int length)
235 {
236 uval maxt=0, maxp=0;
237 int i,d,s,e, count;
238 // split the table into two pieces
239 uval *trims = NULL;
240
241 if (length == 0)
242 return;
243
244 for (i=0; i < stb_arr_len(data); ++i) {
245 stb_arr_push(trims, data[i] >> 16);
246 data[i] &= 0xffff;
247 maxt = stb_max(maxt, trims[i]);
248 maxp = stb_max(maxp, data[i]);
249 }
250
251 d=s=e=1;
252 if (maxt >= 256) {
253 // need to output start & end values
254 if (maxp >= 256) {
255 // can pack into a single table
256 printf("struct { uint16 val; uint8 start, end; } %s%s[%d] = {\n", name1, name2, length);
257 } else {
258 output_table(name1, name2, data, length, 0, 0);
259 d=0;
260 printf("struct { uint8 start, end; } %s%s_trim[%d] = {\n", name1, name2, length);
261 }
262 } else if (maxt > 0) {
263 if (maxp >= 256) {
264 output_table(name1, name2, data, length, 0, 0);
265 output_table(name1, stb_sprintf("%s_end", name2), trims, length, 0, 0);
266 return;
267 } else {
268 printf("struct { uint8 val, end; } %s%s[%d] = {\n", name1, name2, length);
269 s=0;
270 }
271 } else {
272 output_table(name1, name2, data, length, 0, 0);
273 return;
274 }
275 // d or s can be zero (but not both), e is always present and last
276 count = d + s + e;
277 assert(count >= 2 && count <= 3);
278
279 {
280 char temp[60];
281 uval maxv = 0;
282 int numlen, at_newline, len;
283 int linelen = 79; // @TODO: make table more readable by choosing a length that's a multiple?
284 int i,pos, do_split=0;
285 numlen = 0;
286 for (i=0; i < length; ++i) {
287 if (count == 2)
288 sprintf(temp, "{%d,%d}", d ? data[i] : (trims[i]>>8), trims[i]&255);
289 else
290 sprintf(temp, "{%d,%d,%d}", data[i], trims[i]>>8, trims[i]&255);
291 len = strlen(temp);
292 numlen = stb_max(len, numlen);
293 }
294
295 at_newline = 1;
296 for (i=0; i < length; ++i) {
297 if (pos + numlen + 2 > linelen) {
298 printf("\n");
299 at_newline = 1;
300 pos = 0;
301 }
302 if (at_newline) {
303 printf(" ");
304 pos = 2;
305 at_newline = 0;
306 } else {
307 printf(" ");
308 ++pos;
309 }
310 if (count == 2)
311 sprintf(temp, "{%d,%d}", d ? data[i] : (trims[i]>>8), trims[i]&255);
312 else
313 sprintf(temp, "{%d,%d,%d}", data[i], trims[i]>>8, trims[i]&255);
314 printf("%*s,", numlen, temp);
315 pos += numlen+1;
316 }
317 if (!at_newline) printf("\n");
318 printf("};\n");
319 }
320 }
321
322 int weight=1;
323
324 table pack_for_mode(table *t, int mode, char *table_name)
325 {
326 size_t extra_size;
327 int i;
328 uval maxv;
329 mode_info mi = modes[mode % MODECOUNT];
330 int size = 8 << (mode / MODECOUNT);
331 table newtab;
332 uval *packed = NULL;
333 uval *index = NULL;
334 uval *indirect = NULL;
335 uval *specials = NULL;
336 newtab.dont_care = UVAL_DONT_CARE_DEFAULT;
337 if (table_name)
338 printf("// clusters of %d\n", size);
339 for (i=0; i < t->length; i += size) {
340 uval newval;
341 int fastpath = (i < t->fastpath);
342 if (mi.special) {
343 int end_trim = size-1;
344 int start_trim = 0;
345 uval special;
346 // @TODO: pick special from start or end instead of only end depending on which is longer
347 for(;;) {
348 special = t->input[i + end_trim];
349 if (special != t->dont_care || end_trim == 0)
350 break;
351 --end_trim;
352 }
353 // at this point, special==inp[end_trim], and end_trim >= 0
354 if (special == t->dont_care && !fastpath) {
355 // entire block is don't care, so OUTPUT don't care
356 stb_arr_push(index, newtab.dont_care);
357 continue;
358 } else {
359 uval pos, trim;
360 if (mi.trim_end && !fastpath) {
361 while (end_trim >= 0) {
362 if (t->input[i + end_trim] == special || t->input[i + end_trim] == t->dont_care)
363 --end_trim;
364 else
365 break;
366 }
367 }
368
369 if (mi.trim_start && !fastpath) {
370 while (start_trim < end_trim) {
371 if (t->input[i + start_trim] == special || t->input[i + start_trim] == t->dont_care)
372 ++start_trim;
373 else
374 break;
375 }
376 }
377
378 // end_trim points to the last character we have to output
379
380 // find the first match, or add it
381 pos = find_packed(&packed, &t->input[i+start_trim], end_trim-start_trim+1, mi.aligned, fastpath, &t->input[t->length], i+start_trim, t->replace_fallback_with_codepoint);
382
383 // encode as a uval
384 if (!mi.trim_end) {
385 if (end_trim == 0)
386 pos = special;
387 else
388 pos = pos | 0x80000000;
389 } else {
390 assert(end_trim < size && end_trim >= -1);
391 if (!fastpath) assert(end_trim < size-1); // special always matches last one
392 assert(end_trim < size && end_trim+1 >= 0);
393 if (!fastpath) assert(end_trim+1 < size);
394
395 if (mi.trim_start)
396 trim = start_trim*256 + (end_trim+1);
397 else
398 trim = end_trim+1;
399
400 assert(pos < 65536); // @TODO: if this triggers, just bail on this search path
401 pos = pos + (trim << 16);
402 }
403
404 newval = pos;
405
406 stb_arr_push(specials, special);
407 }
408 } else if (mi.trim_end) {
409 int end_trim = size-1;
410 int start_trim = 0;
411 uval pos, trim;
412
413 while (end_trim >= 0 && !fastpath)
414 if (t->input[i + end_trim] == t->fallback || t->input[i + end_trim] == t->dont_care)
415 --end_trim;
416 else
417 break;
418
419 if (mi.trim_start && !fastpath) {
420 while (start_trim < end_trim) {
421 if (t->input[i + start_trim] == t->fallback || t->input[i + start_trim] == t->dont_care)
422 ++start_trim;
423 else
424 break;
425 }
426 }
427
428 // end_trim points to the last character we have to output, and can be -1
429 ++end_trim; // make exclusive at end
430
431 if (end_trim == 0 && size == 256)
432 start_trim = end_trim = 1; // we can't make encode a length from 0..256 in 8 bits, so restrict end_trim to 1..256
433
434 // find the first match, or add it
435 pos = find_packed(&packed, &t->input[i+start_trim], end_trim - start_trim, mi.aligned, fastpath, &t->input[t->length], i+start_trim, t->replace_fallback_with_codepoint);
436
437 assert(end_trim <= size && end_trim >= 0);
438 if (size == 256)
439 assert(end_trim-1 < 256 && end_trim-1 >= 0);
440 else
441 assert(end_trim < 256 && end_trim >= 0);
442 if (size == 256)
443 --end_trim;
444
445 if (mi.trim_start)
446 trim = start_trim*256 + end_trim;
447 else
448 trim = end_trim;
449
450 assert(pos < 65536); // @TODO: if this triggers, just bail on this search path
451 pos = pos + (trim << 16);
452
453 newval = pos;
454 } else {
455 newval = find_packed(&packed, &t->input[i], size, mi.aligned, fastpath, &t->input[t->length], i, t->replace_fallback_with_codepoint);
456 }
457
458 if (mi.indirect) {
459 int j;
460 for (j=0; j < stb_arr_len(indirect); ++j)
461 if (indirect[j] == newval)
462 break;
463 if (j == stb_arr_len(indirect))
464 stb_arr_push(indirect, newval);
465 stb_arr_push(index, j);
466 } else {
467 stb_arr_push(index, newval);
468 }
469 }
470
471 // total up the new size for everything but the index table
472 extra_size = mi.overhead * weight; // not the actual overhead cost; a penalty to avoid excessive complexity
473 extra_size += 150; // per indirection
474 if (table_name)
475 extra_size = 0;
476
477 if (t->has_sign) {
478 // 'packed' contains two values, which should be packed positive & negative for size
479 uval maxv2;
480 for (i=0; i < stb_arr_len(packed); ++i)
481 if (packed[i] & 0x80000000)
482 maxv2 = stb_max(maxv2, packed[i]);
483 else
484 maxv = stb_max(maxv, packed[i]);
485 maxv = stb_max(maxv, maxv2) << 1;
486 } else {
487 maxv = 0;
488 for (i=0; i < stb_arr_len(packed); ++i)
489 if (packed[i] > maxv && packed[i] != t->dont_care)
490 maxv = packed[i];
491 }
492 extra_size += stb_arr_len(packed) * (t->splittable ? size_for_max_number(maxv) : size_for_max_number_aligned(maxv));
493 if (table_name) {
494 if (t->splittable)
495 output_table_with_trims(table_name, "", packed, stb_arr_len(packed));
496 else
497 output_table(table_name, "", packed, stb_arr_len(packed), t->has_sign, NULL);
498 }
499
500 maxv = 0;
501 for (i=0; i < stb_arr_len(specials); ++i)
502 if (specials[i] > maxv)
503 maxv = specials[i];
504 extra_size += stb_arr_len(specials) * size_for_max_number_aligned(maxv);
505 if (table_name)
506 output_table(table_name, "_default", specials, stb_arr_len(specials), 0, NULL);
507
508 maxv = 0;
509 for (i=0; i < stb_arr_len(indirect); ++i)
510 if (indirect[i] > maxv)
511 maxv = indirect[i];
512 extra_size += stb_arr_len(indirect) * size_for_max_number(maxv);
513
514 if (table_name && stb_arr_len(indirect)) {
515 if (mi.trim_end)
516 output_table_with_trims(table_name, "_index", indirect, stb_arr_len(indirect));
517 else {
518 assert(0); // this case should only trigger in very extreme circumstances
519 output_table(table_name, "_index", indirect, stb_arr_len(indirect), 0, NULL);
520 }
521 mi.trim_end = mi.special = 0;
522 }
523
524 if (table_name)
525 printf("// above tables should be %d bytes\n", extra_size);
526
527 maxv = 0;
528 for (i=0; i < stb_arr_len(index); ++i)
529 if (index[i] > maxv && index[i] != t->dont_care)
530 maxv = index[i];
531 newtab.splittable = mi.trim_end;
532 newtab.input_size = newtab.splittable ? size_for_max_number(maxv) : size_for_max_number_aligned(maxv);
533 newtab.input = index;
534 newtab.length = stb_arr_len(index);
535 newtab.inherited_storage = t->inherited_storage + extra_size;
536 newtab.fastpath = 0;
537 newtab.depth = t->depth+1;
538 stb_arr_free(indirect);
539 stb_arr_free(packed);
540 stb_arr_free(specials);
541
542 return newtab;
543 }
544
545 result pack_table(table *t, size_t path, int min_storage)
546 {
547 int i;
548 result best;
549 best.size = t->inherited_storage + t->input_size * t->length;
550 best.path = path;
551
552 if ((int) t->inherited_storage > min_storage) {
553 best.size = stb_max(best.size, t->inherited_storage);
554 return best;
555 }
556
557 if (t->length <= 256 || t->depth >= 4) {
558 //printf("%08x: %7d\n", best.path, best.size);
559 return best;
560 }
561
562 path <<= 7;
563 for (i=0; i < MODECOUNT * CLUSTERSIZECOUNT; ++i) {
564 table newtab;
565 result r;
566 newtab = pack_for_mode(t, i, 0);
567 r = pack_table(&newtab, path+i+1, min_storage);
568 if (r.size < best.size)
569 best = r;
570 stb_arr_free(newtab.input);
571 //printf("Size: %6d + %6d\n", newtab.inherited_storage, newtab.input_size * newtab.length);
572 }
573 return best;
574 }
575
576 int pack_table_by_modes(table *t, int *modes)
577 {
578 table s = *t;
579 while (*modes > -1) {
580 table newtab;
581 newtab = pack_for_mode(&s, *modes, 0);
582 if (s.input != t->input)
583 stb_arr_free(s.input);
584 s = newtab;
585 ++modes;
586 }
587 return s.inherited_storage + s.input_size * s.length;
588 }
589
590 int strip_table(table *t, int exceptions)
591 {
592 uval terminal_value;
593 int p = t->length-1;
594 while (t->input[p] == t->dont_care)
595 --p;
596 terminal_value = t->input[p];
597
598 while (p >= 0x10000) {
599 if (t->input[p] != terminal_value && t->input[p] != t->dont_care) {
600 if (exceptions)
601 --exceptions;
602 else
603 break;
604 }
605 --p;
606 }
607 return p+1; // p is a character we must output
608 }
609
610 void optimize_table(table *t, char *table_name)
611 {
612 int modelist[3] = { 85, -1 };
613 int modes[8];
614 int num_modes = 0;
615 int decent_size;
616 result r;
617 size_t path;
618 table s;
619
620 // strip tail end of table
621 int orig_length = t->length;
622 int threshhold = 0xffff;
623 int p = strip_table(t, 2);
624 int len_saved = t->length - p;
625 if (len_saved >= threshhold) {
626 t->length = p;
627 while (p > 0x10000) {
628 p = strip_table(t, 0);
629 len_saved = t->length - p;
630 if (len_saved < 0x10000)
631 break;
632 len_saved = orig_length - p;
633 if (len_saved < threshhold)
634 break;
635 threshhold *= 2;
636 }
637 }
638
639 t->depth = 1;
640
641
642 // find size of table if we use path 86
643 decent_size = pack_table_by_modes(t, modelist);
644
645
646 #if 1
647 // find best packing of remainder of table by exploring tree of packings
648 r = pack_table(t, 0, decent_size);
649 // use the computed 'path' to evaluate and output tree
650 path = r.path;
651 #else
652 path = 86;//90;//132097;
653 #endif
654
655 while (path) {
656 modes[num_modes++] = (path & 127) - 1;
657 path >>= 7;
658 }
659
660 printf("// modes: %d\n", r.path);
661 s = *t;
662 while (num_modes > 0) {
663 char name[256];
664 sprintf(name, "%s_%d", table_name, num_modes+1);
665 --num_modes;
666 s = pack_for_mode(&s, modes[num_modes], name);
667 }
668 // output the final table as-is
669 if (s.splittable)
670 output_table_with_trims(table_name, "_1", s.input, s.length);
671 else
672 output_table(table_name, "_1", s.input, s.length, 0, NULL);
673 }
674
675 uval unicode_table[0x110000];
676
677 typedef struct
678 {
679 uval lo,hi;
680 } char_range;
681
682 char_range get_range(char *str)
683 {
684 char_range cr;
685 char *p;
686 cr.lo = strtol(str, &p, 16);
687 p = stb_skipwhite(p);
688 if (*p == '.')
689 cr.hi = strtol(p+2, NULL, 16);
690 else
691 cr.hi = cr.lo;
692 return cr;
693 }
694
695 char *skip_semi(char *s, int count)
696 {
697 while (count) {
698 s = strchr(s, ';');
699 assert(s != NULL);
700 ++s;
701 --count;
702 }
703 return s;
704 }
705
706 int main(int argc, char **argv)
707 {
708 table t;
709 uval maxv=0;
710 int i,n=0;
711 char **s = stb_stringfile("../../data/UnicodeData.txt", &n);
712 assert(s);
713 for (i=0; i < n; ++i) {
714 if (s[i][0] == '#' || s[i][0] == '\n' || s[i][0] == 0)
715 ;
716 else {
717 char_range cr = get_range(s[i]);
718 char *t = skip_semi(s[i], 13);
719 uval j, v;
720 if (*t == ';' || *t == '\n' || *t == 0)
721 v = 0;
722 else {
723 v = strtol(t, NULL, 16);
724 if (v < 65536) {
725 maxv = stb_max(v, maxv);
726 for (j=cr.lo; j <= cr.hi; ++j) {
727 unicode_table[j] = v;
728 //printf("%06x => %06x\n", j, v);
729 }
730 }
731 }
732 }
733 }
734
735 t.depth = 0;
736 t.dont_care = UVAL_DONT_CARE_DEFAULT;
737 t.fallback = 0;
738 t.fastpath = 256;
739 t.inherited_storage = 0;
740 t.has_sign = 0;
741 t.splittable = 0;
742 t.input = unicode_table;
743 t.input_size = size_for_max_number(maxv);
744 t.length = 0x110000;
745 t.replace_fallback_with_codepoint = 1;
746
747 optimize_table(&t, "stbu_upppercase");
748 return 0;
749 }