comments updated
[henge/apc.git] / stb / stb_dxt.h
1 // stb_dxt.h - v1.04 - DXT1/DXT5 compressor - public domain
2 // original by fabian "ryg" giesen - ported to C by stb
3 // use '#define STB_DXT_IMPLEMENTATION' before including to create the implementation
4 //
5 // USAGE:
6 // call stb_compress_dxt_block() for every block (you must pad)
7 // source should be a 4x4 block of RGBA data in row-major order;
8 // A is ignored if you specify alpha=0; you can turn on dithering
9 // and "high quality" using mode.
10 //
11 // version history:
12 // v1.04 - (ryg) default to no rounding bias for lerped colors (as per S3TC/DX10 spec);
13 // single color match fix (allow for inexact color interpolation);
14 // optimal DXT5 index finder; "high quality" mode that runs multiple refinement steps.
15 // v1.03 - (stb) endianness support
16 // v1.02 - (stb) fix alpha encoding bug
17 // v1.01 - (stb) fix bug converting to RGB that messed up quality, thanks ryg & cbloom
18 // v1.00 - (stb) first release
19 //
20 // LICENSE
21 //
22 // This software is dual-licensed to the public domain and under the following
23 // license: you are granted a perpetual, irrevocable license to copy, modify,
24 // publish, and distribute this file as you see fit.
25
26 #ifndef STB_INCLUDE_STB_DXT_H
27 #define STB_INCLUDE_STB_DXT_H
28
29 // compression mode (bitflags)
30 #define STB_DXT_NORMAL 0
31 #define STB_DXT_DITHER 1 // use dithering. dubious win. never use for normal maps and the like!
32 #define STB_DXT_HIGHQUAL 2 // high quality mode, does two refinement steps instead of 1. ~30-40% slower.
33
34 void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src, int alpha, int mode);
35 #define STB_COMPRESS_DXT_BLOCK
36
37 #ifdef STB_DXT_IMPLEMENTATION
38
39 // configuration options for DXT encoder. set them in the project/makefile or just define
40 // them at the top.
41
42 // STB_DXT_USE_ROUNDING_BIAS
43 // use a rounding bias during color interpolation. this is closer to what "ideal"
44 // interpolation would do but doesn't match the S3TC/DX10 spec. old versions (pre-1.03)
45 // implicitly had this turned on.
46 //
47 // in case you're targeting a specific type of hardware (e.g. console programmers):
48 // NVidia and Intel GPUs (as of 2010) as well as DX9 ref use DXT decoders that are closer
49 // to STB_DXT_USE_ROUNDING_BIAS. AMD/ATI, S3 and DX10 ref are closer to rounding with no bias.
50 // you also see "(a*5 + b*3) / 8" on some old GPU designs.
51 // #define STB_DXT_USE_ROUNDING_BIAS
52
53 #include <stdlib.h>
54 #include <math.h>
55 #include <string.h> // memset
56
57 static unsigned char stb__Expand5[32];
58 static unsigned char stb__Expand6[64];
59 static unsigned char stb__OMatch5[256][2];
60 static unsigned char stb__OMatch6[256][2];
61 static unsigned char stb__QuantRBTab[256+16];
62 static unsigned char stb__QuantGTab[256+16];
63
64 static int stb__Mul8Bit(int a, int b)
65 {
66 int t = a*b + 128;
67 return (t + (t >> 8)) >> 8;
68 }
69
70 static void stb__From16Bit(unsigned char *out, unsigned short v)
71 {
72 int rv = (v & 0xf800) >> 11;
73 int gv = (v & 0x07e0) >> 5;
74 int bv = (v & 0x001f) >> 0;
75
76 out[0] = stb__Expand5[rv];
77 out[1] = stb__Expand6[gv];
78 out[2] = stb__Expand5[bv];
79 out[3] = 0;
80 }
81
82 static unsigned short stb__As16Bit(int r, int g, int b)
83 {
84 return (stb__Mul8Bit(r,31) << 11) + (stb__Mul8Bit(g,63) << 5) + stb__Mul8Bit(b,31);
85 }
86
87 // linear interpolation at 1/3 point between a and b, using desired rounding type
88 static int stb__Lerp13(int a, int b)
89 {
90 #ifdef STB_DXT_USE_ROUNDING_BIAS
91 // with rounding bias
92 return a + stb__Mul8Bit(b-a, 0x55);
93 #else
94 // without rounding bias
95 // replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed.
96 return (2*a + b) / 3;
97 #endif
98 }
99
100 // lerp RGB color
101 static void stb__Lerp13RGB(unsigned char *out, unsigned char *p1, unsigned char *p2)
102 {
103 out[0] = stb__Lerp13(p1[0], p2[0]);
104 out[1] = stb__Lerp13(p1[1], p2[1]);
105 out[2] = stb__Lerp13(p1[2], p2[2]);
106 }
107
108 /****************************************************************************/
109
110 // compute table to reproduce constant colors as accurately as possible
111 static void stb__PrepareOptTable(unsigned char *Table,const unsigned char *expand,int size)
112 {
113 int i,mn,mx;
114 for (i=0;i<256;i++) {
115 int bestErr = 256;
116 for (mn=0;mn<size;mn++) {
117 for (mx=0;mx<size;mx++) {
118 int mine = expand[mn];
119 int maxe = expand[mx];
120 int err = abs(stb__Lerp13(maxe, mine) - i);
121
122 // DX10 spec says that interpolation must be within 3% of "correct" result,
123 // add this as error term. (normally we'd expect a random distribution of
124 // +-1.5% error, but nowhere in the spec does it say that the error has to be
125 // unbiased - better safe than sorry).
126 err += abs(maxe - mine) * 3 / 100;
127
128 if(err < bestErr)
129 {
130 Table[i*2+0] = mx;
131 Table[i*2+1] = mn;
132 bestErr = err;
133 }
134 }
135 }
136 }
137 }
138
139 static void stb__EvalColors(unsigned char *color,unsigned short c0,unsigned short c1)
140 {
141 stb__From16Bit(color+ 0, c0);
142 stb__From16Bit(color+ 4, c1);
143 stb__Lerp13RGB(color+ 8, color+0, color+4);
144 stb__Lerp13RGB(color+12, color+4, color+0);
145 }
146
147 // Block dithering function. Simply dithers a block to 565 RGB.
148 // (Floyd-Steinberg)
149 static void stb__DitherBlock(unsigned char *dest, unsigned char *block)
150 {
151 int err[8],*ep1 = err,*ep2 = err+4, *et;
152 int ch,y;
153
154 // process channels seperately
155 for (ch=0; ch<3; ++ch) {
156 unsigned char *bp = block+ch, *dp = dest+ch;
157 unsigned char *quant = (ch == 1) ? stb__QuantGTab+8 : stb__QuantRBTab+8;
158 memset(err, 0, sizeof(err));
159 for(y=0; y<4; ++y) {
160 dp[ 0] = quant[bp[ 0] + ((3*ep2[1] + 5*ep2[0]) >> 4)];
161 ep1[0] = bp[ 0] - dp[ 0];
162 dp[ 4] = quant[bp[ 4] + ((7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]) >> 4)];
163 ep1[1] = bp[ 4] - dp[ 4];
164 dp[ 8] = quant[bp[ 8] + ((7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]) >> 4)];
165 ep1[2] = bp[ 8] - dp[ 8];
166 dp[12] = quant[bp[12] + ((7*ep1[2] + 5*ep2[3] + ep2[2]) >> 4)];
167 ep1[3] = bp[12] - dp[12];
168 bp += 16;
169 dp += 16;
170 et = ep1, ep1 = ep2, ep2 = et; // swap
171 }
172 }
173 }
174
175 // The color matching function
176 static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *color,int dither)
177 {
178 unsigned int mask = 0;
179 int dirr = color[0*4+0] - color[1*4+0];
180 int dirg = color[0*4+1] - color[1*4+1];
181 int dirb = color[0*4+2] - color[1*4+2];
182 int dots[16];
183 int stops[4];
184 int i;
185 int c0Point, halfPoint, c3Point;
186
187 for(i=0;i<16;i++)
188 dots[i] = block[i*4+0]*dirr + block[i*4+1]*dirg + block[i*4+2]*dirb;
189
190 for(i=0;i<4;i++)
191 stops[i] = color[i*4+0]*dirr + color[i*4+1]*dirg + color[i*4+2]*dirb;
192
193 // think of the colors as arranged on a line; project point onto that line, then choose
194 // next color out of available ones. we compute the crossover points for "best color in top
195 // half"/"best in bottom half" and then the same inside that subinterval.
196 //
197 // relying on this 1d approximation isn't always optimal in terms of euclidean distance,
198 // but it's very close and a lot faster.
199 // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html
200
201 c0Point = (stops[1] + stops[3]) >> 1;
202 halfPoint = (stops[3] + stops[2]) >> 1;
203 c3Point = (stops[2] + stops[0]) >> 1;
204
205 if(!dither) {
206 // the version without dithering is straightforward
207 for (i=15;i>=0;i--) {
208 int dot = dots[i];
209 mask <<= 2;
210
211 if(dot < halfPoint)
212 mask |= (dot < c0Point) ? 1 : 3;
213 else
214 mask |= (dot < c3Point) ? 2 : 0;
215 }
216 } else {
217 // with floyd-steinberg dithering
218 int err[8],*ep1 = err,*ep2 = err+4;
219 int *dp = dots, y;
220
221 c0Point <<= 4;
222 halfPoint <<= 4;
223 c3Point <<= 4;
224 for(i=0;i<8;i++)
225 err[i] = 0;
226
227 for(y=0;y<4;y++)
228 {
229 int dot,lmask,step;
230
231 dot = (dp[0] << 4) + (3*ep2[1] + 5*ep2[0]);
232 if(dot < halfPoint)
233 step = (dot < c0Point) ? 1 : 3;
234 else
235 step = (dot < c3Point) ? 2 : 0;
236 ep1[0] = dp[0] - stops[step];
237 lmask = step;
238
239 dot = (dp[1] << 4) + (7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]);
240 if(dot < halfPoint)
241 step = (dot < c0Point) ? 1 : 3;
242 else
243 step = (dot < c3Point) ? 2 : 0;
244 ep1[1] = dp[1] - stops[step];
245 lmask |= step<<2;
246
247 dot = (dp[2] << 4) + (7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]);
248 if(dot < halfPoint)
249 step = (dot < c0Point) ? 1 : 3;
250 else
251 step = (dot < c3Point) ? 2 : 0;
252 ep1[2] = dp[2] - stops[step];
253 lmask |= step<<4;
254
255 dot = (dp[3] << 4) + (7*ep1[2] + 5*ep2[3] + ep2[2]);
256 if(dot < halfPoint)
257 step = (dot < c0Point) ? 1 : 3;
258 else
259 step = (dot < c3Point) ? 2 : 0;
260 ep1[3] = dp[3] - stops[step];
261 lmask |= step<<6;
262
263 dp += 4;
264 mask |= lmask << (y*8);
265 { int *et = ep1; ep1 = ep2; ep2 = et; } // swap
266 }
267 }
268
269 return mask;
270 }
271
272 // The color optimization function. (Clever code, part 1)
273 static void stb__OptimizeColorsBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16)
274 {
275 int mind = 0x7fffffff,maxd = -0x7fffffff;
276 unsigned char *minp, *maxp;
277 double magn;
278 int v_r,v_g,v_b;
279 static const int nIterPower = 4;
280 float covf[6],vfr,vfg,vfb;
281
282 // determine color distribution
283 int cov[6];
284 int mu[3],min[3],max[3];
285 int ch,i,iter;
286
287 for(ch=0;ch<3;ch++)
288 {
289 const unsigned char *bp = ((const unsigned char *) block) + ch;
290 int muv,minv,maxv;
291
292 muv = minv = maxv = bp[0];
293 for(i=4;i<64;i+=4)
294 {
295 muv += bp[i];
296 if (bp[i] < minv) minv = bp[i];
297 else if (bp[i] > maxv) maxv = bp[i];
298 }
299
300 mu[ch] = (muv + 8) >> 4;
301 min[ch] = minv;
302 max[ch] = maxv;
303 }
304
305 // determine covariance matrix
306 for (i=0;i<6;i++)
307 cov[i] = 0;
308
309 for (i=0;i<16;i++)
310 {
311 int r = block[i*4+0] - mu[0];
312 int g = block[i*4+1] - mu[1];
313 int b = block[i*4+2] - mu[2];
314
315 cov[0] += r*r;
316 cov[1] += r*g;
317 cov[2] += r*b;
318 cov[3] += g*g;
319 cov[4] += g*b;
320 cov[5] += b*b;
321 }
322
323 // convert covariance matrix to float, find principal axis via power iter
324 for(i=0;i<6;i++)
325 covf[i] = cov[i] / 255.0f;
326
327 vfr = (float) (max[0] - min[0]);
328 vfg = (float) (max[1] - min[1]);
329 vfb = (float) (max[2] - min[2]);
330
331 for(iter=0;iter<nIterPower;iter++)
332 {
333 float r = vfr*covf[0] + vfg*covf[1] + vfb*covf[2];
334 float g = vfr*covf[1] + vfg*covf[3] + vfb*covf[4];
335 float b = vfr*covf[2] + vfg*covf[4] + vfb*covf[5];
336
337 vfr = r;
338 vfg = g;
339 vfb = b;
340 }
341
342 magn = fabs(vfr);
343 if (fabs(vfg) > magn) magn = fabs(vfg);
344 if (fabs(vfb) > magn) magn = fabs(vfb);
345
346 if(magn < 4.0f) { // too small, default to luminance
347 v_r = 299; // JPEG YCbCr luma coefs, scaled by 1000.
348 v_g = 587;
349 v_b = 114;
350 } else {
351 magn = 512.0 / magn;
352 v_r = (int) (vfr * magn);
353 v_g = (int) (vfg * magn);
354 v_b = (int) (vfb * magn);
355 }
356
357 // Pick colors at extreme points
358 for(i=0;i<16;i++)
359 {
360 int dot = block[i*4+0]*v_r + block[i*4+1]*v_g + block[i*4+2]*v_b;
361
362 if (dot < mind) {
363 mind = dot;
364 minp = block+i*4;
365 }
366
367 if (dot > maxd) {
368 maxd = dot;
369 maxp = block+i*4;
370 }
371 }
372
373 *pmax16 = stb__As16Bit(maxp[0],maxp[1],maxp[2]);
374 *pmin16 = stb__As16Bit(minp[0],minp[1],minp[2]);
375 }
376
377 static int stb__sclamp(float y, int p0, int p1)
378 {
379 int x = (int) y;
380 if (x < p0) return p0;
381 if (x > p1) return p1;
382 return x;
383 }
384
385 // The refinement function. (Clever code, part 2)
386 // Tries to optimize colors to suit block contents better.
387 // (By solving a least squares system via normal equations+Cramer's rule)
388 static int stb__RefineBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16, unsigned int mask)
389 {
390 static const int w1Tab[4] = { 3,0,2,1 };
391 static const int prods[4] = { 0x090000,0x000900,0x040102,0x010402 };
392 // ^some magic to save a lot of multiplies in the accumulating loop...
393 // (precomputed products of weights for least squares system, accumulated inside one 32-bit register)
394
395 float frb,fg;
396 unsigned short oldMin, oldMax, min16, max16;
397 int i, akku = 0, xx,xy,yy;
398 int At1_r,At1_g,At1_b;
399 int At2_r,At2_g,At2_b;
400 unsigned int cm = mask;
401
402 oldMin = *pmin16;
403 oldMax = *pmax16;
404
405 if((mask ^ (mask<<2)) < 4) // all pixels have the same index?
406 {
407 // yes, linear system would be singular; solve using optimal
408 // single-color match on average color
409 int r = 8, g = 8, b = 8;
410 for (i=0;i<16;++i) {
411 r += block[i*4+0];
412 g += block[i*4+1];
413 b += block[i*4+2];
414 }
415
416 r >>= 4; g >>= 4; b >>= 4;
417
418 max16 = (stb__OMatch5[r][0]<<11) | (stb__OMatch6[g][0]<<5) | stb__OMatch5[b][0];
419 min16 = (stb__OMatch5[r][1]<<11) | (stb__OMatch6[g][1]<<5) | stb__OMatch5[b][1];
420 } else {
421 At1_r = At1_g = At1_b = 0;
422 At2_r = At2_g = At2_b = 0;
423 for (i=0;i<16;++i,cm>>=2) {
424 int step = cm&3;
425 int w1 = w1Tab[step];
426 int r = block[i*4+0];
427 int g = block[i*4+1];
428 int b = block[i*4+2];
429
430 akku += prods[step];
431 At1_r += w1*r;
432 At1_g += w1*g;
433 At1_b += w1*b;
434 At2_r += r;
435 At2_g += g;
436 At2_b += b;
437 }
438
439 At2_r = 3*At2_r - At1_r;
440 At2_g = 3*At2_g - At1_g;
441 At2_b = 3*At2_b - At1_b;
442
443 // extract solutions and decide solvability
444 xx = akku >> 16;
445 yy = (akku >> 8) & 0xff;
446 xy = (akku >> 0) & 0xff;
447
448 frb = 3.0f * 31.0f / 255.0f / (xx*yy - xy*xy);
449 fg = frb * 63.0f / 31.0f;
450
451 // solve.
452 max16 = stb__sclamp((At1_r*yy - At2_r*xy)*frb+0.5f,0,31) << 11;
453 max16 |= stb__sclamp((At1_g*yy - At2_g*xy)*fg +0.5f,0,63) << 5;
454 max16 |= stb__sclamp((At1_b*yy - At2_b*xy)*frb+0.5f,0,31) << 0;
455
456 min16 = stb__sclamp((At2_r*xx - At1_r*xy)*frb+0.5f,0,31) << 11;
457 min16 |= stb__sclamp((At2_g*xx - At1_g*xy)*fg +0.5f,0,63) << 5;
458 min16 |= stb__sclamp((At2_b*xx - At1_b*xy)*frb+0.5f,0,31) << 0;
459 }
460
461 *pmin16 = min16;
462 *pmax16 = max16;
463 return oldMin != min16 || oldMax != max16;
464 }
465
466 // Color block compression
467 static void stb__CompressColorBlock(unsigned char *dest, unsigned char *block, int mode)
468 {
469 unsigned int mask;
470 int i;
471 int dither;
472 int refinecount;
473 unsigned short max16, min16;
474 unsigned char dblock[16*4],color[4*4];
475
476 dither = mode & STB_DXT_DITHER;
477 refinecount = (mode & STB_DXT_HIGHQUAL) ? 2 : 1;
478
479 // check if block is constant
480 for (i=1;i<16;i++)
481 if (((unsigned int *) block)[i] != ((unsigned int *) block)[0])
482 break;
483
484 if(i == 16) { // constant color
485 int r = block[0], g = block[1], b = block[2];
486 mask = 0xaaaaaaaa;
487 max16 = (stb__OMatch5[r][0]<<11) | (stb__OMatch6[g][0]<<5) | stb__OMatch5[b][0];
488 min16 = (stb__OMatch5[r][1]<<11) | (stb__OMatch6[g][1]<<5) | stb__OMatch5[b][1];
489 } else {
490 // first step: compute dithered version for PCA if desired
491 if(dither)
492 stb__DitherBlock(dblock,block);
493
494 // second step: pca+map along principal axis
495 stb__OptimizeColorsBlock(dither ? dblock : block,&max16,&min16);
496 if (max16 != min16) {
497 stb__EvalColors(color,max16,min16);
498 mask = stb__MatchColorsBlock(block,color,dither);
499 } else
500 mask = 0;
501
502 // third step: refine (multiple times if requested)
503 for (i=0;i<refinecount;i++) {
504 unsigned int lastmask = mask;
505
506 if (stb__RefineBlock(dither ? dblock : block,&max16,&min16,mask)) {
507 if (max16 != min16) {
508 stb__EvalColors(color,max16,min16);
509 mask = stb__MatchColorsBlock(block,color,dither);
510 } else {
511 mask = 0;
512 break;
513 }
514 }
515
516 if(mask == lastmask)
517 break;
518 }
519 }
520
521 // write the color block
522 if(max16 < min16)
523 {
524 unsigned short t = min16;
525 min16 = max16;
526 max16 = t;
527 mask ^= 0x55555555;
528 }
529
530 dest[0] = (unsigned char) (max16);
531 dest[1] = (unsigned char) (max16 >> 8);
532 dest[2] = (unsigned char) (min16);
533 dest[3] = (unsigned char) (min16 >> 8);
534 dest[4] = (unsigned char) (mask);
535 dest[5] = (unsigned char) (mask >> 8);
536 dest[6] = (unsigned char) (mask >> 16);
537 dest[7] = (unsigned char) (mask >> 24);
538 }
539
540 // Alpha block compression (this is easy for a change)
541 static void stb__CompressAlphaBlock(unsigned char *dest,unsigned char *src,int mode)
542 {
543 int i,dist,bias,dist4,dist2,bits,mask;
544
545 // find min/max color
546 int mn,mx;
547 mn = mx = src[3];
548
549 for (i=1;i<16;i++)
550 {
551 if (src[i*4+3] < mn) mn = src[i*4+3];
552 else if (src[i*4+3] > mx) mx = src[i*4+3];
553 }
554
555 // encode them
556 ((unsigned char *)dest)[0] = mx;
557 ((unsigned char *)dest)[1] = mn;
558 dest += 2;
559
560 // determine bias and emit color indices
561 // given the choice of mx/mn, these indices are optimal:
562 // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
563 dist = mx-mn;
564 dist4 = dist*4;
565 dist2 = dist*2;
566 bias = (dist < 8) ? (dist - 1) : (dist/2 + 2);
567 bias -= mn * 7;
568 bits = 0,mask=0;
569
570 for (i=0;i<16;i++) {
571 int a = src[i*4+3]*7 + bias;
572 int ind,t;
573
574 // select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
575 t = (a >= dist4) ? -1 : 0; ind = t & 4; a -= dist4 & t;
576 t = (a >= dist2) ? -1 : 0; ind += t & 2; a -= dist2 & t;
577 ind += (a >= dist);
578
579 // turn linear scale into DXT index (0/1 are extremal pts)
580 ind = -ind & 7;
581 ind ^= (2 > ind);
582
583 // write index
584 mask |= ind << bits;
585 if((bits += 3) >= 8) {
586 *dest++ = mask;
587 mask >>= 8;
588 bits -= 8;
589 }
590 }
591 }
592
593 static void stb__InitDXT()
594 {
595 int i;
596 for(i=0;i<32;i++)
597 stb__Expand5[i] = (i<<3)|(i>>2);
598
599 for(i=0;i<64;i++)
600 stb__Expand6[i] = (i<<2)|(i>>4);
601
602 for(i=0;i<256+16;i++)
603 {
604 int v = i-8 < 0 ? 0 : i-8 > 255 ? 255 : i-8;
605 stb__QuantRBTab[i] = stb__Expand5[stb__Mul8Bit(v,31)];
606 stb__QuantGTab[i] = stb__Expand6[stb__Mul8Bit(v,63)];
607 }
608
609 stb__PrepareOptTable(&stb__OMatch5[0][0],stb__Expand5,32);
610 stb__PrepareOptTable(&stb__OMatch6[0][0],stb__Expand6,64);
611 }
612
613 void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src, int alpha, int mode)
614 {
615 static int init=1;
616 if (init) {
617 stb__InitDXT();
618 init=0;
619 }
620
621 if (alpha) {
622 stb__CompressAlphaBlock(dest,(unsigned char*) src,mode);
623 dest += 8;
624 }
625
626 stb__CompressColorBlock(dest,(unsigned char*) src,mode);
627 }
628 #endif // STB_DXT_IMPLEMENTATION
629
630 #endif // STB_INCLUDE_STB_DXT_H