polyadvent

A game engine from scratch in C
git clone git://jb55.com/polyadvent
Log | Files | Refs | README

stb_image.c (227313B)


      1 
      2 #include "stb_image.h"
      3 
      4 #if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \
      5   || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
      6   || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
      7   || defined(STBI_ONLY_ZLIB)
      8    #ifndef STBI_ONLY_JPEG
      9    #define STBI_NO_JPEG
     10    #endif
     11    #ifndef STBI_ONLY_PNG
     12    #define STBI_NO_PNG
     13    #endif
     14    #ifndef STBI_ONLY_BMP
     15    #define STBI_NO_BMP
     16    #endif
     17    #ifndef STBI_ONLY_PSD
     18    #define STBI_NO_PSD
     19    #endif
     20    #ifndef STBI_ONLY_TGA
     21    #define STBI_NO_TGA
     22    #endif
     23    #ifndef STBI_ONLY_GIF
     24    #define STBI_NO_GIF
     25    #endif
     26    #ifndef STBI_ONLY_HDR
     27    #define STBI_NO_HDR
     28    #endif
     29    #ifndef STBI_ONLY_PIC
     30    #define STBI_NO_PIC
     31    #endif
     32    #ifndef STBI_ONLY_PNM
     33    #define STBI_NO_PNM
     34    #endif
     35 #endif
     36 
     37 #if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
     38 #define STBI_NO_ZLIB
     39 #endif
     40 
     41 
     42 #include <stdarg.h>
     43 #include <stddef.h> // ptrdiff_t on osx
     44 #include <stdlib.h>
     45 #include <string.h>
     46 #include <limits.h>
     47 
     48 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
     49 #include <math.h>  // ldexp, pow
     50 #endif
     51 
     52 #ifndef STBI_NO_STDIO
     53 #include <stdio.h>
     54 #endif
     55 
     56 #ifndef STBI_ASSERT
     57 #include <assert.h>
     58 #define STBI_ASSERT(x) assert(x)
     59 #endif
     60 
     61 
     62 #ifndef _MSC_VER
     63    #ifdef __cplusplus
     64    #define stbi_inline inline
     65    #else
     66    #define stbi_inline
     67    #endif
     68 #else
     69    #define stbi_inline __forceinline
     70 #endif
     71 
     72 
     73 #ifdef _MSC_VER
     74 typedef unsigned short stbi__uint16;
     75 typedef   signed short stbi__int16;
     76 typedef unsigned int   stbi__uint32;
     77 typedef   signed int   stbi__int32;
     78 #else
     79 #include <stdint.h>
     80 typedef uint16_t stbi__uint16;
     81 typedef int16_t  stbi__int16;
     82 typedef uint32_t stbi__uint32;
     83 typedef int32_t  stbi__int32;
     84 #endif
     85 
     86 // should produce compiler error if size is wrong
     87 typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
     88 
     89 #ifdef _MSC_VER
     90 #define STBI_NOTUSED(v)  (void)(v)
     91 #else
     92 #define STBI_NOTUSED(v)  (void)sizeof(v)
     93 #endif
     94 
     95 #ifdef _MSC_VER
     96 #define STBI_HAS_LROTL
     97 #endif
     98 
     99 #ifdef STBI_HAS_LROTL
    100    #define stbi_lrot(x,y)  _lrotl(x,y)
    101 #else
    102    #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
    103 #endif
    104 
    105 #if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
    106 // ok
    107 #elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
    108 // ok
    109 #else
    110 #error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
    111 #endif
    112 
    113 #ifndef STBI_MALLOC
    114 #define STBI_MALLOC(sz)           malloc(sz)
    115 #define STBI_REALLOC(p,newsz)     realloc(p,newsz)
    116 #define STBI_FREE(p)              free(p)
    117 #endif
    118 
    119 #ifndef STBI_REALLOC_SIZED
    120 #define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
    121 #endif
    122 
    123 // x86/x64 detection
    124 #if defined(__x86_64__) || defined(_M_X64)
    125 #define STBI__X64_TARGET
    126 #elif defined(__i386) || defined(_M_IX86)
    127 #define STBI__X86_TARGET
    128 #endif
    129 
    130 #if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
    131 // gcc doesn't support sse2 intrinsics unless you compile with -msse2,
    132 // which in turn means it gets to use SSE2 everywhere. This is unfortunate,
    133 // but previous attempts to provide the SSE2 functions with runtime
    134 // detection caused numerous issues. The way architecture extensions are
    135 // exposed in GCC/Clang is, sadly, not really suited for one-file libs.
    136 // New behavior: if compiled with -msse2, we use SSE2 without any
    137 // detection; if not, we don't use it at all.
    138 #define STBI_NO_SIMD
    139 #endif
    140 
    141 #if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
    142 // Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
    143 //
    144 // 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
    145 // Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
    146 // As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
    147 // simultaneously enabling "-mstackrealign".
    148 //
    149 // See https://github.com/nothings/stb/issues/81 for more information.
    150 //
    151 // So default to no SSE2 on 32-bit MinGW. If you've read this far and added
    152 // -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
    153 #define STBI_NO_SIMD
    154 #endif
    155 
    156 #if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
    157 #define STBI_SSE2
    158 #include <emmintrin.h>
    159 
    160 #ifdef _MSC_VER
    161 
    162 #if _MSC_VER >= 1400  // not VC6
    163 #include <intrin.h> // __cpuid
    164 static int stbi__cpuid3(void)
    165 {
    166    int info[4];
    167    __cpuid(info,1);
    168    return info[3];
    169 }
    170 #else
    171 static int stbi__cpuid3(void)
    172 {
    173    int res;
    174    __asm {
    175       mov  eax,1
    176       cpuid
    177       mov  res,edx
    178    }
    179    return res;
    180 }
    181 #endif
    182 
    183 #define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
    184 
    185 static int stbi__sse2_available(void)
    186 {
    187    int info3 = stbi__cpuid3();
    188    return ((info3 >> 26) & 1) != 0;
    189 }
    190 #else // assume GCC-style if not VC++
    191 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
    192 
    193 static int stbi__sse2_available(void)
    194 {
    195    // If we're even attempting to compile this on GCC/Clang, that means
    196    // -msse2 is on, which means the compiler is allowed to use SSE2
    197    // instructions at will, and so are we.
    198    return 1;
    199 }
    200 #endif
    201 #endif
    202 
    203 // ARM NEON
    204 #if defined(STBI_NO_SIMD) && defined(STBI_NEON)
    205 #undef STBI_NEON
    206 #endif
    207 
    208 #ifdef STBI_NEON
    209 #include <arm_neon.h>
    210 // assume GCC or Clang on ARM targets
    211 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
    212 #endif
    213 
    214 #ifndef STBI_SIMD_ALIGN
    215 #define STBI_SIMD_ALIGN(type, name) type name
    216 #endif
    217 
    218 ///////////////////////////////////////////////
    219 //
    220 //  stbi__context struct and start_xxx functions
    221 
    222 // stbi__context structure is our basic context used by all images, so it
    223 // contains all the IO context, plus some basic image information
    224 typedef struct
    225 {
    226    stbi__uint32 img_x, img_y;
    227    int img_n, img_out_n;
    228 
    229    stbi_io_callbacks io;
    230    void *io_user_data;
    231 
    232    int read_from_callbacks;
    233    int buflen;
    234    stbi_uc buffer_start[128];
    235 
    236    stbi_uc *img_buffer, *img_buffer_end;
    237    stbi_uc *img_buffer_original, *img_buffer_original_end;
    238 } stbi__context;
    239 
    240 
    241 static void stbi__refill_buffer(stbi__context *s);
    242 
    243 // initialize a memory-decode context
    244 static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
    245 {
    246    s->io.read = NULL;
    247    s->read_from_callbacks = 0;
    248    s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer;
    249    s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len;
    250 }
    251 
    252 // initialize a callback-based context
    253 static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
    254 {
    255    s->io = *c;
    256    s->io_user_data = user;
    257    s->buflen = sizeof(s->buffer_start);
    258    s->read_from_callbacks = 1;
    259    s->img_buffer_original = s->buffer_start;
    260    stbi__refill_buffer(s);
    261    s->img_buffer_original_end = s->img_buffer_end;
    262 }
    263 
    264 #ifndef STBI_NO_STDIO
    265 
    266 static int stbi__stdio_read(void *user, char *data, int size)
    267 {
    268    return (int) fread(data,1,size,(FILE*) user);
    269 }
    270 
    271 static void stbi__stdio_skip(void *user, int n)
    272 {
    273    fseek((FILE*) user, n, SEEK_CUR);
    274 }
    275 
    276 static int stbi__stdio_eof(void *user)
    277 {
    278    return feof((FILE*) user);
    279 }
    280 
    281 static stbi_io_callbacks stbi__stdio_callbacks =
    282 {
    283    stbi__stdio_read,
    284    stbi__stdio_skip,
    285    stbi__stdio_eof,
    286 };
    287 
    288 static void stbi__start_file(stbi__context *s, FILE *f)
    289 {
    290    stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f);
    291 }
    292 
    293 //static void stop_file(stbi__context *s) { }
    294 
    295 #endif // !STBI_NO_STDIO
    296 
    297 static void stbi__rewind(stbi__context *s)
    298 {
    299    // conceptually rewind SHOULD rewind to the beginning of the stream,
    300    // but we just rewind to the beginning of the initial buffer, because
    301    // we only use it after doing 'test', which only ever looks at at most 92 bytes
    302    s->img_buffer = s->img_buffer_original;
    303    s->img_buffer_end = s->img_buffer_original_end;
    304 }
    305 
    306 enum
    307 {
    308    STBI_ORDER_RGB,
    309    STBI_ORDER_BGR
    310 };
    311 
    312 typedef struct
    313 {
    314    int bits_per_channel;
    315    int num_channels;
    316    int channel_order;
    317 } stbi__result_info;
    318 
    319 #ifndef STBI_NO_JPEG
    320 static int      stbi__jpeg_test(stbi__context *s);
    321 static void    *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
    322 static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
    323 #endif
    324 
    325 #ifndef STBI_NO_PNG
    326 static int      stbi__png_test(stbi__context *s);
    327 static void    *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
    328 static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
    329 static int      stbi__png_is16(stbi__context *s);
    330 #endif
    331 
    332 #ifndef STBI_NO_BMP
    333 static int      stbi__bmp_test(stbi__context *s);
    334 static void    *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
    335 static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
    336 #endif
    337 
    338 #ifndef STBI_NO_TGA
    339 static int      stbi__tga_test(stbi__context *s);
    340 static void    *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
    341 static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
    342 #endif
    343 
    344 #ifndef STBI_NO_PSD
    345 static int      stbi__psd_test(stbi__context *s);
    346 static void    *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc);
    347 static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
    348 static int      stbi__psd_is16(stbi__context *s);
    349 #endif
    350 
    351 #ifndef STBI_NO_HDR
    352 static int      stbi__hdr_test(stbi__context *s);
    353 static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
    354 static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
    355 #endif
    356 
    357 #ifndef STBI_NO_PIC
    358 static int      stbi__pic_test(stbi__context *s);
    359 static void    *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
    360 static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
    361 #endif
    362 
    363 #ifndef STBI_NO_GIF
    364 static int      stbi__gif_test(stbi__context *s);
    365 static void    *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
    366 static void    *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
    367 static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
    368 #endif
    369 
    370 #ifndef STBI_NO_PNM
    371 static int      stbi__pnm_test(stbi__context *s);
    372 static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
    373 static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
    374 #endif
    375 
    376 // this is not threadsafe
    377 static const char *stbi__g_failure_reason;
    378 
    379 STBIDEF const char *stbi_failure_reason(void)
    380 {
    381    return stbi__g_failure_reason;
    382 }
    383 
    384 static int stbi__err(const char *str)
    385 {
    386    stbi__g_failure_reason = str;
    387    return 0;
    388 }
    389 
    390 static void *stbi__malloc(size_t size)
    391 {
    392     return STBI_MALLOC(size);
    393 }
    394 
    395 // stb_image uses ints pervasively, including for offset calculations.
    396 // therefore the largest decoded image size we can support with the
    397 // current code, even on 64-bit targets, is INT_MAX. this is not a
    398 // significant limitation for the intended use case.
    399 //
    400 // we do, however, need to make sure our size calculations don't
    401 // overflow. hence a few helper functions for size calculations that
    402 // multiply integers together, making sure that they're non-negative
    403 // and no overflow occurs.
    404 
    405 // return 1 if the sum is valid, 0 on overflow.
    406 // negative terms are considered invalid.
    407 static int stbi__addsizes_valid(int a, int b)
    408 {
    409    if (b < 0) return 0;
    410    // now 0 <= b <= INT_MAX, hence also
    411    // 0 <= INT_MAX - b <= INTMAX.
    412    // And "a + b <= INT_MAX" (which might overflow) is the
    413    // same as a <= INT_MAX - b (no overflow)
    414    return a <= INT_MAX - b;
    415 }
    416 
    417 // returns 1 if the product is valid, 0 on overflow.
    418 // negative factors are considered invalid.
    419 static int stbi__mul2sizes_valid(int a, int b)
    420 {
    421    if (a < 0 || b < 0) return 0;
    422    if (b == 0) return 1; // mul-by-0 is always safe
    423    // portable way to check for no overflows in a*b
    424    return a <= INT_MAX/b;
    425 }
    426 
    427 // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
    428 static int stbi__mad2sizes_valid(int a, int b, int add)
    429 {
    430    return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add);
    431 }
    432 
    433 // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
    434 static int stbi__mad3sizes_valid(int a, int b, int c, int add)
    435 {
    436    return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
    437       stbi__addsizes_valid(a*b*c, add);
    438 }
    439 
    440 // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
    441 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
    442 static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
    443 {
    444    return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
    445       stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add);
    446 }
    447 #endif
    448 
    449 // mallocs with size overflow checking
    450 static void *stbi__malloc_mad2(int a, int b, int add)
    451 {
    452    if (!stbi__mad2sizes_valid(a, b, add)) return NULL;
    453    return stbi__malloc(a*b + add);
    454 }
    455 
    456 static void *stbi__malloc_mad3(int a, int b, int c, int add)
    457 {
    458    if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL;
    459    return stbi__malloc(a*b*c + add);
    460 }
    461 
    462 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
    463 static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
    464 {
    465    if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;
    466    return stbi__malloc(a*b*c*d + add);
    467 }
    468 #endif
    469 
    470 // stbi__err - error
    471 // stbi__errpf - error returning pointer to float
    472 // stbi__errpuc - error returning pointer to unsigned char
    473 
    474 #ifdef STBI_NO_FAILURE_STRINGS
    475    #define stbi__err(x,y)  0
    476 #elif defined(STBI_FAILURE_USERMSG)
    477    #define stbi__err(x,y)  stbi__err(y)
    478 #else
    479    #define stbi__err(x,y)  stbi__err(x)
    480 #endif
    481 
    482 #define stbi__errpf(x,y)   ((float *)(size_t) (stbi__err(x,y)?NULL:NULL))
    483 #define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL))
    484 
    485 STBIDEF void stbi_image_free(void *retval_from_stbi_load)
    486 {
    487    STBI_FREE(retval_from_stbi_load);
    488 }
    489 
    490 #ifndef STBI_NO_LINEAR
    491 static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
    492 #endif
    493 
    494 #ifndef STBI_NO_HDR
    495 static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp);
    496 #endif
    497 
    498 static int stbi__vertically_flip_on_load = 0;
    499 
    500 STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
    501 {
    502     stbi__vertically_flip_on_load = flag_true_if_should_flip;
    503 }
    504 
    505 static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
    506 {
    507    memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
    508    ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
    509    ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
    510    ri->num_channels = 0;
    511 
    512    #ifndef STBI_NO_JPEG
    513    if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
    514    #endif
    515    #ifndef STBI_NO_PNG
    516    if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
    517    #endif
    518    #ifndef STBI_NO_BMP
    519    if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp, ri);
    520    #endif
    521    #ifndef STBI_NO_GIF
    522    if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp, ri);
    523    #endif
    524    #ifndef STBI_NO_PSD
    525    if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc);
    526    #endif
    527    #ifndef STBI_NO_PIC
    528    if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
    529    #endif
    530    #ifndef STBI_NO_PNM
    531    if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
    532    #endif
    533 
    534    #ifndef STBI_NO_HDR
    535    if (stbi__hdr_test(s)) {
    536       float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri);
    537       return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
    538    }
    539    #endif
    540 
    541    #ifndef STBI_NO_TGA
    542    // test tga last because it's a crappy test!
    543    if (stbi__tga_test(s))
    544       return stbi__tga_load(s,x,y,comp,req_comp, ri);
    545    #endif
    546 
    547    return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
    548 }
    549 
    550 static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels)
    551 {
    552    int i;
    553    int img_len = w * h * channels;
    554    stbi_uc *reduced;
    555 
    556    reduced = (stbi_uc *) stbi__malloc(img_len);
    557    if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory");
    558 
    559    for (i = 0; i < img_len; ++i)
    560       reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
    561 
    562    STBI_FREE(orig);
    563    return reduced;
    564 }
    565 
    566 static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels)
    567 {
    568    int i;
    569    int img_len = w * h * channels;
    570    stbi__uint16 *enlarged;
    571 
    572    enlarged = (stbi__uint16 *) stbi__malloc(img_len*2);
    573    if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
    574 
    575    for (i = 0; i < img_len; ++i)
    576       enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
    577 
    578    STBI_FREE(orig);
    579    return enlarged;
    580 }
    581 
    582 static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel)
    583 {
    584    int row;
    585    size_t bytes_per_row = (size_t)w * bytes_per_pixel;
    586    stbi_uc temp[2048];
    587    stbi_uc *bytes = (stbi_uc *)image;
    588 
    589    for (row = 0; row < (h>>1); row++) {
    590       stbi_uc *row0 = bytes + row*bytes_per_row;
    591       stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row;
    592       // swap row0 with row1
    593       size_t bytes_left = bytes_per_row;
    594       while (bytes_left) {
    595          size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
    596          memcpy(temp, row0, bytes_copy);
    597          memcpy(row0, row1, bytes_copy);
    598          memcpy(row1, temp, bytes_copy);
    599          row0 += bytes_copy;
    600          row1 += bytes_copy;
    601          bytes_left -= bytes_copy;
    602       }
    603    }
    604 }
    605 
    606 static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel)
    607 {
    608    int slice;
    609    int slice_size = w * h * bytes_per_pixel;
    610 
    611    stbi_uc *bytes = (stbi_uc *)image;
    612    for (slice = 0; slice < z; ++slice) {
    613       stbi__vertical_flip(bytes, w, h, bytes_per_pixel); 
    614       bytes += slice_size; 
    615    }
    616 }
    617 
    618 static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
    619 {
    620    stbi__result_info ri;
    621    void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
    622 
    623    if (result == NULL)
    624       return NULL;
    625 
    626    if (ri.bits_per_channel != 8) {
    627       STBI_ASSERT(ri.bits_per_channel == 16);
    628       result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
    629       ri.bits_per_channel = 8;
    630    }
    631 
    632    // @TODO: move stbi__convert_format to here
    633 
    634    if (stbi__vertically_flip_on_load) {
    635       int channels = req_comp ? req_comp : *comp;
    636       stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
    637    }
    638 
    639    return (unsigned char *) result;
    640 }
    641 
    642 static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
    643 {
    644    stbi__result_info ri;
    645    void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
    646 
    647    if (result == NULL)
    648       return NULL;
    649 
    650    if (ri.bits_per_channel != 16) {
    651       STBI_ASSERT(ri.bits_per_channel == 8);
    652       result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
    653       ri.bits_per_channel = 16;
    654    }
    655 
    656    // @TODO: move stbi__convert_format16 to here
    657    // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
    658 
    659    if (stbi__vertically_flip_on_load) {
    660       int channels = req_comp ? req_comp : *comp;
    661       stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
    662    }
    663 
    664    return (stbi__uint16 *) result;
    665 }
    666 
    667 #if !defined(STBI_NO_HDR) || !defined(STBI_NO_LINEAR)
    668 static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
    669 {
    670    if (stbi__vertically_flip_on_load && result != NULL) {
    671       int channels = req_comp ? req_comp : *comp;
    672       stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
    673    }
    674 }
    675 #endif
    676 
    677 #ifndef STBI_NO_STDIO
    678 
    679 static FILE *stbi__fopen(char const *filename, char const *mode)
    680 {
    681    FILE *f;
    682 #if defined(_MSC_VER) && _MSC_VER >= 1400
    683    if (0 != fopen_s(&f, filename, mode))
    684       f=0;
    685 #else
    686    f = fopen(filename, mode);
    687 #endif
    688    return f;
    689 }
    690 
    691 
    692 STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
    693 {
    694    FILE *f = stbi__fopen(filename, "rb");
    695    unsigned char *result;
    696    if (!f) return stbi__errpuc("can't fopen", "Unable to open file");
    697    result = stbi_load_from_file(f,x,y,comp,req_comp);
    698    fclose(f);
    699    return result;
    700 }
    701 
    702 STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
    703 {
    704    unsigned char *result;
    705    stbi__context s;
    706    stbi__start_file(&s,f);
    707    result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
    708    if (result) {
    709       // need to 'unget' all the characters in the IO buffer
    710       fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
    711    }
    712    return result;
    713 }
    714 
    715 STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp)
    716 {
    717    stbi__uint16 *result;
    718    stbi__context s;
    719    stbi__start_file(&s,f);
    720    result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp);
    721    if (result) {
    722       // need to 'unget' all the characters in the IO buffer
    723       fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
    724    }
    725    return result;
    726 }
    727 
    728 STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp)
    729 {
    730    FILE *f = stbi__fopen(filename, "rb");
    731    stbi__uint16 *result;
    732    if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file");
    733    result = stbi_load_from_file_16(f,x,y,comp,req_comp);
    734    fclose(f);
    735    return result;
    736 }
    737 
    738 
    739 #endif //!STBI_NO_STDIO
    740 
    741 STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels)
    742 {
    743    stbi__context s;
    744    stbi__start_mem(&s,buffer,len);
    745    return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
    746 }
    747 
    748 STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels)
    749 {
    750    stbi__context s;
    751    stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
    752    return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
    753 }
    754 
    755 STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
    756 {
    757    stbi__context s;
    758    stbi__start_mem(&s,buffer,len);
    759    return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
    760 }
    761 
    762 STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
    763 {
    764    stbi__context s;
    765    stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
    766    return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
    767 }
    768 
    769 #ifndef STBI_NO_GIF
    770 STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
    771 {
    772    unsigned char *result;
    773    stbi__context s; 
    774    stbi__start_mem(&s,buffer,len); 
    775    
    776    result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
    777    if (stbi__vertically_flip_on_load) {
    778       stbi__vertical_flip_slices( result, *x, *y, *z, *comp ); 
    779    }
    780 
    781    return result; 
    782 }
    783 #endif
    784 
    785 #ifndef STBI_NO_LINEAR
    786 static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
    787 {
    788    unsigned char *data;
    789    #ifndef STBI_NO_HDR
    790    if (stbi__hdr_test(s)) {
    791       stbi__result_info ri;
    792       float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri);
    793       if (hdr_data)
    794          stbi__float_postprocess(hdr_data,x,y,comp,req_comp);
    795       return hdr_data;
    796    }
    797    #endif
    798    data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
    799    if (data)
    800       return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
    801    return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
    802 }
    803 
    804 STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
    805 {
    806    stbi__context s;
    807    stbi__start_mem(&s,buffer,len);
    808    return stbi__loadf_main(&s,x,y,comp,req_comp);
    809 }
    810 
    811 STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
    812 {
    813    stbi__context s;
    814    stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
    815    return stbi__loadf_main(&s,x,y,comp,req_comp);
    816 }
    817 
    818 #ifndef STBI_NO_STDIO
    819 STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
    820 {
    821    float *result;
    822    FILE *f = stbi__fopen(filename, "rb");
    823    if (!f) return stbi__errpf("can't fopen", "Unable to open file");
    824    result = stbi_loadf_from_file(f,x,y,comp,req_comp);
    825    fclose(f);
    826    return result;
    827 }
    828 
    829 STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
    830 {
    831    stbi__context s;
    832    stbi__start_file(&s,f);
    833    return stbi__loadf_main(&s,x,y,comp,req_comp);
    834 }
    835 #endif // !STBI_NO_STDIO
    836 
    837 #endif // !STBI_NO_LINEAR
    838 
// The stbi_is_hdr* queries below are always defined, whether or not HDR
// support is compiled in, for API simplicity; if STBI_NO_HDR is defined,
// they always report false!
    842 
    843 STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len)
    844 {
    845    #ifndef STBI_NO_HDR
    846    stbi__context s;
    847    stbi__start_mem(&s,buffer,len);
    848    return stbi__hdr_test(&s);
    849    #else
    850    STBI_NOTUSED(buffer);
    851    STBI_NOTUSED(len);
    852    return 0;
    853    #endif
    854 }
    855 
    856 #ifndef STBI_NO_STDIO
    857 STBIDEF int      stbi_is_hdr          (char const *filename)
    858 {
    859    FILE *f = stbi__fopen(filename, "rb");
    860    int result=0;
    861    if (f) {
    862       result = stbi_is_hdr_from_file(f);
    863       fclose(f);
    864    }
    865    return result;
    866 }
    867 
    868 STBIDEF int stbi_is_hdr_from_file(FILE *f)
    869 {
    870    #ifndef STBI_NO_HDR
    871    long pos = ftell(f);
    872    int res;
    873    stbi__context s;
    874    stbi__start_file(&s,f);
    875    res = stbi__hdr_test(&s);
    876    fseek(f, pos, SEEK_SET);
    877    return res;
    878    #else
    879    STBI_NOTUSED(f);
    880    return 0;
    881    #endif
    882 }
    883 #endif // !STBI_NO_STDIO
    884 
    885 STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user)
    886 {
    887    #ifndef STBI_NO_HDR
    888    stbi__context s;
    889    stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
    890    return stbi__hdr_test(&s);
    891    #else
    892    STBI_NOTUSED(clbk);
    893    STBI_NOTUSED(user);
    894    return 0;
    895    #endif
    896 }
    897 
    898 #ifndef STBI_NO_LINEAR
    899 static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f;
    900 
    901 STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
    902 STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
    903 #endif
    904 
    905 static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
    906 
    907 STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
    908 STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
    909 
    910 
    911 //////////////////////////////////////////////////////////////////////////////
    912 //
    913 // Common code used by all image loaders
    914 //
    915 
    916 enum
    917 {
    918    STBI__SCAN_load=0,
    919    STBI__SCAN_type,
    920    STBI__SCAN_header
    921 };
    922 
    923 static void stbi__refill_buffer(stbi__context *s)
    924 {
    925    int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen);
    926    if (n == 0) {
    927       // at end of file, treat same as if from memory, but need to handle case
    928       // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
    929       s->read_from_callbacks = 0;
    930       s->img_buffer = s->buffer_start;
    931       s->img_buffer_end = s->buffer_start+1;
    932       *s->img_buffer = 0;
    933    } else {
    934       s->img_buffer = s->buffer_start;
    935       s->img_buffer_end = s->buffer_start + n;
    936    }
    937 }
    938 
    939 stbi_inline static stbi_uc stbi__get8(stbi__context *s)
    940 {
    941    if (s->img_buffer < s->img_buffer_end)
    942       return *s->img_buffer++;
    943    if (s->read_from_callbacks) {
    944       stbi__refill_buffer(s);
    945       return *s->img_buffer++;
    946    }
    947    return 0;
    948 }
    949 
    950 stbi_inline static int stbi__at_eof(stbi__context *s)
    951 {
    952    if (s->io.read) {
    953       if (!(s->io.eof)(s->io_user_data)) return 0;
    954       // if feof() is true, check if buffer = end
    955       // special case: we've only got the special 0 character at the end
    956       if (s->read_from_callbacks == 0) return 1;
    957    }
    958 
    959    return s->img_buffer >= s->img_buffer_end;
    960 }
    961 
    962 static void stbi__skip(stbi__context *s, int n)
    963 {
    964    if (n < 0) {
    965       s->img_buffer = s->img_buffer_end;
    966       return;
    967    }
    968    if (s->io.read) {
    969       int blen = (int) (s->img_buffer_end - s->img_buffer);
    970       if (blen < n) {
    971          s->img_buffer = s->img_buffer_end;
    972          (s->io.skip)(s->io_user_data, n - blen);
    973          return;
    974       }
    975    }
    976    s->img_buffer += n;
    977 }
    978 
    979 static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
    980 {
    981    if (s->io.read) {
    982       int blen = (int) (s->img_buffer_end - s->img_buffer);
    983       if (blen < n) {
    984          int res, count;
    985 
    986          memcpy(buffer, s->img_buffer, blen);
    987 
    988          count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen);
    989          res = (count == (n-blen));
    990          s->img_buffer = s->img_buffer_end;
    991          return res;
    992       }
    993    }
    994 
    995    if (s->img_buffer+n <= s->img_buffer_end) {
    996       memcpy(buffer, s->img_buffer, n);
    997       s->img_buffer += n;
    998       return 1;
    999    } else
   1000       return 0;
   1001 }
   1002 
   1003 static int stbi__get16be(stbi__context *s)
   1004 {
   1005    int z = stbi__get8(s);
   1006    return (z << 8) + stbi__get8(s);
   1007 }
   1008 
   1009 static stbi__uint32 stbi__get32be(stbi__context *s)
   1010 {
   1011    stbi__uint32 z = stbi__get16be(s);
   1012    return (z << 16) + stbi__get16be(s);
   1013 }
   1014 
   1015 #if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
   1016 // nothing
   1017 #else
   1018 static int stbi__get16le(stbi__context *s)
   1019 {
   1020    int z = stbi__get8(s);
   1021    return z + (stbi__get8(s) << 8);
   1022 }
   1023 #endif
   1024 
   1025 #ifndef STBI_NO_BMP
   1026 static stbi__uint32 stbi__get32le(stbi__context *s)
   1027 {
   1028    stbi__uint32 z = stbi__get16le(s);
   1029    return z + (stbi__get16le(s) << 16);
   1030 }
   1031 #endif
   1032 
   1033 #define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
   1034 
   1035 
   1036 //////////////////////////////////////////////////////////////////////////////
   1037 //
   1038 //  generic converter from built-in img_n to req_comp
   1039 //    individual types do this automatically as much as possible (e.g. jpeg
   1040 //    does all cases internally since it needs to colorspace convert anyway,
   1041 //    and it never has alpha, so very few cases ). png can automatically
   1042 //    interleave an alpha=255 channel, but falls back to this for other cases
   1043 //
   1044 //  assume data buffer is malloced, so malloc a new one and free that one
   1045 //  only failure mode is malloc failing
   1046 
   1047 static stbi_uc stbi__compute_y(int r, int g, int b)
   1048 {
   1049    return (stbi_uc) (((r*77) + (g*150) +  (29*b)) >> 8);
   1050 }
   1051 
   1052 static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y)
   1053 {
   1054    int i,j;
   1055    unsigned char *good;
   1056 
   1057    if (req_comp == img_n) return data;
   1058    STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
   1059 
   1060    good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0);
   1061    if (good == NULL) {
   1062       STBI_FREE(data);
   1063       return stbi__errpuc("outofmem", "Out of memory");
   1064    }
   1065 
   1066    for (j=0; j < (int) y; ++j) {
   1067       unsigned char *src  = data + j * x * img_n   ;
   1068       unsigned char *dest = good + j * x * req_comp;
   1069 
   1070       #define STBI__COMBO(a,b)  ((a)*8+(b))
   1071       #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
   1072       // convert source image with img_n components to one with req_comp components;
   1073       // avoid switch per pixel, so use switch per scanline and massive macros
   1074       switch (STBI__COMBO(img_n, req_comp)) {
   1075          STBI__CASE(1,2) { dest[0]=src[0], dest[1]=255;                                     } break;
   1076          STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
   1077          STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=255;                     } break;
   1078          STBI__CASE(2,1) { dest[0]=src[0];                                                  } break;
   1079          STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
   1080          STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1];                  } break;
   1081          STBI__CASE(3,4) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=255;        } break;
   1082          STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
   1083          STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = 255;    } break;
   1084          STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
   1085          STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = src[3]; } break;
   1086          STBI__CASE(4,3) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2];                    } break;
   1087          default: STBI_ASSERT(0);
   1088       }
   1089       #undef STBI__CASE
   1090    }
   1091 
   1092    STBI_FREE(data);
   1093    return good;
   1094 }
   1095 
   1096 static stbi__uint16 stbi__compute_y_16(int r, int g, int b)
   1097 {
   1098    return (stbi__uint16) (((r*77) + (g*150) +  (29*b)) >> 8);
   1099 }
   1100 
   1101 static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y)
   1102 {
   1103    int i,j;
   1104    stbi__uint16 *good;
   1105 
   1106    if (req_comp == img_n) return data;
   1107    STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
   1108 
   1109    good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2);
   1110    if (good == NULL) {
   1111       STBI_FREE(data);
   1112       return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
   1113    }
   1114 
   1115    for (j=0; j < (int) y; ++j) {
   1116       stbi__uint16 *src  = data + j * x * img_n   ;
   1117       stbi__uint16 *dest = good + j * x * req_comp;
   1118 
   1119       #define STBI__COMBO(a,b)  ((a)*8+(b))
   1120       #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
   1121       // convert source image with img_n components to one with req_comp components;
   1122       // avoid switch per pixel, so use switch per scanline and massive macros
   1123       switch (STBI__COMBO(img_n, req_comp)) {
   1124          STBI__CASE(1,2) { dest[0]=src[0], dest[1]=0xffff;                                     } break;
   1125          STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
   1126          STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=0xffff;                     } break;
   1127          STBI__CASE(2,1) { dest[0]=src[0];                                                     } break;
   1128          STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
   1129          STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1];                     } break;
   1130          STBI__CASE(3,4) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=0xffff;        } break;
   1131          STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
   1132          STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]), dest[1] = 0xffff; } break;
   1133          STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
   1134          STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]), dest[1] = src[3]; } break;
   1135          STBI__CASE(4,3) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2];                       } break;
   1136          default: STBI_ASSERT(0);
   1137       }
   1138       #undef STBI__CASE
   1139    }
   1140 
   1141    STBI_FREE(data);
   1142    return good;
   1143 }
   1144 
   1145 #ifndef STBI_NO_LINEAR
   1146 static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
   1147 {
   1148    int i,k,n;
   1149    float *output;
   1150    if (!data) return NULL;
   1151    output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
   1152    if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); }
   1153    // compute number of non-alpha components
   1154    if (comp & 1) n = comp; else n = comp-1;
   1155    for (i=0; i < x*y; ++i) {
   1156       for (k=0; k < n; ++k) {
   1157          output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
   1158       }
   1159       if (k < comp) output[i*comp + k] = data[i*comp+k]/255.0f;
   1160    }
   1161    STBI_FREE(data);
   1162    return output;
   1163 }
   1164 #endif
   1165 
   1166 #ifndef STBI_NO_HDR
   1167 #define stbi__float2int(x)   ((int) (x))
   1168 static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
   1169 {
   1170    int i,k,n;
   1171    stbi_uc *output;
   1172    if (!data) return NULL;
   1173    output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0);
   1174    if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); }
   1175    // compute number of non-alpha components
   1176    if (comp & 1) n = comp; else n = comp-1;
   1177    for (i=0; i < x*y; ++i) {
   1178       for (k=0; k < n; ++k) {
   1179          float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
   1180          if (z < 0) z = 0;
   1181          if (z > 255) z = 255;
   1182          output[i*comp + k] = (stbi_uc) stbi__float2int(z);
   1183       }
   1184       if (k < comp) {
   1185          float z = data[i*comp+k] * 255 + 0.5f;
   1186          if (z < 0) z = 0;
   1187          if (z > 255) z = 255;
   1188          output[i*comp + k] = (stbi_uc) stbi__float2int(z);
   1189       }
   1190    }
   1191    STBI_FREE(data);
   1192    return output;
   1193 }
   1194 #endif
   1195 
   1196 //////////////////////////////////////////////////////////////////////////////
   1197 //
   1198 //  "baseline" JPEG/JFIF decoder
   1199 //
   1200 //    simple implementation
   1201 //      - doesn't support delayed output of y-dimension
   1202 //      - simple interface (only one output format: 8-bit interleaved RGB)
   1203 //      - doesn't try to recover corrupt jpegs
   1204 //      - doesn't allow partial loading, loading multiple at once
   1205 //      - still fast on x86 (copying globals into locals doesn't help x86)
   1206 //      - allocates lots of intermediate memory (full size of all components)
   1207 //        - non-interleaved case requires this anyway
   1208 //        - allows good upsampling (see next)
   1209 //    high-quality
   1210 //      - upsampled channels are bilinearly interpolated, even across blocks
   1211 //      - quality integer IDCT derived from IJG's 'slow'
   1212 //    performance
   1213 //      - fast huffman; reasonable integer IDCT
   1214 //      - some SIMD kernels for common paths on targets with SSE2/NEON
   1215 //      - uses a lot of intermediate memory, could cache poorly
   1216 
   1217 #ifndef STBI_NO_JPEG
   1218 
   1219 // huffman decoding acceleration
   1220 #define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
   1221 
   1222 typedef struct
   1223 {
   1224    stbi_uc  fast[1 << FAST_BITS];
   1225    // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
   1226    stbi__uint16 code[256];
   1227    stbi_uc  values[256];
   1228    stbi_uc  size[257];
   1229    unsigned int maxcode[18];
   1230    int    delta[17];   // old 'firstsymbol' - old 'firstcode'
   1231 } stbi__huffman;
   1232 
   1233 typedef struct
   1234 {
   1235    stbi__context *s;
   1236    stbi__huffman huff_dc[4];
   1237    stbi__huffman huff_ac[4];
   1238    stbi__uint16 dequant[4][64];
   1239    stbi__int16 fast_ac[4][1 << FAST_BITS];
   1240 
   1241 // sizes for components, interleaved MCUs
   1242    int img_h_max, img_v_max;
   1243    int img_mcu_x, img_mcu_y;
   1244    int img_mcu_w, img_mcu_h;
   1245 
   1246 // definition of jpeg image component
   1247    struct
   1248    {
   1249       int id;
   1250       int h,v;
   1251       int tq;
   1252       int hd,ha;
   1253       int dc_pred;
   1254 
   1255       int x,y,w2,h2;
   1256       stbi_uc *data;
   1257       void *raw_data, *raw_coeff;
   1258       stbi_uc *linebuf;
   1259       short   *coeff;   // progressive only
   1260       int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
   1261    } img_comp[4];
   1262 
   1263    stbi__uint32   code_buffer; // jpeg entropy-coded buffer
   1264    int            code_bits;   // number of valid bits
   1265    unsigned char  marker;      // marker seen while filling entropy buffer
   1266    int            nomore;      // flag if we saw a marker so must stop
   1267 
   1268    int            progressive;
   1269    int            spec_start;
   1270    int            spec_end;
   1271    int            succ_high;
   1272    int            succ_low;
   1273    int            eob_run;
   1274    int            jfif;
   1275    int            app14_color_transform; // Adobe APP14 tag
   1276    int            rgb;
   1277 
   1278    int scan_n, order[4];
   1279    int restart_interval, todo;
   1280 
   1281 // kernels
   1282    void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
   1283    void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step);
   1284    stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs);
   1285 } stbi__jpeg;
   1286 
   1287 static int stbi__build_huffman(stbi__huffman *h, int *count)
   1288 {
   1289    int i,j,k=0;
   1290    unsigned int code;
   1291    // build size list for each symbol (from JPEG spec)
   1292    for (i=0; i < 16; ++i)
   1293       for (j=0; j < count[i]; ++j)
   1294          h->size[k++] = (stbi_uc) (i+1);
   1295    h->size[k] = 0;
   1296 
   1297    // compute actual symbols (from jpeg spec)
   1298    code = 0;
   1299    k = 0;
   1300    for(j=1; j <= 16; ++j) {
   1301       // compute delta to add to code to compute symbol id
   1302       h->delta[j] = k - code;
   1303       if (h->size[k] == j) {
   1304          while (h->size[k] == j)
   1305             h->code[k++] = (stbi__uint16) (code++);
   1306          if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG");
   1307       }
   1308       // compute largest code + 1 for this size, preshifted as needed later
   1309       h->maxcode[j] = code << (16-j);
   1310       code <<= 1;
   1311    }
   1312    h->maxcode[j] = 0xffffffff;
   1313 
   1314    // build non-spec acceleration table; 255 is flag for not-accelerated
   1315    memset(h->fast, 255, 1 << FAST_BITS);
   1316    for (i=0; i < k; ++i) {
   1317       int s = h->size[i];
   1318       if (s <= FAST_BITS) {
   1319          int c = h->code[i] << (FAST_BITS-s);
   1320          int m = 1 << (FAST_BITS-s);
   1321          for (j=0; j < m; ++j) {
   1322             h->fast[c+j] = (stbi_uc) i;
   1323          }
   1324       }
   1325    }
   1326    return 1;
   1327 }
   1328 
   1329 // build a table that decodes both magnitude and value of small ACs in
   1330 // one go.
   1331 static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
   1332 {
   1333    int i;
   1334    for (i=0; i < (1 << FAST_BITS); ++i) {
   1335       stbi_uc fast = h->fast[i];
   1336       fast_ac[i] = 0;
   1337       if (fast < 255) {
   1338          int rs = h->values[fast];
   1339          int run = (rs >> 4) & 15;
   1340          int magbits = rs & 15;
   1341          int len = h->size[fast];
   1342 
   1343          if (magbits && len + magbits <= FAST_BITS) {
   1344             // magnitude code followed by receive_extend code
   1345             int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
   1346             int m = 1 << (magbits - 1);
   1347             if (k < m) k += (~0U << magbits) + 1;
   1348             // if the result is small enough, we can fit it in fast_ac table
   1349             if (k >= -128 && k <= 127)
   1350                fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits));
   1351          }
   1352       }
   1353    }
   1354 }
   1355 
   1356 static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
   1357 {
   1358    do {
   1359       unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
   1360       if (b == 0xff) {
   1361          int c = stbi__get8(j->s);
   1362          while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes
   1363          if (c != 0) {
   1364             j->marker = (unsigned char) c;
   1365             j->nomore = 1;
   1366             return;
   1367          }
   1368       }
   1369       j->code_buffer |= b << (24 - j->code_bits);
   1370       j->code_bits += 8;
   1371    } while (j->code_bits <= 24);
   1372 }
   1373 
   1374 // (1 << n) - 1
   1375 static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
   1376 
   1377 // decode a jpeg huffman value from the bitstream
   1378 stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
   1379 {
   1380    unsigned int temp;
   1381    int c,k;
   1382 
   1383    if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
   1384 
   1385    // look at the top FAST_BITS and determine what symbol ID it is,
   1386    // if the code is <= FAST_BITS
   1387    c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
   1388    k = h->fast[c];
   1389    if (k < 255) {
   1390       int s = h->size[k];
   1391       if (s > j->code_bits)
   1392          return -1;
   1393       j->code_buffer <<= s;
   1394       j->code_bits -= s;
   1395       return h->values[k];
   1396    }
   1397 
   1398    // naive test is to shift the code_buffer down so k bits are
   1399    // valid, then test against maxcode. To speed this up, we've
   1400    // preshifted maxcode left so that it has (16-k) 0s at the
   1401    // end; in other words, regardless of the number of bits, it
   1402    // wants to be compared against something shifted to have 16;
   1403    // that way we don't need to shift inside the loop.
   1404    temp = j->code_buffer >> 16;
   1405    for (k=FAST_BITS+1 ; ; ++k)
   1406       if (temp < h->maxcode[k])
   1407          break;
   1408    if (k == 17) {
   1409       // error! code not found
   1410       j->code_bits -= 16;
   1411       return -1;
   1412    }
   1413 
   1414    if (k > j->code_bits)
   1415       return -1;
   1416 
   1417    // convert the huffman code to the symbol id
   1418    c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
   1419    STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
   1420 
   1421    // convert the id to a symbol
   1422    j->code_bits -= k;
   1423    j->code_buffer <<= k;
   1424    return h->values[c];
   1425 }
   1426 
   1427 // bias[n] = (-1<<n) + 1
   1428 static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
   1429 
   1430 // combined JPEG 'receive' and JPEG 'extend', since baseline
   1431 // always extends everything it receives.
   1432 stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
   1433 {
   1434    unsigned int k;
   1435    int sgn;
   1436    if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
   1437 
   1438    sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
   1439    k = stbi_lrot(j->code_buffer, n);
   1440    STBI_ASSERT(n >= 0 && n < (int) (sizeof(stbi__bmask)/sizeof(*stbi__bmask)));
   1441    j->code_buffer = k & ~stbi__bmask[n];
   1442    k &= stbi__bmask[n];
   1443    j->code_bits -= n;
   1444    return k + (stbi__jbias[n] & ~sgn);
   1445 }
   1446 
   1447 // get some unsigned bits
   1448 stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
   1449 {
   1450    unsigned int k;
   1451    if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
   1452    k = stbi_lrot(j->code_buffer, n);
   1453    j->code_buffer = k & ~stbi__bmask[n];
   1454    k &= stbi__bmask[n];
   1455    j->code_bits -= n;
   1456    return k;
   1457 }
   1458 
   1459 stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
   1460 {
   1461    unsigned int k;
   1462    if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
   1463    k = j->code_buffer;
   1464    j->code_buffer <<= 1;
   1465    --j->code_bits;
   1466    return k & 0x80000000;
   1467 }
   1468 
   1469 // given a value that's at position X in the zigzag stream,
   1470 // where does it appear in the 8x8 matrix coded as row-major?
   1471 static const stbi_uc stbi__jpeg_dezigzag[64+15] =
   1472 {
   1473     0,  1,  8, 16,  9,  2,  3, 10,
   1474    17, 24, 32, 25, 18, 11,  4,  5,
   1475    12, 19, 26, 33, 40, 48, 41, 34,
   1476    27, 20, 13,  6,  7, 14, 21, 28,
   1477    35, 42, 49, 56, 57, 50, 43, 36,
   1478    29, 22, 15, 23, 30, 37, 44, 51,
   1479    58, 59, 52, 45, 38, 31, 39, 46,
   1480    53, 60, 61, 54, 47, 55, 62, 63,
   1481    // let corrupt input sample past end
   1482    63, 63, 63, 63, 63, 63, 63, 63,
   1483    63, 63, 63, 63, 63, 63, 63
   1484 };
   1485 
   1486 // decode one 64-entry block--
   1487 static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
   1488 {
   1489    int diff,dc,k;
   1490    int t;
   1491 
   1492    if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
   1493    t = stbi__jpeg_huff_decode(j, hdc);
   1494    if (t < 0) return stbi__err("bad huffman code","Corrupt JPEG");
   1495 
   1496    // 0 all the ac values now so we can do it 32-bits at a time
   1497    memset(data,0,64*sizeof(data[0]));
   1498 
   1499    diff = t ? stbi__extend_receive(j, t) : 0;
   1500    dc = j->img_comp[b].dc_pred + diff;
   1501    j->img_comp[b].dc_pred = dc;
   1502    data[0] = (short) (dc * dequant[0]);
   1503 
   1504    // decode AC components, see JPEG spec
   1505    k = 1;
   1506    do {
   1507       unsigned int zig;
   1508       int c,r,s;
   1509       if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
   1510       c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
   1511       r = fac[c];
   1512       if (r) { // fast-AC path
   1513          k += (r >> 4) & 15; // run
   1514          s = r & 15; // combined length
   1515          j->code_buffer <<= s;
   1516          j->code_bits -= s;
   1517          // decode into unzigzag'd location
   1518          zig = stbi__jpeg_dezigzag[k++];
   1519          data[zig] = (short) ((r >> 8) * dequant[zig]);
   1520       } else {
   1521          int rs = stbi__jpeg_huff_decode(j, hac);
   1522          if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
   1523          s = rs & 15;
   1524          r = rs >> 4;
   1525          if (s == 0) {
   1526             if (rs != 0xf0) break; // end block
   1527             k += 16;
   1528          } else {
   1529             k += r;
   1530             // decode into unzigzag'd location
   1531             zig = stbi__jpeg_dezigzag[k++];
   1532             data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]);
   1533          }
   1534       }
   1535    } while (k < 64);
   1536    return 1;
   1537 }
   1538 
   1539 static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b)
   1540 {
   1541    int diff,dc;
   1542    int t;
   1543    if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
   1544 
   1545    if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
   1546 
   1547    if (j->succ_high == 0) {
   1548       // first scan for DC coefficient, must be first
   1549       memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
   1550       t = stbi__jpeg_huff_decode(j, hdc);
   1551       diff = t ? stbi__extend_receive(j, t) : 0;
   1552 
   1553       dc = j->img_comp[b].dc_pred + diff;
   1554       j->img_comp[b].dc_pred = dc;
   1555       data[0] = (short) (dc << j->succ_low);
   1556    } else {
   1557       // refinement scan for DC coefficient
   1558       if (stbi__jpeg_get_bit(j))
   1559          data[0] += (short) (1 << j->succ_low);
   1560    }
   1561    return 1;
   1562 }
   1563 
   1564 // @OPTIMIZE: store non-zigzagged during the decode passes,
   1565 // and only de-zigzag when dequantizing
   1566 static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac)
   1567 {
   1568    int k;
   1569    if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
   1570 
   1571    if (j->succ_high == 0) {
   1572       int shift = j->succ_low;
   1573 
   1574       if (j->eob_run) {
   1575          --j->eob_run;
   1576          return 1;
   1577       }
   1578 
   1579       k = j->spec_start;
   1580       do {
   1581          unsigned int zig;
   1582          int c,r,s;
   1583          if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
   1584          c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
   1585          r = fac[c];
   1586          if (r) { // fast-AC path
   1587             k += (r >> 4) & 15; // run
   1588             s = r & 15; // combined length
   1589             j->code_buffer <<= s;
   1590             j->code_bits -= s;
   1591             zig = stbi__jpeg_dezigzag[k++];
   1592             data[zig] = (short) ((r >> 8) << shift);
   1593          } else {
   1594             int rs = stbi__jpeg_huff_decode(j, hac);
   1595             if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
   1596             s = rs & 15;
   1597             r = rs >> 4;
   1598             if (s == 0) {
   1599                if (r < 15) {
   1600                   j->eob_run = (1 << r);
   1601                   if (r)
   1602                      j->eob_run += stbi__jpeg_get_bits(j, r);
   1603                   --j->eob_run;
   1604                   break;
   1605                }
   1606                k += 16;
   1607             } else {
   1608                k += r;
   1609                zig = stbi__jpeg_dezigzag[k++];
   1610                data[zig] = (short) (stbi__extend_receive(j,s) << shift);
   1611             }
   1612          }
   1613       } while (k <= j->spec_end);
   1614    } else {
   1615       // refinement scan for these AC coefficients
   1616 
   1617       short bit = (short) (1 << j->succ_low);
   1618 
   1619       if (j->eob_run) {
   1620          --j->eob_run;
   1621          for (k = j->spec_start; k <= j->spec_end; ++k) {
   1622             short *p = &data[stbi__jpeg_dezigzag[k]];
   1623             if (*p != 0)
   1624                if (stbi__jpeg_get_bit(j))
   1625                   if ((*p & bit)==0) {
   1626                      if (*p > 0)
   1627                         *p += bit;
   1628                      else
   1629                         *p -= bit;
   1630                   }
   1631          }
   1632       } else {
   1633          k = j->spec_start;
   1634          do {
   1635             int r,s;
   1636             int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
   1637             if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
   1638             s = rs & 15;
   1639             r = rs >> 4;
   1640             if (s == 0) {
   1641                if (r < 15) {
   1642                   j->eob_run = (1 << r) - 1;
   1643                   if (r)
   1644                      j->eob_run += stbi__jpeg_get_bits(j, r);
   1645                   r = 64; // force end of block
   1646                } else {
   1647                   // r=15 s=0 should write 16 0s, so we just do
   1648                   // a run of 15 0s and then write s (which is 0),
   1649                   // so we don't have to do anything special here
   1650                }
   1651             } else {
   1652                if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG");
   1653                // sign bit
   1654                if (stbi__jpeg_get_bit(j))
   1655                   s = bit;
   1656                else
   1657                   s = -bit;
   1658             }
   1659 
   1660             // advance by r
   1661             while (k <= j->spec_end) {
   1662                short *p = &data[stbi__jpeg_dezigzag[k++]];
   1663                if (*p != 0) {
   1664                   if (stbi__jpeg_get_bit(j))
   1665                      if ((*p & bit)==0) {
   1666                         if (*p > 0)
   1667                            *p += bit;
   1668                         else
   1669                            *p -= bit;
   1670                      }
   1671                } else {
   1672                   if (r == 0) {
   1673                      *p = (short) s;
   1674                      break;
   1675                   }
   1676                   --r;
   1677                }
   1678             }
   1679          } while (k <= j->spec_end);
   1680       }
   1681    }
   1682    return 1;
   1683 }
   1684 
   1685 // take a -128..127 value and stbi__clamp it and convert to 0..255
   1686 stbi_inline static stbi_uc stbi__clamp(int x)
   1687 {
   1688    // trick to use a single test to catch both cases
   1689    if ((unsigned int) x > 255) {
   1690       if (x < 0) return 0;
   1691       if (x > 255) return 255;
   1692    }
   1693    return (stbi_uc) x;
   1694 }
   1695 
// stbi__f2f: turn a floating-point constant into a fixed-point integer by
// scaling it by 4096 (1<<12) and adding 0.5 before the truncating cast to int
#define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
// stbi__fsh: scale an integer expression by 4096 so it is on the same
// scale as a product formed with an stbi__f2f constant
#define stbi__fsh(x)  ((x) * 4096)
   1698 
// derived from jidctint -- DCT_ISLOW
//
// one 1D IDCT pass over the eight inputs s0..s7, in fixed point.
// the macro declares its own int locals (t0..t3, p1..p5, x0..x3), so it has
// to appear where declarations are allowed and at most once per scope.
// the arguments are substituted without extra parentheses.
// on exit:
//    x0..x3  hold the even part (built from s0,s2,s4,s6)
//    t0..t3  hold the odd part  (built from s1,s3,s5,s7)
// both are scaled up by 4096 via stbi__f2f/stbi__fsh. the macro does not
// combine them; the caller forms the eight outputs as x0+-t3, x1+-t2,
// x2+-t1, x3+-t0 after adding whatever rounding bias it needs to x0..x3.
#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
   p2 = s2;                                    \
   p3 = s6;                                    \
   p1 = (p2+p3) * stbi__f2f(0.5411961f);       \
   t2 = p1 + p3*stbi__f2f(-1.847759065f);      \
   t3 = p1 + p2*stbi__f2f( 0.765366865f);      \
   p2 = s0;                                    \
   p3 = s4;                                    \
   t0 = stbi__fsh(p2+p3);                      \
   t1 = stbi__fsh(p2-p3);                      \
   x0 = t0+t3;                                 \
   x3 = t0-t3;                                 \
   x1 = t1+t2;                                 \
   x2 = t1-t2;                                 \
   t0 = s7;                                    \
   t1 = s5;                                    \
   t2 = s3;                                    \
   t3 = s1;                                    \
   p3 = t0+t2;                                 \
   p4 = t1+t3;                                 \
   p1 = t0+t3;                                 \
   p2 = t1+t2;                                 \
   p5 = (p3+p4)*stbi__f2f( 1.175875602f);      \
   t0 = t0*stbi__f2f( 0.298631336f);           \
   t1 = t1*stbi__f2f( 2.053119869f);           \
   t2 = t2*stbi__f2f( 3.072711026f);           \
   t3 = t3*stbi__f2f( 1.501321110f);           \
   p1 = p5 + p1*stbi__f2f(-0.899976223f);      \
   p2 = p5 + p2*stbi__f2f(-2.562915447f);      \
   p3 = p3*stbi__f2f(-1.961570560f);           \
   p4 = p4*stbi__f2f(-0.390180644f);           \
   t3 += p1+p4;                                \
   t2 += p2+p3;                                \
   t1 += p2+p4;                                \
   t0 += p1+p3;
   1736 
   1737 static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
   1738 {
   1739    int i,val[64],*v=val;
   1740    stbi_uc *o;
   1741    short *d = data;
   1742 
   1743    // columns
   1744    for (i=0; i < 8; ++i,++d, ++v) {
   1745       // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
   1746       if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
   1747            && d[40]==0 && d[48]==0 && d[56]==0) {
   1748          //    no shortcut                 0     seconds
   1749          //    (1|2|3|4|5|6|7)==0          0     seconds
   1750          //    all separate               -0.047 seconds
   1751          //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
   1752          int dcterm = d[0]*4;
   1753          v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
   1754       } else {
   1755          STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
   1756          // constants scaled things up by 1<<12; let's bring them back
   1757          // down, but keep 2 extra bits of precision
   1758          x0 += 512; x1 += 512; x2 += 512; x3 += 512;
   1759          v[ 0] = (x0+t3) >> 10;
   1760          v[56] = (x0-t3) >> 10;
   1761          v[ 8] = (x1+t2) >> 10;
   1762          v[48] = (x1-t2) >> 10;
   1763          v[16] = (x2+t1) >> 10;
   1764          v[40] = (x2-t1) >> 10;
   1765          v[24] = (x3+t0) >> 10;
   1766          v[32] = (x3-t0) >> 10;
   1767       }
   1768    }
   1769 
   1770    for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) {
   1771       // no fast case since the first 1D IDCT spread components out
   1772       STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
   1773       // constants scaled things up by 1<<12, plus we had 1<<2 from first
   1774       // loop, plus horizontal and vertical each scale by sqrt(8) so together
   1775       // we've got an extra 1<<3, so 1<<17 total we need to remove.
   1776       // so we want to round that, which means adding 0.5 * 1<<17,
   1777       // aka 65536. Also, we'll end up with -128 to 127 that we want
   1778       // to encode as 0..255 by adding 128, so we'll add that before the shift
   1779       x0 += 65536 + (128<<17);
   1780       x1 += 65536 + (128<<17);
   1781       x2 += 65536 + (128<<17);
   1782       x3 += 65536 + (128<<17);
   1783       // tried computing the shifts into temps, or'ing the temps to see
   1784       // if any were out of range, but that was slower
   1785       o[0] = stbi__clamp((x0+t3) >> 17);
   1786       o[7] = stbi__clamp((x0-t3) >> 17);
   1787       o[1] = stbi__clamp((x1+t2) >> 17);
   1788       o[6] = stbi__clamp((x1-t2) >> 17);
   1789       o[2] = stbi__clamp((x2+t1) >> 17);
   1790       o[5] = stbi__clamp((x2-t1) >> 17);
   1791       o[3] = stbi__clamp((x3+t0) >> 17);
   1792       o[4] = stbi__clamp((x3-t0) >> 17);
   1793    }
   1794 }
   1795 
   1796 #ifdef STBI_SSE2
// sse2 integer IDCT. not the fastest possible implementation but it
// produces bit-identical results to the generic C version so it's
// fully "transparent".
//
//   out        - receives the 8x8 block; 8 bytes are stored per row
//   out_stride - distance in bytes from one output row to the next
//   data       - the 64 input coefficients, read with _mm_load_si128
//                (an aligned load, so data must be 16-byte aligned)
//
// all helper macros below are local to this function and are #undef'd
// again before it ends.
static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
{
   // This is constructed to match our regular (generic) integer IDCT exactly.
   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
   __m128i tmp; // scratch register used by the interleave macros

   // dot product constant: even elems=x, odd elems=y
   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))

   // out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
   // out(1) = c1[even]*x + c1[odd]*y
   // declares out0##_l/_h and out1##_l/_h (low and high four lanes)
   #define dct_rot(out0,out1, x,y,c0,c1) \
      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)

   // out = in << 12  (in 16-bit, out 32-bit)
   // done by unpacking "in" into the upper 16 bits of each 32-bit lane and
   // then shifting right arithmetically by 4 (16-4 = 12)
   #define dct_widen(out, in) \
      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)

   // wide add
   #define dct_wadd(out, a, b) \
      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)

   // wide sub
   #define dct_wsub(out, a, b) \
      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)

   // butterfly a/b, add bias, then shift by "s" and pack
   // out0 = (a + bias + b) >> s, out1 = (a + bias - b) >> s, packed back
   // to 16 bits with signed saturation
   #define dct_bfly32o(out0, out1, a,b,bias,s) \
      { \
         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
         dct_wadd(sum, abiased, b); \
         dct_wsub(dif, abiased, b); \
         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
      }

   // 8-bit interleave step (for transposes)
   #define dct_interleave8(a, b) \
      tmp = a; \
      a = _mm_unpacklo_epi8(a, b); \
      b = _mm_unpackhi_epi8(tmp, b)

   // 16-bit interleave step (for transposes)
   #define dct_interleave16(a, b) \
      tmp = a; \
      a = _mm_unpacklo_epi16(a, b); \
      b = _mm_unpackhi_epi16(tmp, b)

   // one complete 1D IDCT pass over row0..row7, in place.
   // the even part yields x0..x3 and the odd part x4..x7; the four
   // butterflies then add "bias", shift right by "shift" and write the
   // results back into row0..row7
   #define dct_pass(bias,shift) \
      { \
         /* even part */ \
         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
         __m128i sum04 = _mm_add_epi16(row0, row4); \
         __m128i dif04 = _mm_sub_epi16(row0, row4); \
         dct_widen(t0e, sum04); \
         dct_widen(t1e, dif04); \
         dct_wadd(x0, t0e, t3e); \
         dct_wsub(x3, t0e, t3e); \
         dct_wadd(x1, t1e, t2e); \
         dct_wsub(x2, t1e, t2e); \
         /* odd part */ \
         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
         __m128i sum17 = _mm_add_epi16(row1, row7); \
         __m128i sum35 = _mm_add_epi16(row3, row5); \
         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
         dct_wadd(x4, y0o, y4o); \
         dct_wadd(x5, y1o, y5o); \
         dct_wadd(x6, y2o, y5o); \
         dct_wadd(x7, y3o, y4o); \
         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
      }

   // constant pairs for dct_rot. each pair folds a "shared product plus one
   // extra multiply" from STBI__IDCT_1D into a single multiply-add, e.g.
   // (x+y)*c + y*d is evaluated as x*c + y*(c+d)
   __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
   __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
   __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
   __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
   __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
   __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
   __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
   __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));

   // rounding biases in column/row passes, see stbi__idct_block for explanation.
   __m128i bias_0 = _mm_set1_epi32(512);
   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));

   // load
   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
   row7 = _mm_load_si128((const __m128i *) (data + 7*8));

   // column pass
   dct_pass(bias_0, 10);

   {
      // 16bit 8x8 transpose pass 1
      dct_interleave16(row0, row4);
      dct_interleave16(row1, row5);
      dct_interleave16(row2, row6);
      dct_interleave16(row3, row7);

      // transpose pass 2
      dct_interleave16(row0, row2);
      dct_interleave16(row1, row3);
      dct_interleave16(row4, row6);
      dct_interleave16(row5, row7);

      // transpose pass 3
      dct_interleave16(row0, row1);
      dct_interleave16(row2, row3);
      dct_interleave16(row4, row5);
      dct_interleave16(row6, row7);
   }

   // row pass
   dct_pass(bias_1, 17);

   {
      // pack
      // (16-bit -> 8-bit with unsigned saturation, which does the 0..255 clamp)
      __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
      __m128i p1 = _mm_packus_epi16(row2, row3);
      __m128i p2 = _mm_packus_epi16(row4, row5);
      __m128i p3 = _mm_packus_epi16(row6, row7);

      // 8bit 8x8 transpose pass 1
      dct_interleave8(p0, p2); // a0e0a1e1...
      dct_interleave8(p1, p3); // c0g0c1g1...

      // transpose pass 2
      dct_interleave8(p0, p1); // a0c0e0g0...
      dct_interleave8(p2, p3); // b0d0f0h0...

      // transpose pass 3
      dct_interleave8(p0, p2); // a0b0c0d0...
      dct_interleave8(p1, p3); // a4b4c4d4...

      // store
      // each register holds two output rows; the low 8 bytes are stored
      // directly, and shuffle 0x4e swaps the 64-bit halves so the other
      // row can be stored the same way
      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
   }

#undef dct_const
#undef dct_rot
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_interleave8
#undef dct_interleave16
#undef dct_pass
}
   1974 
   1975 #endif // STBI_SSE2
   1976 
   1977 #ifdef STBI_NEON
   1978 
// NEON integer IDCT. should produce bit-identical
// results to the generic C version.
//
//   out        - receives the 8x8 block; 8 bytes are stored per row
//   out_stride - distance in bytes from one output row to the next
//   data       - the 64 input coefficients
//
// all helper macros below are local to this function and are #undef'd
// again before it ends.
static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
{
   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;

   // the same fixed-point constants that STBI__IDCT_1D uses, one per vector
   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
   int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
   int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
   int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
   int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
   int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
   int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
   int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
   int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
   int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
   int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
   int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));

// out = inq * coeff  (inq 16-bit, out 32-bit, as low/high halves out_l/out_h)
#define dct_long_mul(out, inq, coeff) \
   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)

// out = acc + inq * coeff  (widening multiply-accumulate)
#define dct_long_mac(out, acc, inq, coeff) \
   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)

// out = inq << 12  (inq 16-bit, out 32-bit)
#define dct_widen(out, inq) \
   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)

// wide add
#define dct_wadd(out, a, b) \
   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)

// wide sub
#define dct_wsub(out, a, b) \
   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)

// butterfly a/b, then shift using "shiftop" by "s" and pack
#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
   { \
      dct_wadd(sum, a, b); \
      dct_wsub(dif, a, b); \
      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
   }

// one complete 1D IDCT pass over row0..row7, in place.
// the even part yields x0..x3 and the odd part x4..x7; the four
// butterflies then narrow the sums and differences with "shiftop"
// and write them back into row0..row7
#define dct_pass(shiftop, shift) \
   { \
      /* even part */ \
      int16x8_t sum26 = vaddq_s16(row2, row6); \
      dct_long_mul(p1e, sum26, rot0_0); \
      dct_long_mac(t2e, p1e, row6, rot0_1); \
      dct_long_mac(t3e, p1e, row2, rot0_2); \
      int16x8_t sum04 = vaddq_s16(row0, row4); \
      int16x8_t dif04 = vsubq_s16(row0, row4); \
      dct_widen(t0e, sum04); \
      dct_widen(t1e, dif04); \
      dct_wadd(x0, t0e, t3e); \
      dct_wsub(x3, t0e, t3e); \
      dct_wadd(x1, t1e, t2e); \
      dct_wsub(x2, t1e, t2e); \
      /* odd part */ \
      int16x8_t sum15 = vaddq_s16(row1, row5); \
      int16x8_t sum17 = vaddq_s16(row1, row7); \
      int16x8_t sum35 = vaddq_s16(row3, row5); \
      int16x8_t sum37 = vaddq_s16(row3, row7); \
      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
      dct_long_mul(p5o, sumodd, rot1_0); \
      dct_long_mac(p1o, p5o, sum17, rot1_1); \
      dct_long_mac(p2o, p5o, sum35, rot1_2); \
      dct_long_mul(p3o, sum37, rot2_0); \
      dct_long_mul(p4o, sum15, rot2_1); \
      dct_wadd(sump13o, p1o, p3o); \
      dct_wadd(sump24o, p2o, p4o); \
      dct_wadd(sump23o, p2o, p3o); \
      dct_wadd(sump14o, p1o, p4o); \
      dct_long_mac(x4, sump13o, row7, rot3_0); \
      dct_long_mac(x5, sump24o, row5, rot3_1); \
      dct_long_mac(x6, sump23o, row3, rot3_2); \
      dct_long_mac(x7, sump14o, row1, rot3_3); \
      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
   }

   // load
   row0 = vld1q_s16(data + 0*8);
   row1 = vld1q_s16(data + 1*8);
   row2 = vld1q_s16(data + 2*8);
   row3 = vld1q_s16(data + 3*8);
   row4 = vld1q_s16(data + 4*8);
   row5 = vld1q_s16(data + 5*8);
   row6 = vld1q_s16(data + 6*8);
   row7 = vld1q_s16(data + 7*8);

   // add DC bias
   // (adds 1024 to element 0 only; NOTE(review): presumably the counterpart
   // of the 128<<17 output bias in stbi__idct_block -- confirm)
   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));

   // column pass
   dct_pass(vrshrn_n_s32, 10);

   // 16bit 8x8 transpose
   {
// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
// whether compilers actually get this is another story, sadly.
#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }

      // pass 1
      dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
      dct_trn16(row2, row3);
      dct_trn16(row4, row5);
      dct_trn16(row6, row7);

      // pass 2
      dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
      dct_trn32(row1, row3);
      dct_trn32(row4, row6);
      dct_trn32(row5, row7);

      // pass 3
      dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
      dct_trn64(row1, row5);
      dct_trn64(row2, row6);
      dct_trn64(row3, row7);

#undef dct_trn16
#undef dct_trn32
#undef dct_trn64
   }

   // row pass
   // vrshrn_n_s32 only supports shifts up to 16, we need
   // 17. so do a non-rounding shift of 16 first then follow
   // up with a rounding shift by 1.
   dct_pass(vshrn_n_s32, 16);

   {
      // pack and round
      // (the remaining rounding shift by 1, narrowing to 8 bits with
      // unsigned saturation)
      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);

      // again, these can translate into one instruction, but often don't.
#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }

      // sadly can't use interleaved stores here since we only write
      // 8 bytes to each scan line!

      // 8x8 8-bit transpose pass 1
      dct_trn8_8(p0, p1);
      dct_trn8_8(p2, p3);
      dct_trn8_8(p4, p5);
      dct_trn8_8(p6, p7);

      // pass 2
      dct_trn8_16(p0, p2);
      dct_trn8_16(p1, p3);
      dct_trn8_16(p4, p6);
      dct_trn8_16(p5, p7);

      // pass 3
      dct_trn8_32(p0, p4);
      dct_trn8_32(p1, p5);
      dct_trn8_32(p2, p6);
      dct_trn8_32(p3, p7);

      // store
      // one 8-byte row per vector, advancing by out_stride between rows
      vst1_u8(out, p0); out += out_stride;
      vst1_u8(out, p1); out += out_stride;
      vst1_u8(out, p2); out += out_stride;
      vst1_u8(out, p3); out += out_stride;
      vst1_u8(out, p4); out += out_stride;
      vst1_u8(out, p5); out += out_stride;
      vst1_u8(out, p6); out += out_stride;
      vst1_u8(out, p7);

#undef dct_trn8_8
#undef dct_trn8_16
#undef dct_trn8_32
   }

#undef dct_long_mul
#undef dct_long_mac
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_pass
}
   2182 
   2183 #endif // STBI_NEON
   2184 
   2185 #define STBI__MARKER_none  0xff
   2186 // if there's a pending marker from the entropy stream, return that
   2187 // otherwise, fetch from the stream and get a marker. if there's no
   2188 // marker, return 0xff, which is never a valid marker value
   2189 static stbi_uc stbi__get_marker(stbi__jpeg *j)
   2190 {
   2191    stbi_uc x;
   2192    if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; }
   2193    x = stbi__get8(j->s);
   2194    if (x != 0xff) return STBI__MARKER_none;
   2195    while (x == 0xff)
   2196       x = stbi__get8(j->s); // consume repeated 0xff fill bytes
   2197    return x;
   2198 }
   2199 
// in each scan, we'll have scan_n components, and the order
// of the components is specified by order[]

// true if marker byte x lies in 0xd0..0xd7, the range this decoder treats
// as restart markers. x is evaluated twice, so pass an expression
// without side effects.
#define STBI__RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)
   2203 
   2204 // after a restart interval, stbi__jpeg_reset the entropy decoder and
   2205 // the dc prediction
   2206 static void stbi__jpeg_reset(stbi__jpeg *j)
   2207 {
   2208    j->code_bits = 0;
   2209    j->code_buffer = 0;
   2210    j->nomore = 0;
   2211    j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
   2212    j->marker = STBI__MARKER_none;
   2213    j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
   2214    j->eob_run = 0;
   2215    // no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
   2216    // since we don't even allow 1<<30 pixels
   2217 }
   2218 
   2219 static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
   2220 {
   2221    stbi__jpeg_reset(z);
   2222    if (!z->progressive) {
   2223       if (z->scan_n == 1) {
   2224          int i,j;
   2225          STBI_SIMD_ALIGN(short, data[64]);
   2226          int n = z->order[0];
   2227          // non-interleaved data, we just need to process one block at a time,
   2228          // in trivial scanline order
   2229          // number of blocks to do just depends on how many actual "pixels" this
   2230          // component has, independent of interleaved MCU blocking and such
   2231          int w = (z->img_comp[n].x+7) >> 3;
   2232          int h = (z->img_comp[n].y+7) >> 3;
   2233          for (j=0; j < h; ++j) {
   2234             for (i=0; i < w; ++i) {
   2235                int ha = z->img_comp[n].ha;
   2236                if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
   2237                z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
   2238                // every data block is an MCU, so countdown the restart interval
   2239                if (--z->todo <= 0) {
   2240                   if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
   2241                   // if it's NOT a restart, then just bail, so we get corrupt data
   2242                   // rather than no data
   2243                   if (!STBI__RESTART(z->marker)) return 1;
   2244                   stbi__jpeg_reset(z);
   2245                }
   2246             }
   2247          }
   2248          return 1;
   2249       } else { // interleaved
   2250          int i,j,k,x,y;
   2251          STBI_SIMD_ALIGN(short, data[64]);
   2252          for (j=0; j < z->img_mcu_y; ++j) {
   2253             for (i=0; i < z->img_mcu_x; ++i) {
   2254                // scan an interleaved mcu... process scan_n components in order
   2255                for (k=0; k < z->scan_n; ++k) {
   2256                   int n = z->order[k];
   2257                   // scan out an mcu's worth of this component; that's just determined
   2258                   // by the basic H and V specified for the component
   2259                   for (y=0; y < z->img_comp[n].v; ++y) {
   2260                      for (x=0; x < z->img_comp[n].h; ++x) {
   2261                         int x2 = (i*z->img_comp[n].h + x)*8;
   2262                         int y2 = (j*z->img_comp[n].v + y)*8;
   2263                         int ha = z->img_comp[n].ha;
   2264                         if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
   2265                         z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data);
   2266                      }
   2267                   }
   2268                }
   2269                // after all interleaved components, that's an interleaved MCU,
   2270                // so now count down the restart interval
   2271                if (--z->todo <= 0) {
   2272                   if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
   2273                   if (!STBI__RESTART(z->marker)) return 1;
   2274                   stbi__jpeg_reset(z);
   2275                }
   2276             }
   2277          }
   2278          return 1;
   2279       }
   2280    } else {
   2281       if (z->scan_n == 1) {
   2282          int i,j;
   2283          int n = z->order[0];
   2284          // non-interleaved data, we just need to process one block at a time,
   2285          // in trivial scanline order
   2286          // number of blocks to do just depends on how many actual "pixels" this
   2287          // component has, independent of interleaved MCU blocking and such
   2288          int w = (z->img_comp[n].x+7) >> 3;
   2289          int h = (z->img_comp[n].y+7) >> 3;
   2290          for (j=0; j < h; ++j) {
   2291             for (i=0; i < w; ++i) {
   2292                short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
   2293                if (z->spec_start == 0) {
   2294                   if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
   2295                      return 0;
   2296                } else {
   2297                   int ha = z->img_comp[n].ha;
   2298                   if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
   2299                      return 0;
   2300                }
   2301                // every data block is an MCU, so countdown the restart interval
   2302                if (--z->todo <= 0) {
   2303                   if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
   2304                   if (!STBI__RESTART(z->marker)) return 1;
   2305                   stbi__jpeg_reset(z);
   2306                }
   2307             }
   2308          }
   2309          return 1;
   2310       } else { // interleaved
   2311          int i,j,k,x,y;
   2312          for (j=0; j < z->img_mcu_y; ++j) {
   2313             for (i=0; i < z->img_mcu_x; ++i) {
   2314                // scan an interleaved mcu... process scan_n components in order
   2315                for (k=0; k < z->scan_n; ++k) {
   2316                   int n = z->order[k];
   2317                   // scan out an mcu's worth of this component; that's just determined
   2318                   // by the basic H and V specified for the component
   2319                   for (y=0; y < z->img_comp[n].v; ++y) {
   2320                      for (x=0; x < z->img_comp[n].h; ++x) {
   2321                         int x2 = (i*z->img_comp[n].h + x);
   2322                         int y2 = (j*z->img_comp[n].v + y);
   2323                         short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
   2324                         if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
   2325                            return 0;
   2326                      }
   2327                   }
   2328                }
   2329                // after all interleaved components, that's an interleaved MCU,
   2330                // so now count down the restart interval
   2331                if (--z->todo <= 0) {
   2332                   if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
   2333                   if (!STBI__RESTART(z->marker)) return 1;
   2334                   stbi__jpeg_reset(z);
   2335                }
   2336             }
   2337          }
   2338          return 1;
   2339       }
   2340    }
   2341 }
   2342 
   2343 static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant)
   2344 {
   2345    int i;
   2346    for (i=0; i < 64; ++i)
   2347       data[i] *= dequant[i];
   2348 }
   2349 
   2350 static void stbi__jpeg_finish(stbi__jpeg *z)
   2351 {
   2352    if (z->progressive) {
   2353       // dequantize and idct the data
   2354       int i,j,n;
   2355       for (n=0; n < z->s->img_n; ++n) {
   2356          int w = (z->img_comp[n].x+7) >> 3;
   2357          int h = (z->img_comp[n].y+7) >> 3;
   2358          for (j=0; j < h; ++j) {
   2359             for (i=0; i < w; ++i) {
   2360                short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
   2361                stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
   2362                z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
   2363             }
   2364          }
   2365       }
   2366    }
   2367 }
   2368 
   2369 static int stbi__process_marker(stbi__jpeg *z, int m)
   2370 {
   2371    int L;
   2372    switch (m) {
   2373       case STBI__MARKER_none: // no marker found
   2374          return stbi__err("expected marker","Corrupt JPEG");
   2375 
   2376       case 0xDD: // DRI - specify restart interval
   2377          if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG");
   2378          z->restart_interval = stbi__get16be(z->s);
   2379          return 1;
   2380 
   2381       case 0xDB: // DQT - define quantization table
   2382          L = stbi__get16be(z->s)-2;
   2383          while (L > 0) {
   2384             int q = stbi__get8(z->s);
   2385             int p = q >> 4, sixteen = (p != 0);
   2386             int t = q & 15,i;
   2387             if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG");
   2388             if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
   2389 
   2390             for (i=0; i < 64; ++i)
   2391                z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
   2392             L -= (sixteen ? 129 : 65);
   2393          }
   2394          return L==0;
   2395 
   2396       case 0xC4: // DHT - define huffman table
   2397          L = stbi__get16be(z->s)-2;
   2398          while (L > 0) {
   2399             stbi_uc *v;
   2400             int sizes[16],i,n=0;
   2401             int q = stbi__get8(z->s);
   2402             int tc = q >> 4;
   2403             int th = q & 15;
   2404             if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG");
   2405             for (i=0; i < 16; ++i) {
   2406                sizes[i] = stbi__get8(z->s);
   2407                n += sizes[i];
   2408             }
   2409             L -= 17;
   2410             if (tc == 0) {
   2411                if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
   2412                v = z->huff_dc[th].values;
   2413             } else {
   2414                if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0;
   2415                v = z->huff_ac[th].values;
   2416             }
   2417             for (i=0; i < n; ++i)
   2418                v[i] = stbi__get8(z->s);
   2419             if (tc != 0)
   2420                stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
   2421             L -= n;
   2422          }
   2423          return L==0;
   2424    }
   2425 
   2426    // check for comment block or APP blocks
   2427    if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
   2428       L = stbi__get16be(z->s);
   2429       if (L < 2) {
   2430          if (m == 0xFE)
   2431             return stbi__err("bad COM len","Corrupt JPEG");
   2432          else
   2433             return stbi__err("bad APP len","Corrupt JPEG");
   2434       }
   2435       L -= 2;
   2436 
   2437       if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
   2438          static const unsigned char tag[5] = {'J','F','I','F','\0'};
   2439          int ok = 1;
   2440          int i;
   2441          for (i=0; i < 5; ++i)
   2442             if (stbi__get8(z->s) != tag[i])
   2443                ok = 0;
   2444          L -= 5;
   2445          if (ok)
   2446             z->jfif = 1;
   2447       } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
   2448          static const unsigned char tag[6] = {'A','d','o','b','e','\0'};
   2449          int ok = 1;
   2450          int i;
   2451          for (i=0; i < 6; ++i)
   2452             if (stbi__get8(z->s) != tag[i])
   2453                ok = 0;
   2454          L -= 6;
   2455          if (ok) {
   2456             stbi__get8(z->s); // version
   2457             stbi__get16be(z->s); // flags0
   2458             stbi__get16be(z->s); // flags1
   2459             z->app14_color_transform = stbi__get8(z->s); // color transform
   2460             L -= 6;
   2461          }
   2462       }
   2463 
   2464       stbi__skip(z->s, L);
   2465       return 1;
   2466    }
   2467 
   2468    return stbi__err("unknown marker","Corrupt JPEG");
   2469 }
   2470 
   2471 // after we see SOS
   2472 static int stbi__process_scan_header(stbi__jpeg *z)
   2473 {
   2474    int i;
   2475    int Ls = stbi__get16be(z->s);
   2476    z->scan_n = stbi__get8(z->s);
   2477    if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG");
   2478    if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG");
   2479    for (i=0; i < z->scan_n; ++i) {
   2480       int id = stbi__get8(z->s), which;
   2481       int q = stbi__get8(z->s);
   2482       for (which = 0; which < z->s->img_n; ++which)
   2483          if (z->img_comp[which].id == id)
   2484             break;
   2485       if (which == z->s->img_n) return 0; // no match
   2486       z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG");
   2487       z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG");
   2488       z->order[i] = which;
   2489    }
   2490 
   2491    {
   2492       int aa;
   2493       z->spec_start = stbi__get8(z->s);
   2494       z->spec_end   = stbi__get8(z->s); // should be 63, but might be 0
   2495       aa = stbi__get8(z->s);
   2496       z->succ_high = (aa >> 4);
   2497       z->succ_low  = (aa & 15);
   2498       if (z->progressive) {
   2499          if (z->spec_start > 63 || z->spec_end > 63  || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
   2500             return stbi__err("bad SOS", "Corrupt JPEG");
   2501       } else {
   2502          if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG");
   2503          if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG");
   2504          z->spec_end = 63;
   2505       }
   2506    }
   2507 
   2508    return 1;
   2509 }
   2510 
   2511 static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why)
   2512 {
   2513    int i;
   2514    for (i=0; i < ncomp; ++i) {
   2515       if (z->img_comp[i].raw_data) {
   2516          STBI_FREE(z->img_comp[i].raw_data);
   2517          z->img_comp[i].raw_data = NULL;
   2518          z->img_comp[i].data = NULL;
   2519       }
   2520       if (z->img_comp[i].raw_coeff) {
   2521          STBI_FREE(z->img_comp[i].raw_coeff);
   2522          z->img_comp[i].raw_coeff = 0;
   2523          z->img_comp[i].coeff = 0;
   2524       }
   2525       if (z->img_comp[i].linebuf) {
   2526          STBI_FREE(z->img_comp[i].linebuf);
   2527          z->img_comp[i].linebuf = NULL;
   2528       }
   2529    }
   2530    return why;
   2531 }
   2532 
   2533 static int stbi__process_frame_header(stbi__jpeg *z, int scan)
   2534 {
   2535    stbi__context *s = z->s;
   2536    int Lf,p,i,q, h_max=1,v_max=1,c;
   2537    Lf = stbi__get16be(s);         if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG
   2538    p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
   2539    s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
   2540    s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
   2541    c = stbi__get8(s);
   2542    if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG");
   2543    s->img_n = c;
   2544    for (i=0; i < c; ++i) {
   2545       z->img_comp[i].data = NULL;
   2546       z->img_comp[i].linebuf = NULL;
   2547    }
   2548 
   2549    if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
   2550 
   2551    z->rgb = 0;
   2552    for (i=0; i < s->img_n; ++i) {
   2553       static const unsigned char rgb[3] = { 'R', 'G', 'B' };
   2554       z->img_comp[i].id = stbi__get8(s);
   2555       if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
   2556          ++z->rgb;
   2557       q = stbi__get8(s);
   2558       z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
   2559       z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
   2560       z->img_comp[i].tq = stbi__get8(s);  if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG");
   2561    }
   2562 
   2563    if (scan != STBI__SCAN_load) return 1;
   2564 
   2565    if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode");
   2566 
   2567    for (i=0; i < s->img_n; ++i) {
   2568       if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
   2569       if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
   2570    }
   2571 
   2572    // compute interleaved mcu info
   2573    z->img_h_max = h_max;
   2574    z->img_v_max = v_max;
   2575    z->img_mcu_w = h_max * 8;
   2576    z->img_mcu_h = v_max * 8;
   2577    // these sizes can't be more than 17 bits
   2578    z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
   2579    z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
   2580 
   2581    for (i=0; i < s->img_n; ++i) {
   2582       // number of effective pixels (e.g. for non-interleaved MCU)
   2583       z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
   2584       z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
   2585       // to simplify generation, we'll allocate enough memory to decode
   2586       // the bogus oversized data from using interleaved MCUs and their
   2587       // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
   2588       // discard the extra data until colorspace conversion
   2589       //
   2590       // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
   2591       // so these muls can't overflow with 32-bit ints (which we require)
   2592       z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
   2593       z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
   2594       z->img_comp[i].coeff = 0;
   2595       z->img_comp[i].raw_coeff = 0;
   2596       z->img_comp[i].linebuf = NULL;
   2597       z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
   2598       if (z->img_comp[i].raw_data == NULL)
   2599          return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
   2600       // align blocks for idct using mmx/sse
   2601       z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
   2602       if (z->progressive) {
   2603          // w2, h2 are multiples of 8 (see above)
   2604          z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
   2605          z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
   2606          z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
   2607          if (z->img_comp[i].raw_coeff == NULL)
   2608             return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
   2609          z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
   2610       }
   2611    }
   2612 
   2613    return 1;
   2614 }
   2615 
// Marker-code predicates. Each takes a marker code as returned by
// stbi__get_marker() and evaluates to non-zero when it matches.
// use comparisons since in some cases we handle more than one case (e.g. SOF)

// DNL: segment carrying the image's line count (checked against img_y by the decoder)
#define stbi__DNL(x)         ((x) == 0xdc)
// SOI: start of image, must be the first marker in the stream
#define stbi__SOI(x)         ((x) == 0xd8)
// EOI: end of image, terminates the decode loop
#define stbi__EOI(x)         ((x) == 0xd9)
// SOF: start of frame; 0xc0, 0xc1 and 0xc2 are the frame types accepted here
#define stbi__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
// SOS: start of scan, followed by a scan header and entropy-coded data
#define stbi__SOS(x)         ((x) == 0xda)

// true only for the SOF variant that marks the image as progressive
#define stbi__SOF_progressive(x)   ((x) == 0xc2)
   2624 
   2625 static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
   2626 {
   2627    int m;
   2628    z->jfif = 0;
   2629    z->app14_color_transform = -1; // valid values are 0,1,2
   2630    z->marker = STBI__MARKER_none; // initialize cached marker to empty
   2631    m = stbi__get_marker(z);
   2632    if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG");
   2633    if (scan == STBI__SCAN_type) return 1;
   2634    m = stbi__get_marker(z);
   2635    while (!stbi__SOF(m)) {
   2636       if (!stbi__process_marker(z,m)) return 0;
   2637       m = stbi__get_marker(z);
   2638       while (m == STBI__MARKER_none) {
   2639          // some files have extra padding after their blocks, so ok, we'll scan
   2640          if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG");
   2641          m = stbi__get_marker(z);
   2642       }
   2643    }
   2644    z->progressive = stbi__SOF_progressive(m);
   2645    if (!stbi__process_frame_header(z, scan)) return 0;
   2646    return 1;
   2647 }
   2648 
   2649 // decode image to YCbCr format
   2650 static int stbi__decode_jpeg_image(stbi__jpeg *j)
   2651 {
   2652    int m;
   2653    for (m = 0; m < 4; m++) {
   2654       j->img_comp[m].raw_data = NULL;
   2655       j->img_comp[m].raw_coeff = NULL;
   2656    }
   2657    j->restart_interval = 0;
   2658    if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0;
   2659    m = stbi__get_marker(j);
   2660    while (!stbi__EOI(m)) {
   2661       if (stbi__SOS(m)) {
   2662          if (!stbi__process_scan_header(j)) return 0;
   2663          if (!stbi__parse_entropy_coded_data(j)) return 0;
   2664          if (j->marker == STBI__MARKER_none ) {
   2665             // handle 0s at the end of image data from IP Kamera 9060
   2666             while (!stbi__at_eof(j->s)) {
   2667                int x = stbi__get8(j->s);
   2668                if (x == 255) {
   2669                   j->marker = stbi__get8(j->s);
   2670                   break;
   2671                }
   2672             }
   2673             // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
   2674          }
   2675       } else if (stbi__DNL(m)) {
   2676          int Ld = stbi__get16be(j->s);
   2677          stbi__uint32 NL = stbi__get16be(j->s);
   2678          if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
   2679          if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
   2680       } else {
   2681          if (!stbi__process_marker(j, m)) return 0;
   2682       }
   2683       m = stbi__get_marker(j);
   2684    }
   2685    if (j->progressive)
   2686       stbi__jpeg_finish(j);
   2687    return 1;
   2688 }
   2689 
// static jfif-centered resampling (across block boundaries)

// Signature shared by the row resamplers below. `out` is the destination
// buffer, `in0` and `in1` are two input rows (named in_near / in_far by the
// implementations), `w` is the number of input samples and `hs` the
// horizontal expansion factor. The returned pointer is the row holding the
// result, which is not always `out` (resample_row_1 returns its input row).
typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1,
                                    int w, int hs);

// shift right by 2 (divide by 4) and truncate to a byte; callers add the rounding bias themselves
#define stbi__div4(x) ((stbi_uc) ((x) >> 2))
   2696 
   2697 static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
   2698 {
   2699    STBI_NOTUSED(out);
   2700    STBI_NOTUSED(in_far);
   2701    STBI_NOTUSED(w);
   2702    STBI_NOTUSED(hs);
   2703    return in_near;
   2704 }
   2705 
   2706 static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
   2707 {
   2708    // need to generate two samples vertically for every one in input
   2709    int i;
   2710    STBI_NOTUSED(hs);
   2711    for (i=0; i < w; ++i)
   2712       out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2);
   2713    return out;
   2714 }
   2715 
   2716 static stbi_uc*  stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
   2717 {
   2718    // need to generate two samples horizontally for every one in input
   2719    int i;
   2720    stbi_uc *input = in_near;
   2721 
   2722    if (w == 1) {
   2723       // if only one sample, can't do any interpolation
   2724       out[0] = out[1] = input[0];
   2725       return out;
   2726    }
   2727 
   2728    out[0] = input[0];
   2729    out[1] = stbi__div4(input[0]*3 + input[1] + 2);
   2730    for (i=1; i < w-1; ++i) {
   2731       int n = 3*input[i]+2;
   2732       out[i*2+0] = stbi__div4(n+input[i-1]);
   2733       out[i*2+1] = stbi__div4(n+input[i+1]);
   2734    }
   2735    out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 2);
   2736    out[i*2+1] = input[w-1];
   2737 
   2738    STBI_NOTUSED(in_far);
   2739    STBI_NOTUSED(hs);
   2740 
   2741    return out;
   2742 }
   2743 
   2744 #define stbi__div16(x) ((stbi_uc) ((x) >> 4))
   2745 
   2746 static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
   2747 {
   2748    // need to generate 2x2 samples for every one in input
   2749    int i,t0,t1;
   2750    if (w == 1) {
   2751       out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
   2752       return out;
   2753    }
   2754 
   2755    t1 = 3*in_near[0] + in_far[0];
   2756    out[0] = stbi__div4(t1+2);
   2757    for (i=1; i < w; ++i) {
   2758       t0 = t1;
   2759       t1 = 3*in_near[i]+in_far[i];
   2760       out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
   2761       out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
   2762    }
   2763    out[w*2-1] = stbi__div4(t1+2);
   2764 
   2765    STBI_NOTUSED(hs);
   2766 
   2767    return out;
   2768 }
   2769 
#if defined(STBI_SSE2) || defined(STBI_NEON)
// SIMD (SSE2 or NEON) variant of the 2x horizontal + vertical row resampler.
// `w` input columns from in_near / in_far produce 2*w samples in `out`;
// `hs` is unused. Columns are processed eight at a time by the vector loop,
// and the remaining columns (always at least the last one) by scalar code
// using the same 3:1 filter. Returns `out`.
static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
{
   // need to generate 2x2 samples for every one in input
   int i=0,t0,t1;

   if (w == 1) {
      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
      return out;
   }

   // t1 carries the vertically filtered value of the column just left of the
   // one about to be processed (column 0 itself before the first iteration)
   t1 = 3*in_near[0] + in_far[0];
   // process groups of 8 pixels for as long as we can.
   // note we can't handle the last pixel in a row in this loop
   // because we need to handle the filter boundary conditions.
   // (w-1) & ~7 rounds w-1 down to a multiple of 8, so the in_near[i+8] /
   // in_far[i+8] reads inside the loop never go past index w-1.
   for (; i < ((w-1) & ~7); i += 8) {
#if defined(STBI_SSE2)
      // load and perform the vertical filtering pass
      // this uses 3*x + y = 4*x + (y - x)
      __m128i zero  = _mm_setzero_si128();
      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
      __m128i diff  = _mm_sub_epi16(farw, nearw);
      __m128i nears = _mm_slli_epi16(nearw, 2);
      __m128i curr  = _mm_add_epi16(nears, diff); // current row

      // horizontal filter works the same based on shifted vers of current
      // row. "prev" is current row shifted right by 1 pixel; we need to
      // insert the previous pixel value (from t1).
      // "next" is current row shifted left by 1 pixel, with first pixel
      // of next block of 8 pixels added in.
      __m128i prv0 = _mm_slli_si128(curr, 2);
      __m128i nxt0 = _mm_srli_si128(curr, 2);
      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);

      // horizontal filter, polyphase implementation since it's convenient:
      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
      // note the shared term.
      __m128i bias  = _mm_set1_epi16(8);
      __m128i curs = _mm_slli_epi16(curr, 2);
      __m128i prvd = _mm_sub_epi16(prev, curr);
      __m128i nxtd = _mm_sub_epi16(next, curr);
      __m128i curb = _mm_add_epi16(curs, bias);
      __m128i even = _mm_add_epi16(prvd, curb);
      __m128i odd  = _mm_add_epi16(nxtd, curb);

      // interleave even and odd pixels, then undo scaling.
      __m128i int0 = _mm_unpacklo_epi16(even, odd);
      __m128i int1 = _mm_unpackhi_epi16(even, odd);
      __m128i de0  = _mm_srli_epi16(int0, 4);
      __m128i de1  = _mm_srli_epi16(int1, 4);

      // pack and write output
      __m128i outv = _mm_packus_epi16(de0, de1);
      _mm_storeu_si128((__m128i *) (out + i*2), outv);
#elif defined(STBI_NEON)
      // load and perform the vertical filtering pass
      // this uses 3*x + y = 4*x + (y - x)
      uint8x8_t farb  = vld1_u8(in_far + i);
      uint8x8_t nearb = vld1_u8(in_near + i);
      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
      int16x8_t curr  = vaddq_s16(nears, diff); // current row

      // horizontal filter works the same based on shifted vers of current
      // row. "prev" is current row shifted right by 1 pixel; we need to
      // insert the previous pixel value (from t1).
      // "next" is current row shifted left by 1 pixel, with first pixel
      // of next block of 8 pixels added in.
      int16x8_t prv0 = vextq_s16(curr, curr, 7);
      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);

      // horizontal filter, polyphase implementation since it's convenient:
      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
      // note the shared term.
      int16x8_t curs = vshlq_n_s16(curr, 2);
      int16x8_t prvd = vsubq_s16(prev, curr);
      int16x8_t nxtd = vsubq_s16(next, curr);
      int16x8_t even = vaddq_s16(curs, prvd);
      int16x8_t odd  = vaddq_s16(curs, nxtd);

      // undo scaling and round, then store with even/odd phases interleaved
      uint8x8x2_t o;
      o.val[0] = vqrshrun_n_s16(even, 4);
      o.val[1] = vqrshrun_n_s16(odd,  4);
      vst2_u8(out + i*2, o);
#endif

      // "previous" value for next iter
      t1 = 3*in_near[i+7] + in_far[i+7];
   }

   // first column not covered by the vector loop: only its even-phase output
   // is produced here. When i == 0, t0 equals t1 and the result is the same
   // as the plain vertical blend; when i > 0, out[i*2-1] was already written
   // by the last vector store.
   t0 = t1;
   t1 = 3*in_near[i] + in_far[i];
   out[i*2] = stbi__div16(3*t1 + t0 + 8);

   // remaining columns, scalar: odd phase of the previous column plus even
   // phase of the current one
   for (++i; i < w; ++i) {
      t0 = t1;
      t1 = 3*in_near[i]+in_far[i];
      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
   }
   // right edge has no neighbour to its right: vertical blend only
   out[w*2-1] = stbi__div4(t1+2);

   STBI_NOTUSED(hs);

   return out;
}
#endif
   2886 
   2887 static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
   2888 {
   2889    // resample with nearest-neighbor
   2890    int i,j;
   2891    STBI_NOTUSED(in_far);
   2892    for (i=0; i < w; ++i)
   2893       for (j=0; j < hs; ++j)
   2894          out[i*hs+j] = in_near[i];
   2895    return out;
   2896 }
   2897 
   2898 // this is a reduced-precision calculation of YCbCr-to-RGB introduced
   2899 // to make sure the code produces the same results in both SIMD and scalar
   2900 #define stbi__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
   2901 static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
   2902 {
   2903    int i;
   2904    for (i=0; i < count; ++i) {
   2905       int y_fixed = (y[i] << 20) + (1<<19); // rounding
   2906       int r,g,b;
   2907       int cr = pcr[i] - 128;
   2908       int cb = pcb[i] - 128;
   2909       r = y_fixed +  cr* stbi__float2fixed(1.40200f);
   2910       g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
   2911       b = y_fixed                                     +   cb* stbi__float2fixed(1.77200f);
   2912       r >>= 20;
   2913       g >>= 20;
   2914       b >>= 20;
   2915       if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
   2916       if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
   2917       if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
   2918       out[0] = (stbi_uc)r;
   2919       out[1] = (stbi_uc)g;
   2920       out[2] = (stbi_uc)b;
   2921       out[3] = 255;
   2922       out += step;
   2923    }
   2924 }
   2925 
   2926 #if defined(STBI_SSE2) || defined(STBI_NEON)
   2927 static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
   2928 {
   2929    int i = 0;
   2930 
   2931 #ifdef STBI_SSE2
   2932    // step == 3 is pretty ugly on the final interleave, and i'm not convinced
   2933    // it's useful in practice (you wouldn't use it for textures, for example).
   2934    // so just accelerate step == 4 case.
   2935    if (step == 4) {
   2936       // this is a fairly straightforward implementation and not super-optimized.
   2937       __m128i signflip  = _mm_set1_epi8(-0x80);
   2938       __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
   2939       __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
   2940       __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
   2941       __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
   2942       __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
   2943       __m128i xw = _mm_set1_epi16(255); // alpha channel
   2944 
   2945       for (; i+7 < count; i += 8) {
   2946          // load
   2947          __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
   2948          __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
   2949          __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
   2950          __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
   2951          __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
   2952 
   2953          // unpack to short (and left-shift cr, cb by 8)
   2954          __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
   2955          __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
   2956          __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
   2957 
   2958          // color transform
   2959          __m128i yws = _mm_srli_epi16(yw, 4);
   2960          __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
   2961          __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
   2962          __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
   2963          __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
   2964          __m128i rws = _mm_add_epi16(cr0, yws);
   2965          __m128i gwt = _mm_add_epi16(cb0, yws);
   2966          __m128i bws = _mm_add_epi16(yws, cb1);
   2967          __m128i gws = _mm_add_epi16(gwt, cr1);
   2968 
   2969          // descale
   2970          __m128i rw = _mm_srai_epi16(rws, 4);
   2971          __m128i bw = _mm_srai_epi16(bws, 4);
   2972          __m128i gw = _mm_srai_epi16(gws, 4);
   2973 
   2974          // back to byte, set up for transpose
   2975          __m128i brb = _mm_packus_epi16(rw, bw);
   2976          __m128i gxb = _mm_packus_epi16(gw, xw);
   2977 
   2978          // transpose to interleave channels
   2979          __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
   2980          __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
   2981          __m128i o0 = _mm_unpacklo_epi16(t0, t1);
   2982          __m128i o1 = _mm_unpackhi_epi16(t0, t1);
   2983 
   2984          // store
   2985          _mm_storeu_si128((__m128i *) (out + 0), o0);
   2986          _mm_storeu_si128((__m128i *) (out + 16), o1);
   2987          out += 32;
   2988       }
   2989    }
   2990 #endif
   2991 
   2992 #ifdef STBI_NEON
   2993    // in this version, step=3 support would be easy to add. but is there demand?
   2994    if (step == 4) {
   2995       // this is a fairly straightforward implementation and not super-optimized.
   2996       uint8x8_t signflip = vdup_n_u8(0x80);
   2997       int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
   2998       int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
   2999       int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
   3000       int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));
   3001 
   3002       for (; i+7 < count; i += 8) {
   3003          // load
   3004          uint8x8_t y_bytes  = vld1_u8(y + i);
   3005          uint8x8_t cr_bytes = vld1_u8(pcr + i);
   3006          uint8x8_t cb_bytes = vld1_u8(pcb + i);
   3007          int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
   3008          int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
   3009 
   3010          // expand to s16
   3011          int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
   3012          int16x8_t crw = vshll_n_s8(cr_biased, 7);
   3013          int16x8_t cbw = vshll_n_s8(cb_biased, 7);
   3014 
   3015          // color transform
   3016          int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
   3017          int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
   3018          int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
   3019          int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
   3020          int16x8_t rws = vaddq_s16(yws, cr0);
   3021          int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
   3022          int16x8_t bws = vaddq_s16(yws, cb1);
   3023 
   3024          // undo scaling, round, convert to byte
   3025          uint8x8x4_t o;
   3026          o.val[0] = vqrshrun_n_s16(rws, 4);
   3027          o.val[1] = vqrshrun_n_s16(gws, 4);
   3028          o.val[2] = vqrshrun_n_s16(bws, 4);
   3029          o.val[3] = vdup_n_u8(255);
   3030 
   3031          // store, interleaving r/g/b/a
   3032          vst4_u8(out, o);
   3033          out += 8*4;
   3034       }
   3035    }
   3036 #endif
   3037 
   3038    for (; i < count; ++i) {
   3039       int y_fixed = (y[i] << 20) + (1<<19); // rounding
   3040       int r,g,b;
   3041       int cr = pcr[i] - 128;
   3042       int cb = pcb[i] - 128;
   3043       r = y_fixed + cr* stbi__float2fixed(1.40200f);
   3044       g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
   3045       b = y_fixed                                   +   cb* stbi__float2fixed(1.77200f);
   3046       r >>= 20;
   3047       g >>= 20;
   3048       b >>= 20;
   3049       if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
   3050       if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
   3051       if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
   3052       out[0] = (stbi_uc)r;
   3053       out[1] = (stbi_uc)g;
   3054       out[2] = (stbi_uc)b;
   3055       out[3] = 255;
   3056       out += step;
   3057    }
   3058 }
   3059 #endif
   3060 
   3061 // set up the kernels
   3062 static void stbi__setup_jpeg(stbi__jpeg *j)
   3063 {
   3064    j->idct_block_kernel = stbi__idct_block;
   3065    j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
   3066    j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
   3067 
   3068 #ifdef STBI_SSE2
   3069    if (stbi__sse2_available()) {
   3070       j->idct_block_kernel = stbi__idct_simd;
   3071       j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
   3072       j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
   3073    }
   3074 #endif
   3075 
   3076 #ifdef STBI_NEON
   3077    j->idct_block_kernel = stbi__idct_simd;
   3078    j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
   3079    j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
   3080 #endif
   3081 }
   3082 
   3083 // clean up the temporary component buffers
   3084 static void stbi__cleanup_jpeg(stbi__jpeg *j)
   3085 {
   3086    stbi__free_jpeg_components(j, j->s->img_n, 0);
   3087 }
   3088 
   3089 typedef struct
   3090 {
   3091    resample_row_func resample;
   3092    stbi_uc *line0,*line1;
   3093    int hs,vs;   // expansion factor in each axis
   3094    int w_lores; // horizontal pixels pre-expansion
   3095    int ystep;   // how far through vertical expansion we are
   3096    int ypos;    // which pre-expansion row we're on
   3097 } stbi__resample;
   3098 
   3099 // fast 0..255 * 0..255 => 0..255 rounded multiplication
   3100 static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
   3101 {
   3102    unsigned int t = x*y + 128;
   3103    return (stbi_uc) ((t + (t >>8)) >> 8);
   3104 }
   3105 
   // Decode the JPEG attached to z and return a newly allocated buffer of
   // interleaved 8-bit pixels, n bytes per pixel, rows stored top to bottom.
   //   out_x, out_y : receive the image width and height
   //   comp         : if non-NULL, receives 3 or 1 depending on the file's
   //                  component count (not on req_comp)
   //   req_comp     : 0 = keep the file's format (3 or 1), else 1..4 bytes/pixel
   // Returns NULL on a bad req_comp, a decode failure, or an allocation failure.
   // The per-component temporaries are released via stbi__cleanup_jpeg on every
   // path after decoding starts; z itself is not freed here.
   3106 static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
   3107 {
   3108    int n, decode_n, is_rgb;
   3109    z->s->img_n = 0; // make stbi__cleanup_jpeg safe
   3110 
   3111    // validate req_comp
   3112    if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
   3113 
   3114    // load a jpeg image from whichever source, but leave in YCbCr format
   3115    if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
   3116 
   3117    // determine actual number of components to generate
   3118    n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
   3119 
           // three components are treated as already-RGB when z->rgb == 3, or when
           // the APP14 transform is 0 and no JFIF marker was seen
   3120    is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
   3121 
           // grey output from a YCbCr image only reads component 0, so the other
           // two planes are not resampled at all
   3122    if (z->s->img_n == 3 && n < 3 && !is_rgb)
   3123       decode_n = 1;
   3124    else
   3125       decode_n = z->s->img_n;
   3126 
   3127    // resample and color-convert
   3128    {
   3129       int k;
   3130       unsigned int i,j;
   3131       stbi_uc *output;
   3132       stbi_uc *coutput[4]; // resampled row for each decoded component
   3133 
   3134       stbi__resample res_comp[4];
   3135 
   3136       for (k=0; k < decode_n; ++k) {
   3137          stbi__resample *r = &res_comp[k];
   3138 
   3139          // allocate line buffer big enough for upsampling off the edges
   3140          // with upsample factor of 4
   3141          z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3);
   3142          if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
   3143 
   3144          r->hs      = z->img_h_max / z->img_comp[k].h;
   3145          r->vs      = z->img_v_max / z->img_comp[k].v;
   3146          r->ystep   = r->vs >> 1;
   3147          r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
   3148          r->ypos    = 0;
   3149          r->line0   = r->line1 = z->img_comp[k].data;
   3150 
                 // pick a specialised resampler for the 1x/2x cases, generic otherwise
   3151          if      (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
   3152          else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
   3153          else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
   3154          else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
   3155          else                               r->resample = stbi__resample_row_generic;
   3156       }
   3157 
   3158       // this allocation is the last step that can fail; nothing after it returns an error
              // NOTE(review): the trailing 1 presumably adds one byte of slack, which the
              // unconditional out[3] / out[1] stores below rely on for the final pixel --
              // confirm against stbi__malloc_mad3
   3159       output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
   3160       if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
   3161 
   3162       // now go ahead and resample
   3163       for (j=0; j < z->s->img_y; ++j) {
   3164          stbi_uc *out = output + n * z->s->img_x * j;
   3165          for (k=0; k < decode_n; ++k) {
   3166             stbi__resample *r = &res_comp[k];
                    // y_bot decides which of line0/line1 is passed first to the resampler
   3167             int y_bot = r->ystep >= (r->vs >> 1);
   3168             coutput[k] = r->resample(z->img_comp[k].linebuf,
   3169                                      y_bot ? r->line1 : r->line0,
   3170                                      y_bot ? r->line0 : r->line1,
   3171                                      r->w_lores, r->hs);
                    // after vs output rows, advance to the next source row; line1 stops
                    // moving once ypos reaches the component's row count
   3172             if (++r->ystep >= r->vs) {
   3173                r->ystep = 0;
   3174                r->line0 = r->line1;
   3175                if (++r->ypos < z->img_comp[k].y)
   3176                   r->line1 += z->img_comp[k].w2;
   3177             }
   3178          }
                 // colour output (3 or 4 bytes per pixel)
   3179          if (n >= 3) {
   3180             stbi_uc *y = coutput[0];
   3181             if (z->s->img_n == 3) {
   3182                if (is_rgb) {
   3183                   for (i=0; i < z->s->img_x; ++i) {
   3184                      out[0] = y[i];
   3185                      out[1] = coutput[1][i];
   3186                      out[2] = coutput[2][i];
   3187                      out[3] = 255;
   3188                      out += n;
   3189                   }
   3190                } else {
   3191                   z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
   3192                }
   3193             } else if (z->s->img_n == 4) {
   3194                if (z->app14_color_transform == 0) { // CMYK
   3195                   for (i=0; i < z->s->img_x; ++i) {
   3196                      stbi_uc m = coutput[3][i];
   3197                      out[0] = stbi__blinn_8x8(coutput[0][i], m);
   3198                      out[1] = stbi__blinn_8x8(coutput[1][i], m);
   3199                      out[2] = stbi__blinn_8x8(coutput[2][i], m);
   3200                      out[3] = 255;
   3201                      out += n;
   3202                   }
   3203                } else if (z->app14_color_transform == 2) { // YCCK
                          // convert to RGB in place first, then invert and scale by the 4th channel
   3204                   z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
   3205                   for (i=0; i < z->s->img_x; ++i) {
   3206                      stbi_uc m = coutput[3][i];
   3207                      out[0] = stbi__blinn_8x8(255 - out[0], m);
   3208                      out[1] = stbi__blinn_8x8(255 - out[1], m);
   3209                      out[2] = stbi__blinn_8x8(255 - out[2], m);
   3210                      out += n;
   3211                   }
   3212                } else { // YCbCr + alpha?  Ignore the fourth channel for now
   3213                   z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
   3214                }
   3215             } else
                       // single-component source: replicate it into R, G and B
   3216                for (i=0; i < z->s->img_x; ++i) {
   3217                   out[0] = out[1] = out[2] = y[i];
   3218                   out[3] = 255; // not used if n==3
   3219                   out += n;
   3220                }
   3221          } else {
                    // grey output (1 or 2 bytes per pixel)
   3222             if (is_rgb) {
   3223                if (n == 1)
   3224                   for (i=0; i < z->s->img_x; ++i)
   3225                      *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
   3226                else {
   3227                   for (i=0; i < z->s->img_x; ++i, out += 2) {
   3228                      out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
   3229                      out[1] = 255;
   3230                   }
   3231                }
   3232             } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
                       // CMYK source: build r,g,b as in the colour path, then reduce to one value
   3233                for (i=0; i < z->s->img_x; ++i) {
   3234                   stbi_uc m = coutput[3][i];
   3235                   stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
   3236                   stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
   3237                   stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
   3238                   out[0] = stbi__compute_y(r, g, b);
   3239                   out[1] = 255; // when n==1 the next pixel's out[0] overwrites this byte
   3240                   out += n;
   3241                }
   3242             } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
                       // YCCK source: only component 0 and the 4th channel are used
   3243                for (i=0; i < z->s->img_x; ++i) {
   3244                   out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
   3245                   out[1] = 255;
   3246                   out += n;
   3247                }
   3248             } else {
   3249                stbi_uc *y = coutput[0];
   3250                if (n == 1)
   3251                   for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
   3252                else
   3253                   for (i=0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255;
   3254             }
   3255          }
   3256       }
   3257       stbi__cleanup_jpeg(z);
   3258       *out_x = z->s->img_x;
   3259       *out_y = z->s->img_y;
   3260       if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
   3261       return output;
   3262    }
   3263 }
   3264 
   3265 static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
   3266 {
   3267    unsigned char* result;
   3268    stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
   3269    STBI_NOTUSED(ri);
   3270    j->s = s;
   3271    stbi__setup_jpeg(j);
   3272    result = load_jpeg_image(j, x,y,comp,req_comp);
   3273    STBI_FREE(j);
   3274    return result;
   3275 }
   3276 
   3277 static int stbi__jpeg_test(stbi__context *s)
   3278 {
   3279    int r;
   3280    stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
   3281    j->s = s;
   3282    stbi__setup_jpeg(j);
   3283    r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
   3284    stbi__rewind(s);
   3285    STBI_FREE(j);
   3286    return r;
   3287 }
   3288 
   3289 static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
   3290 {
   3291    if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
   3292       stbi__rewind( j->s );
   3293       return 0;
   3294    }
   3295    if (x) *x = j->s->img_x;
   3296    if (y) *y = j->s->img_y;
   3297    if (comp) *comp = j->s->img_n >= 3 ? 3 : 1;
   3298    return 1;
   3299 }
   3300 
   3301 static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
   3302 {
   3303    int result;
   3304    stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
   3305    j->s = s;
   3306    result = stbi__jpeg_info_raw(j, x, y, comp);
   3307    STBI_FREE(j);
   3308    return result;
   3309 }
   3310 #endif
   3311 
   3312 // public domain zlib decode    v0.2  Sean Barrett 2006-11-18
   3313 //    simple implementation
   3314 //      - all input must be provided in an upfront buffer
   3315 //      - all output is written to a single output buffer (can malloc/realloc)
   3316 //    performance
   3317 //      - fast huffman
   3318 
   3319 #ifndef STBI_NO_ZLIB
   3320 
   3321 // fast-way is faster to check than jpeg huffman, but slow way is slower
   3322 #define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
   3323 #define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1)
   3324 
   3325 // zlib-style huffman encoding
   3326 // (jpegs packs from left, zlib from right, so can't share code)
   3327 typedef struct
   3328 {
   3329    stbi__uint16 fast[1 << STBI__ZFAST_BITS];
   3330    stbi__uint16 firstcode[16];
   3331    int maxcode[17];
   3332    stbi__uint16 firstsymbol[16];
   3333    stbi_uc  size[288];
   3334    stbi__uint16 value[288];
   3335 } stbi__zhuffman;
   3336 
   3337 stbi_inline static int stbi__bitreverse16(int n)
   3338 {
   3339   n = ((n & 0xAAAA) >>  1) | ((n & 0x5555) << 1);
   3340   n = ((n & 0xCCCC) >>  2) | ((n & 0x3333) << 2);
   3341   n = ((n & 0xF0F0) >>  4) | ((n & 0x0F0F) << 4);
   3342   n = ((n & 0xFF00) >>  8) | ((n & 0x00FF) << 8);
   3343   return n;
   3344 }
   3345 
   3346 stbi_inline static int stbi__bit_reverse(int v, int bits)
   3347 {
   3348    STBI_ASSERT(bits <= 16);
   3349    // to bit reverse n bits, reverse 16 and shift
   3350    // e.g. 11 bits, bit reverse and shift away 5
   3351    return stbi__bitreverse16(v) >> (16-bits);
   3352 }
   3353 
   3354 static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num)
   3355 {
   3356    int i,k=0;
   3357    int code, next_code[16], sizes[17];
   3358 
   3359    // DEFLATE spec for generating codes
   3360    memset(sizes, 0, sizeof(sizes));
   3361    memset(z->fast, 0, sizeof(z->fast));
   3362    for (i=0; i < num; ++i)
   3363       ++sizes[sizelist[i]];
   3364    sizes[0] = 0;
   3365    for (i=1; i < 16; ++i)
   3366       if (sizes[i] > (1 << i))
   3367          return stbi__err("bad sizes", "Corrupt PNG");
   3368    code = 0;
   3369    for (i=1; i < 16; ++i) {
   3370       next_code[i] = code;
   3371       z->firstcode[i] = (stbi__uint16) code;
   3372       z->firstsymbol[i] = (stbi__uint16) k;
   3373       code = (code + sizes[i]);
   3374       if (sizes[i])
   3375          if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG");
   3376       z->maxcode[i] = code << (16-i); // preshift for inner loop
   3377       code <<= 1;
   3378       k += sizes[i];
   3379    }
   3380    z->maxcode[16] = 0x10000; // sentinel
   3381    for (i=0; i < num; ++i) {
   3382       int s = sizelist[i];
   3383       if (s) {
   3384          int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
   3385          stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i);
   3386          z->size [c] = (stbi_uc     ) s;
   3387          z->value[c] = (stbi__uint16) i;
   3388          if (s <= STBI__ZFAST_BITS) {
   3389             int j = stbi__bit_reverse(next_code[s],s);
   3390             while (j < (1 << STBI__ZFAST_BITS)) {
   3391                z->fast[j] = fastv;
   3392                j += (1 << s);
   3393             }
   3394          }
   3395          ++next_code[s];
   3396       }
   3397    }
   3398    return 1;
   3399 }
   3400 
   3401 // zlib-from-memory implementation for PNG reading
   3402 //    because PNG allows splitting the zlib stream arbitrarily,
   3403 //    and it's annoying structurally to have PNG call ZLIB call PNG,
   3404 //    we require PNG read all the IDATs and combine them into a single
   3405 //    memory buffer
   3406 
   3407 typedef struct
   3408 {
   3409    stbi_uc *zbuffer, *zbuffer_end;
   3410    int num_bits;
   3411    stbi__uint32 code_buffer;
   3412 
   3413    char *zout;
   3414    char *zout_start;
   3415    char *zout_end;
   3416    int   z_expandable;
   3417 
   3418    stbi__zhuffman z_length, z_distance;
   3419 } stbi__zbuf;
   3420 
   3421 stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z)
   3422 {
   3423    if (z->zbuffer >= z->zbuffer_end) return 0;
   3424    return *z->zbuffer++;
   3425 }
   3426 
   3427 static void stbi__fill_bits(stbi__zbuf *z)
   3428 {
   3429    do {
   3430       STBI_ASSERT(z->code_buffer < (1U << z->num_bits));
   3431       z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits;
   3432       z->num_bits += 8;
   3433    } while (z->num_bits <= 24);
   3434 }
   3435 
   3436 stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n)
   3437 {
   3438    unsigned int k;
   3439    if (z->num_bits < n) stbi__fill_bits(z);
   3440    k = z->code_buffer & ((1 << n) - 1);
   3441    z->code_buffer >>= n;
   3442    z->num_bits -= n;
   3443    return k;
   3444 }
   3445 
   3446 static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
   3447 {
   3448    int b,s,k;
   3449    // not resolved by fast table, so compute it the slow way
   3450    // use jpeg approach, which requires MSbits at top
   3451    k = stbi__bit_reverse(a->code_buffer, 16);
   3452    for (s=STBI__ZFAST_BITS+1; ; ++s)
   3453       if (k < z->maxcode[s])
   3454          break;
   3455    if (s == 16) return -1; // invalid code!
   3456    // code size is s, so:
   3457    b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s];
   3458    STBI_ASSERT(z->size[b] == s);
   3459    a->code_buffer >>= s;
   3460    a->num_bits -= s;
   3461    return z->value[b];
   3462 }
   3463 
   3464 stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
   3465 {
   3466    int b,s;
   3467    if (a->num_bits < 16) stbi__fill_bits(a);
   3468    b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
   3469    if (b) {
   3470       s = b >> 9;
   3471       a->code_buffer >>= s;
   3472       a->num_bits -= s;
   3473       return b & 511;
   3474    }
   3475    return stbi__zhuffman_decode_slowpath(a, z);
   3476 }
   3477 
   3478 static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
   3479 {
   3480    char *q;
   3481    int cur, limit, old_limit;
   3482    z->zout = zout;
   3483    if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
   3484    cur   = (int) (z->zout     - z->zout_start);
   3485    limit = old_limit = (int) (z->zout_end - z->zout_start);
   3486    while (cur + n > limit)
   3487       limit *= 2;
   3488    q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
   3489    STBI_NOTUSED(old_limit);
   3490    if (q == NULL) return stbi__err("outofmem", "Out of memory");
   3491    z->zout_start = q;
   3492    z->zout       = q + cur;
   3493    z->zout_end   = q + limit;
   3494    return 1;
   3495 }
   3496 
   3497 static const int stbi__zlength_base[31] = {
   3498    3,4,5,6,7,8,9,10,11,13,
   3499    15,17,19,23,27,31,35,43,51,59,
   3500    67,83,99,115,131,163,195,227,258,0,0 };
   3501 
   3502 static const int stbi__zlength_extra[31]=
   3503 { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
   3504 
   3505 static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
   3506 257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
   3507 
   3508 static const int stbi__zdist_extra[32] =
   3509 { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
   3510 
   3511 static int stbi__parse_huffman_block(stbi__zbuf *a)
   3512 {
   3513    char *zout = a->zout;
   3514    for(;;) {
   3515       int z = stbi__zhuffman_decode(a, &a->z_length);
   3516       if (z < 256) {
   3517          if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes
   3518          if (zout >= a->zout_end) {
   3519             if (!stbi__zexpand(a, zout, 1)) return 0;
   3520             zout = a->zout;
   3521          }
   3522          *zout++ = (char) z;
   3523       } else {
   3524          stbi_uc *p;
   3525          int len,dist;
   3526          if (z == 256) {
   3527             a->zout = zout;
   3528             return 1;
   3529          }
   3530          z -= 257;
   3531          len = stbi__zlength_base[z];
   3532          if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
   3533          z = stbi__zhuffman_decode(a, &a->z_distance);
   3534          if (z < 0) return stbi__err("bad huffman code","Corrupt PNG");
   3535          dist = stbi__zdist_base[z];
   3536          if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
   3537          if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
   3538          if (zout + len > a->zout_end) {
   3539             if (!stbi__zexpand(a, zout, len)) return 0;
   3540             zout = a->zout;
   3541          }
   3542          p = (stbi_uc *) (zout - dist);
   3543          if (dist == 1) { // run of one byte; common in images.
   3544             stbi_uc v = *p;
   3545             if (len) { do *zout++ = v; while (--len); }
   3546          } else {
   3547             if (len) { do *zout++ = *p++; while (--len); }
   3548          }
   3549       }
   3550    }
   3551 }
   3552 
   3553 static int stbi__compute_huffman_codes(stbi__zbuf *a)
   3554 {
   3555    static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
   3556    stbi__zhuffman z_codelength;
   3557    stbi_uc lencodes[286+32+137];//padding for maximum single op
   3558    stbi_uc codelength_sizes[19];
   3559    int i,n;
   3560 
   3561    int hlit  = stbi__zreceive(a,5) + 257;
   3562    int hdist = stbi__zreceive(a,5) + 1;
   3563    int hclen = stbi__zreceive(a,4) + 4;
   3564    int ntot  = hlit + hdist;
   3565 
   3566    memset(codelength_sizes, 0, sizeof(codelength_sizes));
   3567    for (i=0; i < hclen; ++i) {
   3568       int s = stbi__zreceive(a,3);
   3569       codelength_sizes[length_dezigzag[i]] = (stbi_uc) s;
   3570    }
   3571    if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
   3572 
   3573    n = 0;
   3574    while (n < ntot) {
   3575       int c = stbi__zhuffman_decode(a, &z_codelength);
   3576       if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
   3577       if (c < 16)
   3578          lencodes[n++] = (stbi_uc) c;
   3579       else {
   3580          stbi_uc fill = 0;
   3581          if (c == 16) {
   3582             c = stbi__zreceive(a,2)+3;
   3583             if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
   3584             fill = lencodes[n-1];
   3585          } else if (c == 17)
   3586             c = stbi__zreceive(a,3)+3;
   3587          else {
   3588             STBI_ASSERT(c == 18);
   3589             c = stbi__zreceive(a,7)+11;
   3590          }
   3591          if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG");
   3592          memset(lencodes+n, fill, c);
   3593          n += c;
   3594       }
   3595    }
   3596    if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG");
   3597    if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
   3598    if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0;
   3599    return 1;
   3600 }
   3601 
   3602 static int stbi__parse_uncompressed_block(stbi__zbuf *a)
   3603 {
   3604    stbi_uc header[4];
   3605    int len,nlen,k;
   3606    if (a->num_bits & 7)
   3607       stbi__zreceive(a, a->num_bits & 7); // discard
   3608    // drain the bit-packed data into header
   3609    k = 0;
   3610    while (a->num_bits > 0) {
   3611       header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check
   3612       a->code_buffer >>= 8;
   3613       a->num_bits -= 8;
   3614    }
   3615    STBI_ASSERT(a->num_bits == 0);
   3616    // now fill header the normal way
   3617    while (k < 4)
   3618       header[k++] = stbi__zget8(a);
   3619    len  = header[1] * 256 + header[0];
   3620    nlen = header[3] * 256 + header[2];
   3621    if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG");
   3622    if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG");
   3623    if (a->zout + len > a->zout_end)
   3624       if (!stbi__zexpand(a, a->zout, len)) return 0;
   3625    memcpy(a->zout, a->zbuffer, len);
   3626    a->zbuffer += len;
   3627    a->zout += len;
   3628    return 1;
   3629 }
   3630 
   3631 static int stbi__parse_zlib_header(stbi__zbuf *a)
   3632 {
   3633    int cmf   = stbi__zget8(a);
   3634    int cm    = cmf & 15;
   3635    /* int cinfo = cmf >> 4; */
   3636    int flg   = stbi__zget8(a);
   3637    if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
   3638    if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png
   3639    if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png
   3640    // window = 1 << (8 + cinfo)... but who cares, we fully buffer output
   3641    return 1;
   3642 }
   3643 
   3644 static const stbi_uc stbi__zdefault_length[288] =
   3645 {
   3646    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
   3647    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
   3648    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
   3649    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
   3650    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   3651    9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   3652    9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   3653    9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   3654    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
   3655 };
   3656 static const stbi_uc stbi__zdefault_distance[32] =
   3657 {
   3658    5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
   3659 };
   3660 /*
   3661 Init algorithm:
   3662 {
   3663    int i;   // use <= to match clearly with spec
   3664    for (i=0; i <= 143; ++i)     stbi__zdefault_length[i]   = 8;
   3665    for (   ; i <= 255; ++i)     stbi__zdefault_length[i]   = 9;
   3666    for (   ; i <= 279; ++i)     stbi__zdefault_length[i]   = 7;
   3667    for (   ; i <= 287; ++i)     stbi__zdefault_length[i]   = 8;
   3668 
   3669    for (i=0; i <=  31; ++i)     stbi__zdefault_distance[i] = 5;
   3670 }
   3671 */
   3672 
   3673 static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
   3674 {
   3675    int final, type;
   3676    if (parse_header)
   3677       if (!stbi__parse_zlib_header(a)) return 0;
   3678    a->num_bits = 0;
   3679    a->code_buffer = 0;
   3680    do {
   3681       final = stbi__zreceive(a,1);
   3682       type = stbi__zreceive(a,2);
   3683       if (type == 0) {
   3684          if (!stbi__parse_uncompressed_block(a)) return 0;
   3685       } else if (type == 3) {
   3686          return 0;
   3687       } else {
   3688          if (type == 1) {
   3689             // use fixed code lengths
   3690             if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , 288)) return 0;
   3691             if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
   3692          } else {
   3693             if (!stbi__compute_huffman_codes(a)) return 0;
   3694          }
   3695          if (!stbi__parse_huffman_block(a)) return 0;
   3696       }
   3697    } while (!final);
   3698    return 1;
   3699 }
   3700 
   3701 static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header)
   3702 {
   3703    a->zout_start = obuf;
   3704    a->zout       = obuf;
   3705    a->zout_end   = obuf + olen;
   3706    a->z_expandable = exp;
   3707 
   3708    return stbi__parse_zlib(a, parse_header);
   3709 }
   3710 
   3711 STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen)
   3712 {
   3713    stbi__zbuf a;
   3714    char *p = (char *) stbi__malloc(initial_size);
   3715    if (p == NULL) return NULL;
   3716    a.zbuffer = (stbi_uc *) buffer;
   3717    a.zbuffer_end = (stbi_uc *) buffer + len;
   3718    if (stbi__do_zlib(&a, p, initial_size, 1, 1)) {
   3719       if (outlen) *outlen = (int) (a.zout - a.zout_start);
   3720       return a.zout_start;
   3721    } else {
   3722       STBI_FREE(a.zout_start);
   3723       return NULL;
   3724    }
   3725 }
   3726 
   3727 STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen)
   3728 {
   3729    return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
   3730 }
   3731 
   3732 STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header)
   3733 {
   3734    stbi__zbuf a;
   3735    char *p = (char *) stbi__malloc(initial_size);
   3736    if (p == NULL) return NULL;
   3737    a.zbuffer = (stbi_uc *) buffer;
   3738    a.zbuffer_end = (stbi_uc *) buffer + len;
   3739    if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) {
   3740       if (outlen) *outlen = (int) (a.zout - a.zout_start);
   3741       return a.zout_start;
   3742    } else {
   3743       STBI_FREE(a.zout_start);
   3744       return NULL;
   3745    }
   3746 }
   3747 
   3748 STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen)
   3749 {
   3750    stbi__zbuf a;
   3751    a.zbuffer = (stbi_uc *) ibuffer;
   3752    a.zbuffer_end = (stbi_uc *) ibuffer + ilen;
   3753    if (stbi__do_zlib(&a, obuffer, olen, 0, 1))
   3754       return (int) (a.zout - a.zout_start);
   3755    else
   3756       return -1;
   3757 }
   3758 
   3759 STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen)
   3760 {
   3761    stbi__zbuf a;
   3762    char *p = (char *) stbi__malloc(16384);
   3763    if (p == NULL) return NULL;
   3764    a.zbuffer = (stbi_uc *) buffer;
   3765    a.zbuffer_end = (stbi_uc *) buffer+len;
   3766    if (stbi__do_zlib(&a, p, 16384, 1, 0)) {
   3767       if (outlen) *outlen = (int) (a.zout - a.zout_start);
   3768       return a.zout_start;
   3769    } else {
   3770       STBI_FREE(a.zout_start);
   3771       return NULL;
   3772    }
   3773 }
   3774 
   3775 STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen)
   3776 {
   3777    stbi__zbuf a;
   3778    a.zbuffer = (stbi_uc *) ibuffer;
   3779    a.zbuffer_end = (stbi_uc *) ibuffer + ilen;
   3780    if (stbi__do_zlib(&a, obuffer, olen, 0, 0))
   3781       return (int) (a.zout - a.zout_start);
   3782    else
   3783       return -1;
   3784 }
   3785 #endif
   3786 
   3787 // public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
   3788 //    simple implementation
   3789 //      - only 8-bit samples
   3790 //      - no CRC checking
   3791 //      - allocates lots of intermediate memory
   3792 //        - avoids problem of streaming data between subsystems
   3793 //        - avoids explicit window management
   3794 //    performance
   3795 //      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
   3796 
   3797 #ifndef STBI_NO_PNG
// Header of one PNG chunk, as read by stbi__get_chunk_header():
// two consecutive big-endian 32-bit words from the stream.
typedef struct
{
   stbi__uint32 length; // first word read: chunk data length
   stbi__uint32 type;   // second word read: chunk type (compared against STBI__PNG_TYPE codes)
} stbi__pngchunk;
   3803 
   3804 static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
   3805 {
   3806    stbi__pngchunk c;
   3807    c.length = stbi__get32be(s);
   3808    c.type   = stbi__get32be(s);
   3809    return c;
   3810 }
   3811 
   3812 static int stbi__check_png_header(stbi__context *s)
   3813 {
   3814    static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
   3815    int i;
   3816    for (i=0; i < 8; ++i)
   3817       if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG");
   3818    return 1;
   3819 }
   3820 
   3821 typedef struct
   3822 {
   3823    stbi__context *s;
   3824    stbi_uc *idata, *expanded, *out;
   3825    int depth;
   3826 } stbi__png;
   3827 
   3828 
   3829 enum {
   3830    STBI__F_none=0,
   3831    STBI__F_sub=1,
   3832    STBI__F_up=2,
   3833    STBI__F_avg=3,
   3834    STBI__F_paeth=4,
   3835    // synthetic filters used for first scanline to avoid needing a dummy row of 0s
   3836    STBI__F_avg_first,
   3837    STBI__F_paeth_first
   3838 };
   3839 
// Maps a file filter code (0..4) to the filter actually applied on the first
// scanline, where there is no previous row to sample:
// "up" becomes "none", and avg/paeth use their *_first variants.
static stbi_uc first_row_filter[5] =
{
   STBI__F_none,        // none  -> none
   STBI__F_sub,         // sub   -> sub (only looks left, unaffected)
   STBI__F_none,        // up    -> none
   STBI__F_avg_first,   // avg   -> avg without the prior-row term
   STBI__F_paeth_first  // paeth -> paeth without the prior-row terms
};
   3848 
   3849 static int stbi__paeth(int a, int b, int c)
   3850 {
   3851    int p = a + b - c;
   3852    int pa = abs(p-a);
   3853    int pb = abs(p-b);
   3854    int pc = abs(p-c);
   3855    if (pa <= pb && pa <= pc) return a;
   3856    if (pb <= pc) return b;
   3857    return c;
   3858 }
   3859 
// Indexed by bit depth: multiplier that stretches a depth-bit sample to the
// 0..255 range (1-bit -> 0xff, 2-bit -> 0x55, 4-bit -> 0x11, 8-bit -> 0x01).
// Entries for other indices are 0.
static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
   3861 
   3862 // create the png data from post-deflated data
   3863 static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
   3864 {
   3865    int bytes = (depth == 16? 2 : 1);
   3866    stbi__context *s = a->s;
   3867    stbi__uint32 i,j,stride = x*out_n*bytes;
   3868    stbi__uint32 img_len, img_width_bytes;
   3869    int k;
   3870    int img_n = s->img_n; // copy it into a local for later
   3871 
   3872    int output_bytes = out_n*bytes;
   3873    int filter_bytes = img_n*bytes;
   3874    int width = x;
   3875 
   3876    STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
   3877    a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
   3878    if (!a->out) return stbi__err("outofmem", "Out of memory");
   3879 
   3880    if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG");
   3881    img_width_bytes = (((img_n * x * depth) + 7) >> 3);
   3882    img_len = (img_width_bytes + 1) * y;
   3883 
   3884    // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
   3885    // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
   3886    // so just check for raw_len < img_len always.
   3887    if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
   3888 
   3889    for (j=0; j < y; ++j) {
   3890       stbi_uc *cur = a->out + stride*j;
   3891       stbi_uc *prior;
   3892       int filter = *raw++;
   3893 
   3894       if (filter > 4)
   3895          return stbi__err("invalid filter","Corrupt PNG");
   3896 
   3897       if (depth < 8) {
   3898          STBI_ASSERT(img_width_bytes <= x);
   3899          cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
   3900          filter_bytes = 1;
   3901          width = img_width_bytes;
   3902       }
   3903       prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
   3904 
   3905       // if first row, use special filter that doesn't sample previous row
   3906       if (j == 0) filter = first_row_filter[filter];
   3907 
   3908       // handle first byte explicitly
   3909       for (k=0; k < filter_bytes; ++k) {
   3910          switch (filter) {
   3911             case STBI__F_none       : cur[k] = raw[k]; break;
   3912             case STBI__F_sub        : cur[k] = raw[k]; break;
   3913             case STBI__F_up         : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
   3914             case STBI__F_avg        : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break;
   3915             case STBI__F_paeth      : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break;
   3916             case STBI__F_avg_first  : cur[k] = raw[k]; break;
   3917             case STBI__F_paeth_first: cur[k] = raw[k]; break;
   3918          }
   3919       }
   3920 
   3921       if (depth == 8) {
   3922          if (img_n != out_n)
   3923             cur[img_n] = 255; // first pixel
   3924          raw += img_n;
   3925          cur += out_n;
   3926          prior += out_n;
   3927       } else if (depth == 16) {
   3928          if (img_n != out_n) {
   3929             cur[filter_bytes]   = 255; // first pixel top byte
   3930             cur[filter_bytes+1] = 255; // first pixel bottom byte
   3931          }
   3932          raw += filter_bytes;
   3933          cur += output_bytes;
   3934          prior += output_bytes;
   3935       } else {
   3936          raw += 1;
   3937          cur += 1;
   3938          prior += 1;
   3939       }
   3940 
   3941       // this is a little gross, so that we don't switch per-pixel or per-component
   3942       if (depth < 8 || img_n == out_n) {
   3943          int nk = (width - 1)*filter_bytes;
   3944          #define STBI__CASE(f) \
   3945              case f:     \
   3946                 for (k=0; k < nk; ++k)
   3947          switch (filter) {
   3948             // "none" filter turns into a memcpy here; make that explicit.
   3949             case STBI__F_none:         memcpy(cur, raw, nk); break;
   3950             STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break;
   3951             STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
   3952             STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break;
   3953             STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break;
   3954             STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break;
   3955             STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break;
   3956          }
   3957          #undef STBI__CASE
   3958          raw += nk;
   3959       } else {
   3960          STBI_ASSERT(img_n+1 == out_n);
   3961          #define STBI__CASE(f) \
   3962              case f:     \
   3963                 for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
   3964                    for (k=0; k < filter_bytes; ++k)
   3965          switch (filter) {
   3966             STBI__CASE(STBI__F_none)         { cur[k] = raw[k]; } break;
   3967             STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break;
   3968             STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
   3969             STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break;
   3970             STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break;
   3971             STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break;
   3972             STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break;
   3973          }
   3974          #undef STBI__CASE
   3975 
   3976          // the loop above sets the high byte of the pixels' alpha, but for
   3977          // 16 bit png files we also need the low byte set. we'll do that here.
   3978          if (depth == 16) {
   3979             cur = a->out + stride*j; // start at the beginning of the row again
   3980             for (i=0; i < x; ++i,cur+=output_bytes) {
   3981                cur[filter_bytes+1] = 255;
   3982             }
   3983          }
   3984       }
   3985    }
   3986 
   3987    // we make a separate pass to expand bits to pixels; for performance,
   3988    // this could run two scanlines behind the above code, so it won't
   3989    // intefere with filtering but will still be in the cache.
   3990    if (depth < 8) {
   3991       for (j=0; j < y; ++j) {
   3992          stbi_uc *cur = a->out + stride*j;
   3993          stbi_uc *in  = a->out + stride*j + x*out_n - img_width_bytes;
   3994          // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
   3995          // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
   3996          stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
   3997 
   3998          // note that the final byte might overshoot and write more data than desired.
   3999          // we can allocate enough data that this never writes out of memory, but it
   4000          // could also overwrite the next scanline. can it overwrite non-empty data
   4001          // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
   4002          // so we need to explicitly clamp the final ones
   4003 
   4004          if (depth == 4) {
   4005             for (k=x*img_n; k >= 2; k-=2, ++in) {
   4006                *cur++ = scale * ((*in >> 4)       );
   4007                *cur++ = scale * ((*in     ) & 0x0f);
   4008             }
   4009             if (k > 0) *cur++ = scale * ((*in >> 4)       );
   4010          } else if (depth == 2) {
   4011             for (k=x*img_n; k >= 4; k-=4, ++in) {
   4012                *cur++ = scale * ((*in >> 6)       );
   4013                *cur++ = scale * ((*in >> 4) & 0x03);
   4014                *cur++ = scale * ((*in >> 2) & 0x03);
   4015                *cur++ = scale * ((*in     ) & 0x03);
   4016             }
   4017             if (k > 0) *cur++ = scale * ((*in >> 6)       );
   4018             if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03);
   4019             if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03);
   4020          } else if (depth == 1) {
   4021             for (k=x*img_n; k >= 8; k-=8, ++in) {
   4022                *cur++ = scale * ((*in >> 7)       );
   4023                *cur++ = scale * ((*in >> 6) & 0x01);
   4024                *cur++ = scale * ((*in >> 5) & 0x01);
   4025                *cur++ = scale * ((*in >> 4) & 0x01);
   4026                *cur++ = scale * ((*in >> 3) & 0x01);
   4027                *cur++ = scale * ((*in >> 2) & 0x01);
   4028                *cur++ = scale * ((*in >> 1) & 0x01);
   4029                *cur++ = scale * ((*in     ) & 0x01);
   4030             }
   4031             if (k > 0) *cur++ = scale * ((*in >> 7)       );
   4032             if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
   4033             if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
   4034             if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
   4035             if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
   4036             if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
   4037             if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
   4038          }
   4039          if (img_n != out_n) {
   4040             int q;
   4041             // insert alpha = 255
   4042             cur = a->out + stride*j;
   4043             if (img_n == 1) {
   4044                for (q=x-1; q >= 0; --q) {
   4045                   cur[q*2+1] = 255;
   4046                   cur[q*2+0] = cur[q];
   4047                }
   4048             } else {
   4049                STBI_ASSERT(img_n == 3);
   4050                for (q=x-1; q >= 0; --q) {
   4051                   cur[q*4+3] = 255;
   4052                   cur[q*4+2] = cur[q*3+2];
   4053                   cur[q*4+1] = cur[q*3+1];
   4054                   cur[q*4+0] = cur[q*3+0];
   4055                }
   4056             }
   4057          }
   4058       }
   4059    } else if (depth == 16) {
   4060       // force the image data from big-endian to platform-native.
   4061       // this is done in a separate pass due to the decoding relying
   4062       // on the data being untouched, but could probably be done
   4063       // per-line during decode if care is taken.
   4064       stbi_uc *cur = a->out;
   4065       stbi__uint16 *cur16 = (stbi__uint16*)cur;
   4066 
   4067       for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) {
   4068          *cur16 = (cur[0] << 8) | cur[1];
   4069       }
   4070    }
   4071 
   4072    return 1;
   4073 }
   4074 
   4075 static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
   4076 {
   4077    int bytes = (depth == 16 ? 2 : 1);
   4078    int out_bytes = out_n * bytes;
   4079    stbi_uc *final;
   4080    int p;
   4081    if (!interlaced)
   4082       return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
   4083 
   4084    // de-interlacing
   4085    final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
   4086    for (p=0; p < 7; ++p) {
   4087       int xorig[] = { 0,4,0,2,0,1,0 };
   4088       int yorig[] = { 0,0,4,0,2,0,1 };
   4089       int xspc[]  = { 8,8,4,4,2,2,1 };
   4090       int yspc[]  = { 8,8,8,4,4,2,2 };
   4091       int i,j,x,y;
   4092       // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
   4093       x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p];
   4094       y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p];
   4095       if (x && y) {
   4096          stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
   4097          if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
   4098             STBI_FREE(final);
   4099             return 0;
   4100          }
   4101          for (j=0; j < y; ++j) {
   4102             for (i=0; i < x; ++i) {
   4103                int out_y = j*yspc[p]+yorig[p];
   4104                int out_x = i*xspc[p]+xorig[p];
   4105                memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
   4106                       a->out + (j*x+i)*out_bytes, out_bytes);
   4107             }
   4108          }
   4109          STBI_FREE(a->out);
   4110          image_data += img_len;
   4111          image_data_len -= img_len;
   4112       }
   4113    }
   4114    a->out = final;
   4115 
   4116    return 1;
   4117 }
   4118 
   4119 static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
   4120 {
   4121    stbi__context *s = z->s;
   4122    stbi__uint32 i, pixel_count = s->img_x * s->img_y;
   4123    stbi_uc *p = z->out;
   4124 
   4125    // compute color-based transparency, assuming we've
   4126    // already got 255 as the alpha value in the output
   4127    STBI_ASSERT(out_n == 2 || out_n == 4);
   4128 
   4129    if (out_n == 2) {
   4130       for (i=0; i < pixel_count; ++i) {
   4131          p[1] = (p[0] == tc[0] ? 0 : 255);
   4132          p += 2;
   4133       }
   4134    } else {
   4135       for (i=0; i < pixel_count; ++i) {
   4136          if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
   4137             p[3] = 0;
   4138          p += 4;
   4139       }
   4140    }
   4141    return 1;
   4142 }
   4143 
   4144 static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
   4145 {
   4146    stbi__context *s = z->s;
   4147    stbi__uint32 i, pixel_count = s->img_x * s->img_y;
   4148    stbi__uint16 *p = (stbi__uint16*) z->out;
   4149 
   4150    // compute color-based transparency, assuming we've
   4151    // already got 65535 as the alpha value in the output
   4152    STBI_ASSERT(out_n == 2 || out_n == 4);
   4153 
   4154    if (out_n == 2) {
   4155       for (i = 0; i < pixel_count; ++i) {
   4156          p[1] = (p[0] == tc[0] ? 0 : 65535);
   4157          p += 2;
   4158       }
   4159    } else {
   4160       for (i = 0; i < pixel_count; ++i) {
   4161          if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
   4162             p[3] = 0;
   4163          p += 4;
   4164       }
   4165    }
   4166    return 1;
   4167 }
   4168 
   4169 static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
   4170 {
   4171    stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
   4172    stbi_uc *p, *temp_out, *orig = a->out;
   4173 
   4174    p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0);
   4175    if (p == NULL) return stbi__err("outofmem", "Out of memory");
   4176 
   4177    // between here and free(out) below, exitting would leak
   4178    temp_out = p;
   4179 
   4180    if (pal_img_n == 3) {
   4181       for (i=0; i < pixel_count; ++i) {
   4182          int n = orig[i]*4;
   4183          p[0] = palette[n  ];
   4184          p[1] = palette[n+1];
   4185          p[2] = palette[n+2];
   4186          p += 3;
   4187       }
   4188    } else {
   4189       for (i=0; i < pixel_count; ++i) {
   4190          int n = orig[i]*4;
   4191          p[0] = palette[n  ];
   4192          p[1] = palette[n+1];
   4193          p[2] = palette[n+2];
   4194          p[3] = palette[n+3];
   4195          p += 4;
   4196       }
   4197    }
   4198    STBI_FREE(a->out);
   4199    a->out = temp_out;
   4200 
   4201    STBI_NOTUSED(len);
   4202 
   4203    return 1;
   4204 }
   4205 
// File-wide (not per-image) switches; both default to off.
// stbi__unpremultiply_on_load is read by stbi__de_iphone() to decide whether
// to divide the color channels by alpha while swapping channel order.
static int stbi__unpremultiply_on_load = 0;
// stbi__de_iphone_flag gates whether stbi__de_iphone() is run at all for
// files that carried a CgBI chunk.
static int stbi__de_iphone_flag = 0;

// Set the unpremultiply switch; the value is stored as given.
STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
{
   stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply;
}

// Set the iPhone-PNG conversion switch; the value is stored as given.
STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
{
   stbi__de_iphone_flag = flag_true_if_should_convert;
}
   4218 
   4219 static void stbi__de_iphone(stbi__png *z)
   4220 {
   4221    stbi__context *s = z->s;
   4222    stbi__uint32 i, pixel_count = s->img_x * s->img_y;
   4223    stbi_uc *p = z->out;
   4224 
   4225    if (s->img_out_n == 3) {  // convert bgr to rgb
   4226       for (i=0; i < pixel_count; ++i) {
   4227          stbi_uc t = p[0];
   4228          p[0] = p[2];
   4229          p[2] = t;
   4230          p += 3;
   4231       }
   4232    } else {
   4233       STBI_ASSERT(s->img_out_n == 4);
   4234       if (stbi__unpremultiply_on_load) {
   4235          // convert bgr to rgb and unpremultiply
   4236          for (i=0; i < pixel_count; ++i) {
   4237             stbi_uc a = p[3];
   4238             stbi_uc t = p[0];
   4239             if (a) {
   4240                stbi_uc half = a / 2;
   4241                p[0] = (p[2] * 255 + half) / a;
   4242                p[1] = (p[1] * 255 + half) / a;
   4243                p[2] = ( t   * 255 + half) / a;
   4244             } else {
   4245                p[0] = p[2];
   4246                p[2] = t;
   4247             }
   4248             p += 4;
   4249          }
   4250       } else {
   4251          // convert bgr to rgb
   4252          for (i=0; i < pixel_count; ++i) {
   4253             stbi_uc t = p[0];
   4254             p[0] = p[2];
   4255             p[2] = t;
   4256             p += 4;
   4257          }
   4258       }
   4259    }
   4260 }
   4261 
// Pack four characters into one unsigned value, first character in the top
// byte, so a chunk type read as a big-endian 32-bit word can be compared
// against it in a switch.
#define STBI__PNG_TYPE(a,b,c,d)  (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d))
   4263 
// Walk the chunks of a PNG stream and, depending on `scan`, stop early or
// decode the whole image.
//   scan == STBI__SCAN_type   : only the 8-byte signature is checked.
//   scan == STBI__SCAN_header : returns once size and component count are known
//                               (at IHDR for non-paletted images; at tRNS or
//                               the first IDAT for paletted ones).
//   scan == STBI__SCAN_load   : collects all IDAT data and decodes it into
//                               z->out when IEND is reached.
//   req_comp : requested channel count; only consulted at IEND to choose
//              s->img_out_n.
// Returns 1 on success; failure paths return 0 or the result of stbi__err().
// On failure z->idata / z->expanded / z->out may still hold allocations;
// they are not released here.
static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
{
   // palette holds up to 256 RGBA entries; pal_img_n is 0 for non-paletted
   // images, otherwise 3 (or 4 once a tRNS chunk supplies alpha)
   stbi_uc palette[1024], pal_img_n=0;
   // has_trans: a color-key tRNS was seen; tc / tc16 hold the key (8-bit / 16-bit)
   stbi_uc has_trans=0, tc[3];
   stbi__uint16 tc16[3];
   // ioff: bytes of IDAT data stored so far; idata_limit: capacity of z->idata
   stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;
   int first=1,k,interlace=0, color=0, is_iphone=0;
   stbi__context *s = z->s;

   z->expanded = NULL;
   z->idata = NULL;
   z->out = NULL;

   if (!stbi__check_png_header(s)) return 0;

   if (scan == STBI__SCAN_type) return 1;

   for (;;) {
      stbi__pngchunk c = stbi__get_chunk_header(s);
      switch (c.type) {
         case STBI__PNG_TYPE('C','g','B','I'):
            // remember the marker; the chunk payload itself is skipped
            is_iphone = 1;
            stbi__skip(s, c.length);
            break;
         case STBI__PNG_TYPE('I','H','D','R'): {
            int comp,filter;
            if (!first) return stbi__err("multiple IHDR","Corrupt PNG");
            first = 0;
            if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
            s->img_x = stbi__get32be(s); if (s->img_x > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
            s->img_y = stbi__get32be(s); if (s->img_y > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
            z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
            color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
            if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
            // color 3 = paletted; any other odd color value is rejected
            if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
            comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
            filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
            interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG");
            if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG");
            if (!pal_img_n) {
               // color bit 2 -> 3 color channels instead of 1; bit 4 -> plus alpha
               s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
               // cap x*y*img_n at 1<<30, written as divisions so it cannot overflow
               if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
               if (scan == STBI__SCAN_header) return 1;
            } else {
               // if paletted, then pal_n is our final components, and
               // img_n is # components to decompress/filter.
               s->img_n = 1;
               if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
               // if SCAN_header, have to scan to see if we have a tRNS
            }
            break;
         }

         case STBI__PNG_TYPE('P','L','T','E'):  {
            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
            if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG");
            pal_len = c.length / 3;
            if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG");
            // stored as RGBA with alpha defaulting to opaque; tRNS may overwrite the alpha
            for (i=0; i < pal_len; ++i) {
               palette[i*4+0] = stbi__get8(s);
               palette[i*4+1] = stbi__get8(s);
               palette[i*4+2] = stbi__get8(s);
               palette[i*4+3] = 255;
            }
            break;
         }

         case STBI__PNG_TYPE('t','R','N','S'): {
            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
            if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG");
            if (pal_img_n) {
               // paletted: one alpha byte per palette entry
               if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; }
               if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG");
               if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG");
               pal_img_n = 4;
               for (i=0; i < c.length; ++i)
                  palette[i*4+3] = stbi__get8(s);
            } else {
               // non-paletted: a single color key, one 16-bit value per channel;
               // only allowed when img_n is odd (no alpha channel already)
               if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
               if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
               has_trans = 1;
               if (z->depth == 16) {
                  for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
               } else {
                  for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
               }
            }
            break;
         }

         case STBI__PNG_TYPE('I','D','A','T'): {
            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
            if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
            if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
            // reject totals that would go negative when viewed as int
            if ((int)(ioff + c.length) < (int)ioff) return 0;
            if (ioff + c.length > idata_limit) {
               // grow z->idata: start at max(chunk length, 4096), then double until it fits
               stbi__uint32 idata_limit_old = idata_limit;
               stbi_uc *p;
               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
               while (ioff + c.length > idata_limit)
                  idata_limit *= 2;
               STBI_NOTUSED(idata_limit_old);
               // on failure z->idata keeps pointing at the old (still valid) buffer
               p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
               z->idata = p;
            }
            if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
            ioff += c.length;
            break;
         }

         case STBI__PNG_TYPE('I','E','N','D'): {
            stbi__uint32 raw_len, bpl;
            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
            if (scan != STBI__SCAN_load) return 1;
            if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
            // initial guess for decoded data size to avoid unnecessary reallocs
            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
            raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
            // raw_len is both the size guess passed in and, afterwards, the actual decoded length;
            // header parsing is turned off when a CgBI chunk was seen
            z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone);
            if (z->expanded == NULL) return 0; // zlib should set error
            STBI_FREE(z->idata); z->idata = NULL;
            // add an alpha channel during decode if the caller asked for exactly one more
            // channel (non-paletted only), or if a color key has to be applied
            if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
               s->img_out_n = s->img_n+1;
            else
               s->img_out_n = s->img_n;
            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
            if (has_trans) {
               if (z->depth == 16) {
                  if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0;
               } else {
                  if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
               }
            }
            if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
               stbi__de_iphone(z);
            if (pal_img_n) {
               // pal_img_n == 3 or 4
               s->img_n = pal_img_n; // record the actual colors we had
               s->img_out_n = pal_img_n;
               if (req_comp >= 3) s->img_out_n = req_comp;
               if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
                  return 0;
            } else if (has_trans) {
               // non-paletted image with tRNS -> source image has (constant) alpha
               ++s->img_n;
            }
            STBI_FREE(z->expanded); z->expanded = NULL;
            return 1;
         }

         default:
            // if critical, fail
            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
            // bit 29 clear = bit 5 of the first type character clear (uppercase letter)
            if ((c.type & (1 << 29)) == 0) {
               #ifndef STBI_NO_FAILURE_STRINGS
               // not threadsafe
               static char invalid_chunk[] = "XXXX PNG chunk not known";
               invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
               invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
               invalid_chunk[2] = STBI__BYTECAST(c.type >>  8);
               invalid_chunk[3] = STBI__BYTECAST(c.type >>  0);
               #endif
               return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
            }
            stbi__skip(s, c.length);
            break;
      }
      // end of PNG chunk, read and skip CRC
      stbi__get32be(s);
   }
}
   4435 
   4436 static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri)
   4437 {
   4438    void *result=NULL;
   4439    if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
   4440    if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
   4441       if (p->depth < 8)
   4442          ri->bits_per_channel = 8;
   4443       else
   4444          ri->bits_per_channel = p->depth;
   4445       result = p->out;
   4446       p->out = NULL;
   4447       if (req_comp && req_comp != p->s->img_out_n) {
   4448          if (ri->bits_per_channel == 8)
   4449             result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
   4450          else
   4451             result = stbi__convert_format16((stbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
   4452          p->s->img_out_n = req_comp;
   4453          if (result == NULL) return result;
   4454       }
   4455       *x = p->s->img_x;
   4456       *y = p->s->img_y;
   4457       if (n) *n = p->s->img_n;
   4458    }
   4459    STBI_FREE(p->out);      p->out      = NULL;
   4460    STBI_FREE(p->expanded); p->expanded = NULL;
   4461    STBI_FREE(p->idata);    p->idata    = NULL;
   4462 
   4463    return result;
   4464 }
   4465 
   4466 static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
   4467 {
   4468    stbi__png p;
   4469    p.s = s;
   4470    return stbi__do_png(&p, x,y,comp,req_comp, ri);
   4471 }
   4472 
   4473 static int stbi__png_test(stbi__context *s)
   4474 {
   4475    int r;
   4476    r = stbi__check_png_header(s);
   4477    stbi__rewind(s);
   4478    return r;
   4479 }
   4480 
   4481 static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
   4482 {
   4483    if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
   4484       stbi__rewind( p->s );
   4485       return 0;
   4486    }
   4487    if (x) *x = p->s->img_x;
   4488    if (y) *y = p->s->img_y;
   4489    if (comp) *comp = p->s->img_n;
   4490    return 1;
   4491 }
   4492 
   4493 static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
   4494 {
   4495    stbi__png p;
   4496    p.s = s;
   4497    return stbi__png_info_raw(&p, x, y, comp);
   4498 }
   4499 
   4500 static int stbi__png_is16(stbi__context *s)
   4501 {
   4502    stbi__png p;
   4503    p.s = s;
   4504    if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
   4505 	   return 0;
   4506    if (p.depth != 16) {
   4507       stbi__rewind(p.s);
   4508       return 0;
   4509    }
   4510    return 1;
   4511 }
   4512 #endif
   4513 
   4514 // Microsoft/Windows BMP image
   4515 
   4516 #ifndef STBI_NO_BMP
   4517 static int stbi__bmp_test_raw(stbi__context *s)
   4518 {
   4519    int r;
   4520    int sz;
   4521    if (stbi__get8(s) != 'B') return 0;
   4522    if (stbi__get8(s) != 'M') return 0;
   4523    stbi__get32le(s); // discard filesize
   4524    stbi__get16le(s); // discard reserved
   4525    stbi__get16le(s); // discard reserved
   4526    stbi__get32le(s); // discard data offset
   4527    sz = stbi__get32le(s);
   4528    r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
   4529    return r;
   4530 }
   4531 
   4532 static int stbi__bmp_test(stbi__context *s)
   4533 {
   4534    int r = stbi__bmp_test_raw(s);
   4535    stbi__rewind(s);
   4536    return r;
   4537 }
   4538 
   4539 
   4540 // returns 0..31 for the highest set bit
   4541 static int stbi__high_bit(unsigned int z)
   4542 {
   4543    int n=0;
   4544    if (z == 0) return -1;
   4545    if (z >= 0x10000) n += 16, z >>= 16;
   4546    if (z >= 0x00100) n +=  8, z >>=  8;
   4547    if (z >= 0x00010) n +=  4, z >>=  4;
   4548    if (z >= 0x00004) n +=  2, z >>=  2;
   4549    if (z >= 0x00002) n +=  1, z >>=  1;
   4550    return n;
   4551 }
   4552 
   4553 static int stbi__bitcount(unsigned int a)
   4554 {
   4555    a = (a & 0x55555555) + ((a >>  1) & 0x55555555); // max 2
   4556    a = (a & 0x33333333) + ((a >>  2) & 0x33333333); // max 4
   4557    a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits
   4558    a = (a + (a >> 8)); // max 16 per 8 bits
   4559    a = (a + (a >> 16)); // max 32 per 8 bits
   4560    return a & 0xff;
   4561 }
   4562 
   4563 // extract an arbitrarily-aligned N-bit value (N=bits)
   4564 // from v, and then make it 8-bits long and fractionally
   4565 // extend it to full full range.
   4566 static int stbi__shiftsigned(int v, int shift, int bits)
   4567 {
   4568    static unsigned int mul_table[9] = {
   4569       0,
   4570       0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/,
   4571       0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/,
   4572    };
   4573    static unsigned int shift_table[9] = {
   4574       0, 0,0,1,0,2,4,6,0,
   4575    };
   4576    if (shift < 0)
   4577       v <<= -shift;
   4578    else
   4579       v >>= shift;
   4580    STBI_ASSERT(v >= 0 && v < 256);
   4581    v >>= (8-bits);
   4582    STBI_ASSERT(bits >= 0 && bits <= 8);
   4583    return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits];
   4584 }
   4585 
// Header values shared between stbi__bmp_parse_header (which fills them in)
// and stbi__bmp_load (which consumes them).
typedef struct
{
   // bits per pixel, the file header's data-offset field, and the info-header size
   int bpp, offset, hsz;
   // red/green/blue/alpha channel bit masks (0 when not set by the header);
   // all_a is seeded by the loader and OR-accumulates decoded alpha values
   unsigned int mr,mg,mb,ma, all_a;
} stbi__bmp_data;
   4591 
   4592 static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
   4593 {
   4594    int hsz;
   4595    if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP");
   4596    stbi__get32le(s); // discard filesize
   4597    stbi__get16le(s); // discard reserved
   4598    stbi__get16le(s); // discard reserved
   4599    info->offset = stbi__get32le(s);
   4600    info->hsz = hsz = stbi__get32le(s);
   4601    info->mr = info->mg = info->mb = info->ma = 0;
   4602 
   4603    if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
   4604    if (hsz == 12) {
   4605       s->img_x = stbi__get16le(s);
   4606       s->img_y = stbi__get16le(s);
   4607    } else {
   4608       s->img_x = stbi__get32le(s);
   4609       s->img_y = stbi__get32le(s);
   4610    }
   4611    if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP");
   4612    info->bpp = stbi__get16le(s);
   4613    if (hsz != 12) {
   4614       int compress = stbi__get32le(s);
   4615       if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
   4616       stbi__get32le(s); // discard sizeof
   4617       stbi__get32le(s); // discard hres
   4618       stbi__get32le(s); // discard vres
   4619       stbi__get32le(s); // discard colorsused
   4620       stbi__get32le(s); // discard max important
   4621       if (hsz == 40 || hsz == 56) {
   4622          if (hsz == 56) {
   4623             stbi__get32le(s);
   4624             stbi__get32le(s);
   4625             stbi__get32le(s);
   4626             stbi__get32le(s);
   4627          }
   4628          if (info->bpp == 16 || info->bpp == 32) {
   4629             if (compress == 0) {
   4630                if (info->bpp == 32) {
   4631                   info->mr = 0xffu << 16;
   4632                   info->mg = 0xffu <<  8;
   4633                   info->mb = 0xffu <<  0;
   4634                   info->ma = 0xffu << 24;
   4635                   info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
   4636                } else {
   4637                   info->mr = 31u << 10;
   4638                   info->mg = 31u <<  5;
   4639                   info->mb = 31u <<  0;
   4640                }
   4641             } else if (compress == 3) {
   4642                info->mr = stbi__get32le(s);
   4643                info->mg = stbi__get32le(s);
   4644                info->mb = stbi__get32le(s);
   4645                // not documented, but generated by photoshop and handled by mspaint
   4646                if (info->mr == info->mg && info->mg == info->mb) {
   4647                   // ?!?!?
   4648                   return stbi__errpuc("bad BMP", "bad BMP");
   4649                }
   4650             } else
   4651                return stbi__errpuc("bad BMP", "bad BMP");
   4652          }
   4653       } else {
   4654          int i;
   4655          if (hsz != 108 && hsz != 124)
   4656             return stbi__errpuc("bad BMP", "bad BMP");
   4657          info->mr = stbi__get32le(s);
   4658          info->mg = stbi__get32le(s);
   4659          info->mb = stbi__get32le(s);
   4660          info->ma = stbi__get32le(s);
   4661          stbi__get32le(s); // discard color space
   4662          for (i=0; i < 12; ++i)
   4663             stbi__get32le(s); // discard color space parameters
   4664          if (hsz == 124) {
   4665             stbi__get32le(s); // discard rendering intent
   4666             stbi__get32le(s); // discard offset of profile data
   4667             stbi__get32le(s); // discard size of profile data
   4668             stbi__get32le(s); // discard reserved
   4669          }
   4670       }
   4671    }
   4672    return (void *) 1;
   4673 }
   4674 
   4675 
   4676 static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
   4677 {
   4678    stbi_uc *out;
   4679    unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
   4680    stbi_uc pal[256][4];
   4681    int psize=0,i,j,width;
   4682    int flip_vertically, pad, target;
   4683    stbi__bmp_data info;
   4684    STBI_NOTUSED(ri);
   4685 
   4686    info.all_a = 255;
   4687    if (stbi__bmp_parse_header(s, &info) == NULL)
   4688       return NULL; // error code already set
   4689 
   4690    flip_vertically = ((int) s->img_y) > 0;
   4691    s->img_y = abs((int) s->img_y);
   4692 
   4693    mr = info.mr;
   4694    mg = info.mg;
   4695    mb = info.mb;
   4696    ma = info.ma;
   4697    all_a = info.all_a;
   4698 
   4699    if (info.hsz == 12) {
   4700       if (info.bpp < 24)
   4701          psize = (info.offset - 14 - 24) / 3;
   4702    } else {
   4703       if (info.bpp < 16)
   4704          psize = (info.offset - 14 - info.hsz) >> 2;
   4705    }
   4706 
   4707    s->img_n = ma ? 4 : 3;
   4708    if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
   4709       target = req_comp;
   4710    else
   4711       target = s->img_n; // if they want monochrome, we'll post-convert
   4712 
   4713    // sanity-check size
   4714    if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
   4715       return stbi__errpuc("too large", "Corrupt BMP");
   4716 
   4717    out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
   4718    if (!out) return stbi__errpuc("outofmem", "Out of memory");
   4719    if (info.bpp < 16) {
   4720       int z=0;
   4721       if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
   4722       for (i=0; i < psize; ++i) {
   4723          pal[i][2] = stbi__get8(s);
   4724          pal[i][1] = stbi__get8(s);
   4725          pal[i][0] = stbi__get8(s);
   4726          if (info.hsz != 12) stbi__get8(s);
   4727          pal[i][3] = 255;
   4728       }
   4729       stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
   4730       if (info.bpp == 1) width = (s->img_x + 7) >> 3;
   4731       else if (info.bpp == 4) width = (s->img_x + 1) >> 1;
   4732       else if (info.bpp == 8) width = s->img_x;
   4733       else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
   4734       pad = (-width)&3;
   4735       if (info.bpp == 1) {
   4736          for (j=0; j < (int) s->img_y; ++j) {
   4737             int bit_offset = 7, v = stbi__get8(s);
   4738             for (i=0; i < (int) s->img_x; ++i) {
   4739                int color = (v>>bit_offset)&0x1;
   4740                out[z++] = pal[color][0];
   4741                out[z++] = pal[color][1];
   4742                out[z++] = pal[color][2];
   4743                if((--bit_offset) < 0) {
   4744                   bit_offset = 7;
   4745                   v = stbi__get8(s);
   4746                }
   4747             }
   4748             stbi__skip(s, pad);
   4749          }
   4750       } else {
   4751          for (j=0; j < (int) s->img_y; ++j) {
   4752             for (i=0; i < (int) s->img_x; i += 2) {
   4753                int v=stbi__get8(s),v2=0;
   4754                if (info.bpp == 4) {
   4755                   v2 = v & 15;
   4756                   v >>= 4;
   4757                }
   4758                out[z++] = pal[v][0];
   4759                out[z++] = pal[v][1];
   4760                out[z++] = pal[v][2];
   4761                if (target == 4) out[z++] = 255;
   4762                if (i+1 == (int) s->img_x) break;
   4763                v = (info.bpp == 8) ? stbi__get8(s) : v2;
   4764                out[z++] = pal[v][0];
   4765                out[z++] = pal[v][1];
   4766                out[z++] = pal[v][2];
   4767                if (target == 4) out[z++] = 255;
   4768             }
   4769             stbi__skip(s, pad);
   4770          }
   4771       }
   4772    } else {
   4773       int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
   4774       int z = 0;
   4775       int easy=0;
   4776       stbi__skip(s, info.offset - 14 - info.hsz);
   4777       if (info.bpp == 24) width = 3 * s->img_x;
   4778       else if (info.bpp == 16) width = 2*s->img_x;
   4779       else /* bpp = 32 and pad = 0 */ width=0;
   4780       pad = (-width) & 3;
   4781       if (info.bpp == 24) {
   4782          easy = 1;
   4783       } else if (info.bpp == 32) {
   4784          if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
   4785             easy = 2;
   4786       }
   4787       if (!easy) {
   4788          if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
   4789          // right shift amt to put high bit in position #7
   4790          rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr);
   4791          gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg);
   4792          bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb);
   4793          ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma);
   4794       }
   4795       for (j=0; j < (int) s->img_y; ++j) {
   4796          if (easy) {
   4797             for (i=0; i < (int) s->img_x; ++i) {
   4798                unsigned char a;
   4799                out[z+2] = stbi__get8(s);
   4800                out[z+1] = stbi__get8(s);
   4801                out[z+0] = stbi__get8(s);
   4802                z += 3;
   4803                a = (easy == 2 ? stbi__get8(s) : 255);
   4804                all_a |= a;
   4805                if (target == 4) out[z++] = a;
   4806             }
   4807          } else {
   4808             int bpp = info.bpp;
   4809             for (i=0; i < (int) s->img_x; ++i) {
   4810                stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
   4811                unsigned int a;
   4812                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
   4813                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
   4814                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
   4815                a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
   4816                all_a |= a;
   4817                if (target == 4) out[z++] = STBI__BYTECAST(a);
   4818             }
   4819          }
   4820          stbi__skip(s, pad);
   4821       }
   4822    }
   4823 
   4824    // if alpha channel is all 0s, replace with all 255s
   4825    if (target == 4 && all_a == 0)
   4826       for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)
   4827          out[i] = 255;
   4828 
   4829    if (flip_vertically) {
   4830       stbi_uc t;
   4831       for (j=0; j < (int) s->img_y>>1; ++j) {
   4832          stbi_uc *p1 = out +      j     *s->img_x*target;
   4833          stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target;
   4834          for (i=0; i < (int) s->img_x*target; ++i) {
   4835             t = p1[i], p1[i] = p2[i], p2[i] = t;
   4836          }
   4837       }
   4838    }
   4839 
   4840    if (req_comp && req_comp != target) {
   4841       out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
   4842       if (out == NULL) return out; // stbi__convert_format frees input on failure
   4843    }
   4844 
   4845    *x = s->img_x;
   4846    *y = s->img_y;
   4847    if (comp) *comp = s->img_n;
   4848    return out;
   4849 }
   4850 #endif
   4851 
   4852 // Targa Truevision - TGA
   4853 // by Jonathan Dummer
   4854 #ifndef STBI_NO_TGA
   4855 // returns STBI_rgb or whatever, 0 on error
   4856 static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
   4857 {
   4858    // only RGB or RGBA (incl. 16bit) or grey allowed
   4859    if (is_rgb16) *is_rgb16 = 0;
   4860    switch(bits_per_pixel) {
   4861       case 8:  return STBI_grey;
   4862       case 16: if(is_grey) return STBI_grey_alpha;
   4863                // fallthrough
   4864       case 15: if(is_rgb16) *is_rgb16 = 1;
   4865                return STBI_rgb;
   4866       case 24: // fallthrough
   4867       case 32: return bits_per_pixel/8;
   4868       default: return 0;
   4869    }
   4870 }
   4871 
   4872 static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
   4873 {
   4874     int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp;
   4875     int sz, tga_colormap_type;
   4876     stbi__get8(s);                   // discard Offset
   4877     tga_colormap_type = stbi__get8(s); // colormap type
   4878     if( tga_colormap_type > 1 ) {
   4879         stbi__rewind(s);
   4880         return 0;      // only RGB or indexed allowed
   4881     }
   4882     tga_image_type = stbi__get8(s); // image type
   4883     if ( tga_colormap_type == 1 ) { // colormapped (paletted) image
   4884         if (tga_image_type != 1 && tga_image_type != 9) {
   4885             stbi__rewind(s);
   4886             return 0;
   4887         }
   4888         stbi__skip(s,4);       // skip index of first colormap entry and number of entries
   4889         sz = stbi__get8(s);    //   check bits per palette color entry
   4890         if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) {
   4891             stbi__rewind(s);
   4892             return 0;
   4893         }
   4894         stbi__skip(s,4);       // skip image x and y origin
   4895         tga_colormap_bpp = sz;
   4896     } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
   4897         if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) {
   4898             stbi__rewind(s);
   4899             return 0; // only RGB or grey allowed, +/- RLE
   4900         }
   4901         stbi__skip(s,9); // skip colormap specification and image x/y origin
   4902         tga_colormap_bpp = 0;
   4903     }
   4904     tga_w = stbi__get16le(s);
   4905     if( tga_w < 1 ) {
   4906         stbi__rewind(s);
   4907         return 0;   // test width
   4908     }
   4909     tga_h = stbi__get16le(s);
   4910     if( tga_h < 1 ) {
   4911         stbi__rewind(s);
   4912         return 0;   // test height
   4913     }
   4914     tga_bits_per_pixel = stbi__get8(s); // bits per pixel
   4915     stbi__get8(s); // ignore alpha bits
   4916     if (tga_colormap_bpp != 0) {
   4917         if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) {
   4918             // when using a colormap, tga_bits_per_pixel is the size of the indexes
   4919             // I don't think anything but 8 or 16bit indexes makes sense
   4920             stbi__rewind(s);
   4921             return 0;
   4922         }
   4923         tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL);
   4924     } else {
   4925         tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL);
   4926     }
   4927     if(!tga_comp) {
   4928       stbi__rewind(s);
   4929       return 0;
   4930     }
   4931     if (x) *x = tga_w;
   4932     if (y) *y = tga_h;
   4933     if (comp) *comp = tga_comp;
   4934     return 1;                   // seems to have passed everything
   4935 }
   4936 
   4937 static int stbi__tga_test(stbi__context *s)
   4938 {
   4939    int res = 0;
   4940    int sz, tga_color_type;
   4941    stbi__get8(s);      //   discard Offset
   4942    tga_color_type = stbi__get8(s);   //   color type
   4943    if ( tga_color_type > 1 ) goto errorEnd;   //   only RGB or indexed allowed
   4944    sz = stbi__get8(s);   //   image type
   4945    if ( tga_color_type == 1 ) { // colormapped (paletted) image
   4946       if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9
   4947       stbi__skip(s,4);       // skip index of first colormap entry and number of entries
   4948       sz = stbi__get8(s);    //   check bits per palette color entry
   4949       if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
   4950       stbi__skip(s,4);       // skip image x and y origin
   4951    } else { // "normal" image w/o colormap
   4952       if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE
   4953       stbi__skip(s,9); // skip colormap specification and image x/y origin
   4954    }
   4955    if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test width
   4956    if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test height
   4957    sz = stbi__get8(s);   //   bits per pixel
   4958    if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index
   4959    if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
   4960 
   4961    res = 1; // if we got this far, everything's good and we can return 1 instead of 0
   4962 
   4963 errorEnd:
   4964    stbi__rewind(s);
   4965    return res;
   4966 }
   4967 
   4968 // read 16bit value and convert to 24bit RGB
   4969 static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
   4970 {
   4971    stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
   4972    stbi__uint16 fiveBitMask = 31;
   4973    // we have 3 channels with 5bits each
   4974    int r = (px >> 10) & fiveBitMask;
   4975    int g = (px >> 5) & fiveBitMask;
   4976    int b = px & fiveBitMask;
   4977    // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later
   4978    out[0] = (stbi_uc)((r * 255)/31);
   4979    out[1] = (stbi_uc)((g * 255)/31);
   4980    out[2] = (stbi_uc)((b * 255)/31);
   4981 
   4982    // some people claim that the most significant bit might be used for alpha
   4983    // (possibly if an alpha-bit is set in the "image descriptor byte")
   4984    // but that only made 16bit test images completely translucent..
   4985    // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
   4986 }
   4987 
   4988 static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
   4989 {
   4990    //   read in the TGA header stuff
   4991    int tga_offset = stbi__get8(s);
   4992    int tga_indexed = stbi__get8(s);
   4993    int tga_image_type = stbi__get8(s);
   4994    int tga_is_RLE = 0;
   4995    int tga_palette_start = stbi__get16le(s);
   4996    int tga_palette_len = stbi__get16le(s);
   4997    int tga_palette_bits = stbi__get8(s);
   4998    int tga_x_origin = stbi__get16le(s);
   4999    int tga_y_origin = stbi__get16le(s);
   5000    int tga_width = stbi__get16le(s);
   5001    int tga_height = stbi__get16le(s);
   5002    int tga_bits_per_pixel = stbi__get8(s);
   5003    int tga_comp, tga_rgb16=0;
   5004    int tga_inverted = stbi__get8(s);
   5005    // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
   5006    //   image data
   5007    unsigned char *tga_data;
   5008    unsigned char *tga_palette = NULL;
   5009    int i, j;
   5010    unsigned char raw_data[4] = {0};
   5011    int RLE_count = 0;
   5012    int RLE_repeating = 0;
   5013    int read_next_pixel = 1;
   5014    STBI_NOTUSED(ri);
   5015 
   5016    //   do a tiny bit of precessing
   5017    if ( tga_image_type >= 8 )
   5018    {
   5019       tga_image_type -= 8;
   5020       tga_is_RLE = 1;
   5021    }
   5022    tga_inverted = 1 - ((tga_inverted >> 5) & 1);
   5023 
   5024    //   If I'm paletted, then I'll use the number of bits from the palette
   5025    if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
   5026    else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
   5027 
   5028    if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
   5029       return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
   5030 
   5031    //   tga info
   5032    *x = tga_width;
   5033    *y = tga_height;
   5034    if (comp) *comp = tga_comp;
   5035 
   5036    if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
   5037       return stbi__errpuc("too large", "Corrupt TGA");
   5038 
   5039    tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
   5040    if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
   5041 
   5042    // skip to the data's starting position (offset usually = 0)
   5043    stbi__skip(s, tga_offset );
   5044 
   5045    if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
   5046       for (i=0; i < tga_height; ++i) {
   5047          int row = tga_inverted ? tga_height -i - 1 : i;
   5048          stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
   5049          stbi__getn(s, tga_row, tga_width * tga_comp);
   5050       }
   5051    } else  {
   5052       //   do I need to load a palette?
   5053       if ( tga_indexed)
   5054       {
   5055          //   any data to skip? (offset usually = 0)
   5056          stbi__skip(s, tga_palette_start );
   5057          //   load the palette
   5058          tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
   5059          if (!tga_palette) {
   5060             STBI_FREE(tga_data);
   5061             return stbi__errpuc("outofmem", "Out of memory");
   5062          }
   5063          if (tga_rgb16) {
   5064             stbi_uc *pal_entry = tga_palette;
   5065             STBI_ASSERT(tga_comp == STBI_rgb);
   5066             for (i=0; i < tga_palette_len; ++i) {
   5067                stbi__tga_read_rgb16(s, pal_entry);
   5068                pal_entry += tga_comp;
   5069             }
   5070          } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
   5071                STBI_FREE(tga_data);
   5072                STBI_FREE(tga_palette);
   5073                return stbi__errpuc("bad palette", "Corrupt TGA");
   5074          }
   5075       }
   5076       //   load the data
   5077       for (i=0; i < tga_width * tga_height; ++i)
   5078       {
   5079          //   if I'm in RLE mode, do I need to get a RLE stbi__pngchunk?
   5080          if ( tga_is_RLE )
   5081          {
   5082             if ( RLE_count == 0 )
   5083             {
   5084                //   yep, get the next byte as a RLE command
   5085                int RLE_cmd = stbi__get8(s);
   5086                RLE_count = 1 + (RLE_cmd & 127);
   5087                RLE_repeating = RLE_cmd >> 7;
   5088                read_next_pixel = 1;
   5089             } else if ( !RLE_repeating )
   5090             {
   5091                read_next_pixel = 1;
   5092             }
   5093          } else
   5094          {
   5095             read_next_pixel = 1;
   5096          }
   5097          //   OK, if I need to read a pixel, do it now
   5098          if ( read_next_pixel )
   5099          {
   5100             //   load however much data we did have
   5101             if ( tga_indexed )
   5102             {
   5103                // read in index, then perform the lookup
   5104                int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
   5105                if ( pal_idx >= tga_palette_len ) {
   5106                   // invalid index
   5107                   pal_idx = 0;
   5108                }
   5109                pal_idx *= tga_comp;
   5110                for (j = 0; j < tga_comp; ++j) {
   5111                   raw_data[j] = tga_palette[pal_idx+j];
   5112                }
   5113             } else if(tga_rgb16) {
   5114                STBI_ASSERT(tga_comp == STBI_rgb);
   5115                stbi__tga_read_rgb16(s, raw_data);
   5116             } else {
   5117                //   read in the data raw
   5118                for (j = 0; j < tga_comp; ++j) {
   5119                   raw_data[j] = stbi__get8(s);
   5120                }
   5121             }
   5122             //   clear the reading flag for the next pixel
   5123             read_next_pixel = 0;
   5124          } // end of reading a pixel
   5125 
   5126          // copy data
   5127          for (j = 0; j < tga_comp; ++j)
   5128            tga_data[i*tga_comp+j] = raw_data[j];
   5129 
   5130          //   in case we're in RLE mode, keep counting down
   5131          --RLE_count;
   5132       }
   5133       //   do I need to invert the image?
   5134       if ( tga_inverted )
   5135       {
   5136          for (j = 0; j*2 < tga_height; ++j)
   5137          {
   5138             int index1 = j * tga_width * tga_comp;
   5139             int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
   5140             for (i = tga_width * tga_comp; i > 0; --i)
   5141             {
   5142                unsigned char temp = tga_data[index1];
   5143                tga_data[index1] = tga_data[index2];
   5144                tga_data[index2] = temp;
   5145                ++index1;
   5146                ++index2;
   5147             }
   5148          }
   5149       }
   5150       //   clear my palette, if I had one
   5151       if ( tga_palette != NULL )
   5152       {
   5153          STBI_FREE( tga_palette );
   5154       }
   5155    }
   5156 
   5157    // swap RGB - if the source data was RGB16, it already is in the right order
   5158    if (tga_comp >= 3 && !tga_rgb16)
   5159    {
   5160       unsigned char* tga_pixel = tga_data;
   5161       for (i=0; i < tga_width * tga_height; ++i)
   5162       {
   5163          unsigned char temp = tga_pixel[0];
   5164          tga_pixel[0] = tga_pixel[2];
   5165          tga_pixel[2] = temp;
   5166          tga_pixel += tga_comp;
   5167       }
   5168    }
   5169 
   5170    // convert to target component count
   5171    if (req_comp && req_comp != tga_comp)
   5172       tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height);
   5173 
   5174    //   the things I do to get rid of an error message, and yet keep
   5175    //   Microsoft's C compilers happy... [8^(
   5176    tga_palette_start = tga_palette_len = tga_palette_bits =
   5177          tga_x_origin = tga_y_origin = 0;
   5178    //   OK, done
   5179    return tga_data;
   5180 }
   5181 #endif
   5182 
   5183 // *************************************************************************************************
   5184 // Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB
   5185 
   5186 #ifndef STBI_NO_PSD
   5187 static int stbi__psd_test(stbi__context *s)
   5188 {
   5189    int r = (stbi__get32be(s) == 0x38425053);
   5190    stbi__rewind(s);
   5191    return r;
   5192 }
   5193 
   5194 static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
   5195 {
   5196    int count, nleft, len;
   5197 
   5198    count = 0;
   5199    while ((nleft = pixelCount - count) > 0) {
   5200       len = stbi__get8(s);
   5201       if (len == 128) {
   5202          // No-op.
   5203       } else if (len < 128) {
   5204          // Copy next len+1 bytes literally.
   5205          len++;
   5206          if (len > nleft) return 0; // corrupt data
   5207          count += len;
   5208          while (len) {
   5209             *p = stbi__get8(s);
   5210             p += 4;
   5211             len--;
   5212          }
   5213       } else if (len > 128) {
   5214          stbi_uc   val;
   5215          // Next -len+1 bytes in the dest are replicated from next source byte.
   5216          // (Interpret len as a negative 8-bit int.)
   5217          len = 257 - len;
   5218          if (len > nleft) return 0; // corrupt data
   5219          val = stbi__get8(s);
   5220          count += len;
   5221          while (len) {
   5222             *p = val;
   5223             p += 4;
   5224             len--;
   5225          }
   5226       }
   5227    }
   5228 
   5229    return 1;
   5230 }
   5231 
   5232 static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
   5233 {
   5234    int pixelCount;
   5235    int channelCount, compression;
   5236    int channel, i;
   5237    int bitdepth;
   5238    int w,h;
   5239    stbi_uc *out;
   5240    STBI_NOTUSED(ri);
   5241 
   5242    // Check identifier
   5243    if (stbi__get32be(s) != 0x38425053)   // "8BPS"
   5244       return stbi__errpuc("not PSD", "Corrupt PSD image");
   5245 
   5246    // Check file type version.
   5247    if (stbi__get16be(s) != 1)
   5248       return stbi__errpuc("wrong version", "Unsupported version of PSD image");
   5249 
   5250    // Skip 6 reserved bytes.
   5251    stbi__skip(s, 6 );
   5252 
   5253    // Read the number of channels (R, G, B, A, etc).
   5254    channelCount = stbi__get16be(s);
   5255    if (channelCount < 0 || channelCount > 16)
   5256       return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image");
   5257 
   5258    // Read the rows and columns of the image.
   5259    h = stbi__get32be(s);
   5260    w = stbi__get32be(s);
   5261 
   5262    // Make sure the depth is 8 bits.
   5263    bitdepth = stbi__get16be(s);
   5264    if (bitdepth != 8 && bitdepth != 16)
   5265       return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
   5266 
   5267    // Make sure the color mode is RGB.
   5268    // Valid options are:
   5269    //   0: Bitmap
   5270    //   1: Grayscale
   5271    //   2: Indexed color
   5272    //   3: RGB color
   5273    //   4: CMYK color
   5274    //   7: Multichannel
   5275    //   8: Duotone
   5276    //   9: Lab color
   5277    if (stbi__get16be(s) != 3)
   5278       return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
   5279 
   5280    // Skip the Mode Data.  (It's the palette for indexed color; other info for other modes.)
   5281    stbi__skip(s,stbi__get32be(s) );
   5282 
   5283    // Skip the image resources.  (resolution, pen tool paths, etc)
   5284    stbi__skip(s, stbi__get32be(s) );
   5285 
   5286    // Skip the reserved data.
   5287    stbi__skip(s, stbi__get32be(s) );
   5288 
   5289    // Find out if the data is compressed.
   5290    // Known values:
   5291    //   0: no compression
   5292    //   1: RLE compressed
   5293    compression = stbi__get16be(s);
   5294    if (compression > 1)
   5295       return stbi__errpuc("bad compression", "PSD has an unknown compression format");
   5296 
   5297    // Check size
   5298    if (!stbi__mad3sizes_valid(4, w, h, 0))
   5299       return stbi__errpuc("too large", "Corrupt PSD");
   5300 
   5301    // Create the destination image.
   5302 
   5303    if (!compression && bitdepth == 16 && bpc == 16) {
   5304       out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0);
   5305       ri->bits_per_channel = 16;
   5306    } else
   5307       out = (stbi_uc *) stbi__malloc(4 * w*h);
   5308 
   5309    if (!out) return stbi__errpuc("outofmem", "Out of memory");
   5310    pixelCount = w*h;
   5311 
   5312    // Initialize the data to zero.
   5313    //memset( out, 0, pixelCount * 4 );
   5314 
   5315    // Finally, the image data.
   5316    if (compression) {
   5317       // RLE as used by .PSD and .TIFF
   5318       // Loop until you get the number of unpacked bytes you are expecting:
   5319       //     Read the next source byte into n.
   5320       //     If n is between 0 and 127 inclusive, copy the next n+1 bytes literally.
   5321       //     Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times.
   5322       //     Else if n is 128, noop.
   5323       // Endloop
   5324 
   5325       // The RLE-compressed data is preceeded by a 2-byte data count for each row in the data,
   5326       // which we're going to just skip.
   5327       stbi__skip(s, h * channelCount * 2 );
   5328 
   5329       // Read the RLE data by channel.
   5330       for (channel = 0; channel < 4; channel++) {
   5331          stbi_uc *p;
   5332 
   5333          p = out+channel;
   5334          if (channel >= channelCount) {
   5335             // Fill this channel with default data.
   5336             for (i = 0; i < pixelCount; i++, p += 4)
   5337                *p = (channel == 3 ? 255 : 0);
   5338          } else {
   5339             // Read the RLE data.
   5340             if (!stbi__psd_decode_rle(s, p, pixelCount)) {
   5341                STBI_FREE(out);
   5342                return stbi__errpuc("corrupt", "bad RLE data");
   5343             }
   5344          }
   5345       }
   5346 
   5347    } else {
   5348       // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
   5349       // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
   5350 
   5351       // Read the data by channel.
   5352       for (channel = 0; channel < 4; channel++) {
   5353          if (channel >= channelCount) {
   5354             // Fill this channel with default data.
   5355             if (bitdepth == 16 && bpc == 16) {
   5356                stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
   5357                stbi__uint16 val = channel == 3 ? 65535 : 0;
   5358                for (i = 0; i < pixelCount; i++, q += 4)
   5359                   *q = val;
   5360             } else {
   5361                stbi_uc *p = out+channel;
   5362                stbi_uc val = channel == 3 ? 255 : 0;
   5363                for (i = 0; i < pixelCount; i++, p += 4)
   5364                   *p = val;
   5365             }
   5366          } else {
   5367             if (ri->bits_per_channel == 16) {    // output bpc
   5368                stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
   5369                for (i = 0; i < pixelCount; i++, q += 4)
   5370                   *q = (stbi__uint16) stbi__get16be(s);
   5371             } else {
   5372                stbi_uc *p = out+channel;
   5373                if (bitdepth == 16) {  // input bpc
   5374                   for (i = 0; i < pixelCount; i++, p += 4)
   5375                      *p = (stbi_uc) (stbi__get16be(s) >> 8);
   5376                } else {
   5377                   for (i = 0; i < pixelCount; i++, p += 4)
   5378                      *p = stbi__get8(s);
   5379                }
   5380             }
   5381          }
   5382       }
   5383    }
   5384 
   5385    // remove weird white matte from PSD
   5386    if (channelCount >= 4) {
   5387       if (ri->bits_per_channel == 16) {
   5388          for (i=0; i < w*h; ++i) {
   5389             stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i;
   5390             if (pixel[3] != 0 && pixel[3] != 65535) {
   5391                float a = pixel[3] / 65535.0f;
   5392                float ra = 1.0f / a;
   5393                float inv_a = 65535.0f * (1 - ra);
   5394                pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a);
   5395                pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a);
   5396                pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a);
   5397             }
   5398          }
   5399       } else {
   5400          for (i=0; i < w*h; ++i) {
   5401             unsigned char *pixel = out + 4*i;
   5402             if (pixel[3] != 0 && pixel[3] != 255) {
   5403                float a = pixel[3] / 255.0f;
   5404                float ra = 1.0f / a;
   5405                float inv_a = 255.0f * (1 - ra);
   5406                pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
   5407                pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
   5408                pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
   5409             }
   5410          }
   5411       }
   5412    }
   5413 
   5414    // convert to desired output format
   5415    if (req_comp && req_comp != 4) {
   5416       if (ri->bits_per_channel == 16)
   5417          out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h);
   5418       else
   5419          out = stbi__convert_format(out, 4, req_comp, w, h);
   5420       if (out == NULL) return out; // stbi__convert_format frees input on failure
   5421    }
   5422 
   5423    if (comp) *comp = 4;
   5424    *y = h;
   5425    *x = w;
   5426 
   5427    return out;
   5428 }
   5429 #endif
   5430 
   5431 // *************************************************************************************************
   5432 // Softimage PIC loader
   5433 // by Tom Seddon
   5434 //
   5435 // See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format
   5436 // See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/
   5437 
   5438 #ifndef STBI_NO_PIC
   5439 static int stbi__pic_is4(stbi__context *s,const char *str)
   5440 {
   5441    int i;
   5442    for (i=0; i<4; ++i)
   5443       if (stbi__get8(s) != (stbi_uc)str[i])
   5444          return 0;
   5445 
   5446    return 1;
   5447 }
   5448 
   5449 static int stbi__pic_test_core(stbi__context *s)
   5450 {
   5451    int i;
   5452 
   5453    if (!stbi__pic_is4(s,"\x53\x80\xF6\x34"))
   5454       return 0;
   5455 
   5456    for(i=0;i<84;++i)
   5457       stbi__get8(s);
   5458 
   5459    if (!stbi__pic_is4(s,"PICT"))
   5460       return 0;
   5461 
   5462    return 1;
   5463 }
   5464 
   5465 typedef struct
   5466 {
   5467    stbi_uc size,type,channel;
   5468 } stbi__pic_packet;
   5469 
   5470 static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest)
   5471 {
   5472    int mask=0x80, i;
   5473 
   5474    for (i=0; i<4; ++i, mask>>=1) {
   5475       if (channel & mask) {
   5476          if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short");
   5477          dest[i]=stbi__get8(s);
   5478       }
   5479    }
   5480 
   5481    return dest;
   5482 }
   5483 
   5484 static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src)
   5485 {
   5486    int mask=0x80,i;
   5487 
   5488    for (i=0;i<4; ++i, mask>>=1)
   5489       if (channel&mask)
   5490          dest[i]=src[i];
   5491 }
   5492 
   5493 static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result)
   5494 {
   5495    int act_comp=0,num_packets=0,y,chained;
   5496    stbi__pic_packet packets[10];
   5497 
   5498    // this will (should...) cater for even some bizarre stuff like having data
   5499     // for the same channel in multiple packets.
   5500    do {
   5501       stbi__pic_packet *packet;
   5502 
   5503       if (num_packets==sizeof(packets)/sizeof(packets[0]))
   5504          return stbi__errpuc("bad format","too many packets");
   5505 
   5506       packet = &packets[num_packets++];
   5507 
   5508       chained = stbi__get8(s);
   5509       packet->size    = stbi__get8(s);
   5510       packet->type    = stbi__get8(s);
   5511       packet->channel = stbi__get8(s);
   5512 
   5513       act_comp |= packet->channel;
   5514 
   5515       if (stbi__at_eof(s))          return stbi__errpuc("bad file","file too short (reading packets)");
   5516       if (packet->size != 8)  return stbi__errpuc("bad format","packet isn't 8bpp");
   5517    } while (chained);
   5518 
   5519    *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
   5520 
   5521    for(y=0; y<height; ++y) {
   5522       int packet_idx;
   5523 
   5524       for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
   5525          stbi__pic_packet *packet = &packets[packet_idx];
   5526          stbi_uc *dest = result+y*width*4;
   5527 
   5528          switch (packet->type) {
   5529             default:
   5530                return stbi__errpuc("bad format","packet has bad compression type");
   5531 
   5532             case 0: {//uncompressed
   5533                int x;
   5534 
   5535                for(x=0;x<width;++x, dest+=4)
   5536                   if (!stbi__readval(s,packet->channel,dest))
   5537                      return 0;
   5538                break;
   5539             }
   5540 
   5541             case 1://Pure RLE
   5542                {
   5543                   int left=width, i;
   5544 
   5545                   while (left>0) {
   5546                      stbi_uc count,value[4];
   5547 
   5548                      count=stbi__get8(s);
   5549                      if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
   5550 
   5551                      if (count > left)
   5552                         count = (stbi_uc) left;
   5553 
   5554                      if (!stbi__readval(s,packet->channel,value))  return 0;
   5555 
   5556                      for(i=0; i<count; ++i,dest+=4)
   5557                         stbi__copyval(packet->channel,dest,value);
   5558                      left -= count;
   5559                   }
   5560                }
   5561                break;
   5562 
   5563             case 2: {//Mixed RLE
   5564                int left=width;
   5565                while (left>0) {
   5566                   int count = stbi__get8(s), i;
   5567                   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
   5568 
   5569                   if (count >= 128) { // Repeated
   5570                      stbi_uc value[4];
   5571 
   5572                      if (count==128)
   5573                         count = stbi__get16be(s);
   5574                      else
   5575                         count -= 127;
   5576                      if (count > left)
   5577                         return stbi__errpuc("bad file","scanline overrun");
   5578 
   5579                      if (!stbi__readval(s,packet->channel,value))
   5580                         return 0;
   5581 
   5582                      for(i=0;i<count;++i, dest += 4)
   5583                         stbi__copyval(packet->channel,dest,value);
   5584                   } else { // Raw
   5585                      ++count;
   5586                      if (count>left) return stbi__errpuc("bad file","scanline overrun");
   5587 
   5588                      for(i=0;i<count;++i, dest+=4)
   5589                         if (!stbi__readval(s,packet->channel,dest))
   5590                            return 0;
   5591                   }
   5592                   left-=count;
   5593                }
   5594                break;
   5595             }
   5596          }
   5597       }
   5598    }
   5599 
   5600    return result;
   5601 }
   5602 
   5603 static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
   5604 {
   5605    stbi_uc *result;
   5606    int i, x,y, internal_comp;
   5607    STBI_NOTUSED(ri);
   5608 
   5609    if (!comp) comp = &internal_comp;
   5610 
   5611    for (i=0; i<92; ++i)
   5612       stbi__get8(s);
   5613 
   5614    x = stbi__get16be(s);
   5615    y = stbi__get16be(s);
   5616    if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
   5617    if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
   5618 
   5619    stbi__get32be(s); //skip `ratio'
   5620    stbi__get16be(s); //skip `fields'
   5621    stbi__get16be(s); //skip `pad'
   5622 
   5623    // intermediate buffer is RGBA
   5624    result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
   5625    memset(result, 0xff, x*y*4);
   5626 
   5627    if (!stbi__pic_load_core(s,x,y,comp, result)) {
   5628       STBI_FREE(result);
   5629       result=0;
   5630    }
   5631    *px = x;
   5632    *py = y;
   5633    if (req_comp == 0) req_comp = *comp;
   5634    result=stbi__convert_format(result,4,req_comp,x,y);
   5635 
   5636    return result;
   5637 }
   5638 
   5639 static int stbi__pic_test(stbi__context *s)
   5640 {
   5641    int r = stbi__pic_test_core(s);
   5642    stbi__rewind(s);
   5643    return r;
   5644 }
   5645 #endif
   5646 
   5647 // *************************************************************************************************
   5648 // GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
   5649 
   5650 #ifndef STBI_NO_GIF
   5651 typedef struct
   5652 {
   5653    stbi__int16 prefix;
   5654    stbi_uc first;
   5655    stbi_uc suffix;
   5656 } stbi__gif_lzw;
   5657 
   5658 typedef struct
   5659 {
   5660    int w,h;
   5661    stbi_uc *out;                 // output buffer (always 4 components)
   5662    stbi_uc *background;          // The current "background" as far as a gif is concerned
   5663    stbi_uc *history; 
   5664    int flags, bgindex, ratio, transparent, eflags;
   5665    stbi_uc  pal[256][4];
   5666    stbi_uc lpal[256][4];
   5667    stbi__gif_lzw codes[8192];
   5668    stbi_uc *color_table;
   5669    int parse, step;
   5670    int lflags;
   5671    int start_x, start_y;
   5672    int max_x, max_y;
   5673    int cur_x, cur_y;
   5674    int line_size;
   5675    int delay;
   5676 } stbi__gif;
   5677 
   5678 static int stbi__gif_test_raw(stbi__context *s)
   5679 {
   5680    int sz;
   5681    if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0;
   5682    sz = stbi__get8(s);
   5683    if (sz != '9' && sz != '7') return 0;
   5684    if (stbi__get8(s) != 'a') return 0;
   5685    return 1;
   5686 }
   5687 
   5688 static int stbi__gif_test(stbi__context *s)
   5689 {
   5690    int r = stbi__gif_test_raw(s);
   5691    stbi__rewind(s);
   5692    return r;
   5693 }
   5694 
   5695 static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp)
   5696 {
   5697    int i;
   5698    for (i=0; i < num_entries; ++i) {
   5699       pal[i][2] = stbi__get8(s);
   5700       pal[i][1] = stbi__get8(s);
   5701       pal[i][0] = stbi__get8(s);
   5702       pal[i][3] = transp == i ? 0 : 255;
   5703    }
   5704 }
   5705 
   5706 static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info)
   5707 {
   5708    stbi_uc version;
   5709    if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
   5710       return stbi__err("not GIF", "Corrupt GIF");
   5711 
   5712    version = stbi__get8(s);
   5713    if (version != '7' && version != '9')    return stbi__err("not GIF", "Corrupt GIF");
   5714    if (stbi__get8(s) != 'a')                return stbi__err("not GIF", "Corrupt GIF");
   5715 
   5716    stbi__g_failure_reason = "";
   5717    g->w = stbi__get16le(s);
   5718    g->h = stbi__get16le(s);
   5719    g->flags = stbi__get8(s);
   5720    g->bgindex = stbi__get8(s);
   5721    g->ratio = stbi__get8(s);
   5722    g->transparent = -1;
   5723 
   5724    if (comp != 0) *comp = 4;  // can't actually tell whether it's 3 or 4 until we parse the comments
   5725 
   5726    if (is_info) return 1;
   5727 
   5728    if (g->flags & 0x80)
   5729       stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1);
   5730 
   5731    return 1;
   5732 }
   5733 
   5734 static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
   5735 {
   5736    stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
   5737    if (!stbi__gif_header(s, g, comp, 1)) {
   5738       STBI_FREE(g);
   5739       stbi__rewind( s );
   5740       return 0;
   5741    }
   5742    if (x) *x = g->w;
   5743    if (y) *y = g->h;
   5744    STBI_FREE(g);
   5745    return 1;
   5746 }
   5747 
   5748 static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
   5749 {
   5750    stbi_uc *p, *c;
   5751    int idx; 
   5752 
   5753    // recurse to decode the prefixes, since the linked-list is backwards,
   5754    // and working backwards through an interleaved image would be nasty
   5755    if (g->codes[code].prefix >= 0)
   5756       stbi__out_gif_code(g, g->codes[code].prefix);
   5757 
   5758    if (g->cur_y >= g->max_y) return;
   5759 
   5760    idx = g->cur_x + g->cur_y; 
   5761    p = &g->out[idx];
   5762    g->history[idx / 4] = 1;  
   5763 
   5764    c = &g->color_table[g->codes[code].suffix * 4];
   5765    if (c[3] > 128) { // don't render transparent pixels; 
   5766       p[0] = c[2];
   5767       p[1] = c[1];
   5768       p[2] = c[0];
   5769       p[3] = c[3];
   5770    }
   5771    g->cur_x += 4;
   5772 
   5773    if (g->cur_x >= g->max_x) {
   5774       g->cur_x = g->start_x;
   5775       g->cur_y += g->step;
   5776 
   5777       while (g->cur_y >= g->max_y && g->parse > 0) {
   5778          g->step = (1 << g->parse) * g->line_size;
   5779          g->cur_y = g->start_y + (g->step >> 1);
   5780          --g->parse;
   5781       }
   5782    }
   5783 }
   5784 
   5785 static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
   5786 {
   5787    stbi_uc lzw_cs;
   5788    stbi__int32 len, init_code;
   5789    stbi__uint32 first;
   5790    stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
   5791    stbi__gif_lzw *p;
   5792 
   5793    lzw_cs = stbi__get8(s);
   5794    if (lzw_cs > 12) return NULL;
   5795    clear = 1 << lzw_cs;
   5796    first = 1;
   5797    codesize = lzw_cs + 1;
   5798    codemask = (1 << codesize) - 1;
   5799    bits = 0;
   5800    valid_bits = 0;
   5801    for (init_code = 0; init_code < clear; init_code++) {
   5802       g->codes[init_code].prefix = -1;
   5803       g->codes[init_code].first = (stbi_uc) init_code;
   5804       g->codes[init_code].suffix = (stbi_uc) init_code;
   5805    }
   5806 
   5807    // support no starting clear code
   5808    avail = clear+2;
   5809    oldcode = -1;
   5810 
   5811    len = 0;
   5812    for(;;) {
   5813       if (valid_bits < codesize) {
   5814          if (len == 0) {
   5815             len = stbi__get8(s); // start new block
   5816             if (len == 0)
   5817                return g->out;
   5818          }
   5819          --len;
   5820          bits |= (stbi__int32) stbi__get8(s) << valid_bits;
   5821          valid_bits += 8;
   5822       } else {
   5823          stbi__int32 code = bits & codemask;
   5824          bits >>= codesize;
   5825          valid_bits -= codesize;
   5826          // @OPTIMIZE: is there some way we can accelerate the non-clear path?
   5827          if (code == clear) {  // clear code
   5828             codesize = lzw_cs + 1;
   5829             codemask = (1 << codesize) - 1;
   5830             avail = clear + 2;
   5831             oldcode = -1;
   5832             first = 0;
   5833          } else if (code == clear + 1) { // end of stream code
   5834             stbi__skip(s, len);
   5835             while ((len = stbi__get8(s)) > 0)
   5836                stbi__skip(s,len);
   5837             return g->out;
   5838          } else if (code <= avail) {
   5839             if (first) {
   5840                return stbi__errpuc("no clear code", "Corrupt GIF");
   5841             }
   5842 
   5843             if (oldcode >= 0) {
   5844                p = &g->codes[avail++];
   5845                if (avail > 8192) {
   5846                   return stbi__errpuc("too many codes", "Corrupt GIF");
   5847                }
   5848 
   5849                p->prefix = (stbi__int16) oldcode;
   5850                p->first = g->codes[oldcode].first;
   5851                p->suffix = (code == avail) ? p->first : g->codes[code].first;
   5852             } else if (code == avail)
   5853                return stbi__errpuc("illegal code in raster", "Corrupt GIF");
   5854 
   5855             stbi__out_gif_code(g, (stbi__uint16) code);
   5856 
   5857             if ((avail & codemask) == 0 && avail <= 0x0FFF) {
   5858                codesize++;
   5859                codemask = (1 << codesize) - 1;
   5860             }
   5861 
   5862             oldcode = code;
   5863          } else {
   5864             return stbi__errpuc("illegal code in raster", "Corrupt GIF");
   5865          }
   5866       }
   5867    }
   5868 }
   5869 
   5870 // this function is designed to support animated gifs, although stb_image doesn't support it
   5871 // two back is the image from two frames ago, used for a very specific disposal format
   5872 static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back)
   5873 {
   5874    int dispose; 
   5875    int first_frame; 
   5876    int pi; 
   5877    int pcount; 
   5878 
   5879    // on first frame, any non-written pixels get the background colour (non-transparent)
   5880    first_frame = 0; 
   5881    if (g->out == 0) {
   5882       if (!stbi__gif_header(s, g, comp,0))     return 0; // stbi__g_failure_reason set by stbi__gif_header
   5883       g->out = (stbi_uc *) stbi__malloc(4 * g->w * g->h);
   5884       g->background = (stbi_uc *) stbi__malloc(4 * g->w * g->h); 
   5885       g->history = (stbi_uc *) stbi__malloc(g->w * g->h); 
   5886       if (g->out == 0)                      return stbi__errpuc("outofmem", "Out of memory");
   5887 
   5888       // image is treated as "tranparent" at the start - ie, nothing overwrites the current background; 
   5889       // background colour is only used for pixels that are not rendered first frame, after that "background"
   5890       // color refers to teh color that was there the previous frame. 
   5891       memset( g->out, 0x00, 4 * g->w * g->h ); 
   5892       memset( g->background, 0x00, 4 * g->w * g->h ); // state of the background (starts transparent)
   5893       memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
   5894       first_frame = 1; 
   5895    } else {
   5896       // second frame - how do we dispoase of the previous one?
   5897       dispose = (g->eflags & 0x1C) >> 2; 
   5898       pcount = g->w * g->h; 
   5899 
   5900       if ((dispose == 3) && (two_back == 0)) {
   5901          dispose = 2; // if I don't have an image to revert back to, default to the old background
   5902       }
   5903 
   5904       if (dispose == 3) { // use previous graphic
   5905          for (pi = 0; pi < pcount; ++pi) {
   5906             if (g->history[pi]) {
   5907                memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 ); 
   5908             }
   5909          }
   5910       } else if (dispose == 2) { 
   5911          // restore what was changed last frame to background before that frame; 
   5912          for (pi = 0; pi < pcount; ++pi) {
   5913             if (g->history[pi]) {
   5914                memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 ); 
   5915             }
   5916          }
   5917       } else {
   5918          // This is a non-disposal case eithe way, so just 
   5919          // leave the pixels as is, and they will become the new background
   5920          // 1: do not dispose
   5921          // 0:  not specified.
   5922       }
   5923 
   5924       // background is what out is after the undoing of the previou frame; 
   5925       memcpy( g->background, g->out, 4 * g->w * g->h ); 
   5926    }
   5927 
   5928    // clear my history; 
   5929    memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
   5930 
   5931    for (;;) {
   5932       int tag = stbi__get8(s); 
   5933       switch (tag) {
   5934          case 0x2C: /* Image Descriptor */
   5935          {
   5936             stbi__int32 x, y, w, h;
   5937             stbi_uc *o;
   5938 
   5939             x = stbi__get16le(s);
   5940             y = stbi__get16le(s);
   5941             w = stbi__get16le(s);
   5942             h = stbi__get16le(s);
   5943             if (((x + w) > (g->w)) || ((y + h) > (g->h)))
   5944                return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
   5945 
   5946             g->line_size = g->w * 4;
   5947             g->start_x = x * 4;
   5948             g->start_y = y * g->line_size;
   5949             g->max_x   = g->start_x + w * 4;
   5950             g->max_y   = g->start_y + h * g->line_size;
   5951             g->cur_x   = g->start_x;
   5952             g->cur_y   = g->start_y;
   5953 
   5954             g->lflags = stbi__get8(s);
   5955 
   5956             if (g->lflags & 0x40) {
   5957                g->step = 8 * g->line_size; // first interlaced spacing
   5958                g->parse = 3;
   5959             } else {
   5960                g->step = g->line_size;
   5961                g->parse = 0;
   5962             }
   5963 
   5964             if (g->lflags & 0x80) {
   5965                stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
   5966                g->color_table = (stbi_uc *) g->lpal;
   5967             } else if (g->flags & 0x80) {
   5968                g->color_table = (stbi_uc *) g->pal;
   5969             } else
   5970                return stbi__errpuc("missing color table", "Corrupt GIF");            
   5971             
   5972             o = stbi__process_gif_raster(s, g);
   5973             if (o == NULL) return NULL;
   5974 
   5975             // if this was the first frame, 
   5976             pcount = g->w * g->h; 
   5977             if (first_frame && (g->bgindex > 0)) {
   5978                // if first frame, any pixel not drawn to gets the background color
   5979                for (pi = 0; pi < pcount; ++pi) {
   5980                   if (g->history[pi] == 0) {
   5981                      g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be; 
   5982                      memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 ); 
   5983                   }
   5984                }
   5985             }
   5986 
   5987             return o;
   5988          }
   5989 
   5990          case 0x21: // Comment Extension.
   5991          {
   5992             int len;
   5993             int ext = stbi__get8(s); 
   5994             if (ext == 0xF9) { // Graphic Control Extension.
   5995                len = stbi__get8(s);
   5996                if (len == 4) {
   5997                   g->eflags = stbi__get8(s);
   5998                   g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
   5999 
   6000                   // unset old transparent
   6001                   if (g->transparent >= 0) {
   6002                      g->pal[g->transparent][3] = 255; 
   6003                   } 
   6004                   if (g->eflags & 0x01) {
   6005                      g->transparent = stbi__get8(s);
   6006                      if (g->transparent >= 0) {
   6007                         g->pal[g->transparent][3] = 0; 
   6008                      }
   6009                   } else {
   6010                      // don't need transparent
   6011                      stbi__skip(s, 1); 
   6012                      g->transparent = -1; 
   6013                   }
   6014                } else {
   6015                   stbi__skip(s, len);
   6016                   break;
   6017                }
   6018             } 
   6019             while ((len = stbi__get8(s)) != 0) {
   6020                stbi__skip(s, len);
   6021             }
   6022             break;
   6023          }
   6024 
   6025          case 0x3B: // gif stream termination code
   6026             return (stbi_uc *) s; // using '1' causes warning on some compilers
   6027 
   6028          default:
   6029             return stbi__errpuc("unknown code", "Corrupt GIF");
   6030       }
   6031    }
   6032 }
   6033 
   6034 static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
   6035 {
   6036    if (stbi__gif_test(s)) {
   6037       int layers = 0; 
   6038       stbi_uc *u = 0;
   6039       stbi_uc *out = 0;
   6040       stbi_uc *two_back = 0; 
   6041       stbi__gif g;
   6042       int stride; 
   6043       memset(&g, 0, sizeof(g));
   6044       if (delays) {
   6045          *delays = 0; 
   6046       }
   6047 
   6048       do {
   6049          u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
   6050          if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
   6051 
   6052          if (u) {
   6053             *x = g.w;
   6054             *y = g.h;
   6055             ++layers; 
   6056             stride = g.w * g.h * 4; 
   6057          
   6058             if (out) {
   6059                out = (stbi_uc*) STBI_REALLOC( out, layers * stride ); 
   6060                if (delays) {
   6061                   *delays = (int*) STBI_REALLOC( *delays, sizeof(int) * layers ); 
   6062                }
   6063             } else {
   6064                out = (stbi_uc*)stbi__malloc( layers * stride ); 
   6065                if (delays) {
   6066                   *delays = (int*) stbi__malloc( layers * sizeof(int) ); 
   6067                }
   6068             }
   6069             memcpy( out + ((layers - 1) * stride), u, stride ); 
   6070             if (layers >= 2) {
   6071                two_back = out - 2 * stride; 
   6072             }
   6073 
   6074             if (delays) {
   6075                (*delays)[layers - 1U] = g.delay; 
   6076             }
   6077          }
   6078       } while (u != 0); 
   6079 
   6080       // free temp buffer; 
   6081       STBI_FREE(g.out); 
   6082       STBI_FREE(g.history); 
   6083       STBI_FREE(g.background); 
   6084 
   6085       // do the final conversion after loading everything; 
   6086       if (req_comp && req_comp != 4)
   6087          out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
   6088 
   6089       *z = layers; 
   6090       return out;
   6091    } else {
   6092       return stbi__errpuc("not GIF", "Image was not as a gif type."); 
   6093    }
   6094 }
   6095 
   6096 static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
   6097 {
   6098    stbi_uc *u = 0;
   6099    stbi__gif g;
   6100    memset(&g, 0, sizeof(g));
   6101 
   6102    u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
   6103    if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
   6104    if (u) {
   6105       *x = g.w;
   6106       *y = g.h;
   6107 
   6108       // moved conversion to after successful load so that the same
   6109       // can be done for multiple frames. 
   6110       if (req_comp && req_comp != 4)
   6111          u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
   6112    }
   6113 
   6114    // free buffers needed for multiple frame loading; 
   6115    STBI_FREE(g.history);
   6116    STBI_FREE(g.background); 
   6117 
   6118    return u;
   6119 }
   6120 
   6121 static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
   6122 {
   6123    return stbi__gif_info_raw(s,x,y,comp);
   6124 }
   6125 #endif
   6126 
   6127 // *************************************************************************************************
   6128 // Radiance RGBE HDR loader
   6129 // originally by Nicolas Schulz
   6130 #ifndef STBI_NO_HDR
   6131 static int stbi__hdr_test_core(stbi__context *s, const char *signature)
   6132 {
   6133    int i;
   6134    for (i=0; signature[i]; ++i)
   6135       if (stbi__get8(s) != signature[i])
   6136           return 0;
   6137    stbi__rewind(s);
   6138    return 1;
   6139 }
   6140 
   6141 static int stbi__hdr_test(stbi__context* s)
   6142 {
   6143    int r = stbi__hdr_test_core(s, "#?RADIANCE\n");
   6144    stbi__rewind(s);
   6145    if(!r) {
   6146        r = stbi__hdr_test_core(s, "#?RGBE\n");
   6147        stbi__rewind(s);
   6148    }
   6149    return r;
   6150 }
   6151 
   6152 #define STBI__HDR_BUFLEN  1024
   6153 static char *stbi__hdr_gettoken(stbi__context *z, char *buffer)
   6154 {
   6155    int len=0;
   6156    char c = '\0';
   6157 
   6158    c = (char) stbi__get8(z);
   6159 
   6160    while (!stbi__at_eof(z) && c != '\n') {
   6161       buffer[len++] = c;
   6162       if (len == STBI__HDR_BUFLEN-1) {
   6163          // flush to end of line
   6164          while (!stbi__at_eof(z) && stbi__get8(z) != '\n')
   6165             ;
   6166          break;
   6167       }
   6168       c = (char) stbi__get8(z);
   6169    }
   6170 
   6171    buffer[len] = 0;
   6172    return buffer;
   6173 }
   6174 
   6175 static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
   6176 {
   6177    if ( input[3] != 0 ) {
   6178       float f1;
   6179       // Exponent
   6180       f1 = (float) ldexp(1.0f, input[3] - (int)(128 + 8));
   6181       if (req_comp <= 2)
   6182          output[0] = (input[0] + input[1] + input[2]) * f1 / 3;
   6183       else {
   6184          output[0] = input[0] * f1;
   6185          output[1] = input[1] * f1;
   6186          output[2] = input[2] * f1;
   6187       }
   6188       if (req_comp == 2) output[1] = 1;
   6189       if (req_comp == 4) output[3] = 1;
   6190    } else {
   6191       switch (req_comp) {
   6192          case 4: output[3] = 1; /* fallthrough */
   6193          case 3: output[0] = output[1] = output[2] = 0;
   6194                  break;
   6195          case 2: output[1] = 1; /* fallthrough */
   6196          case 1: output[0] = 0;
   6197                  break;
   6198       }
   6199    }
   6200 }
   6201 
   6202 static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
   6203 {
   6204    char buffer[STBI__HDR_BUFLEN];
   6205    char *token;
   6206    int valid = 0;
   6207    int width, height;
   6208    stbi_uc *scanline;
   6209    float *hdr_data;
   6210    int len;
   6211    unsigned char count, value;
   6212    int i, j, k, c1,c2, z;
   6213    const char *headerToken;
   6214    STBI_NOTUSED(ri);
   6215 
   6216    // Check identifier
   6217    headerToken = stbi__hdr_gettoken(s,buffer);
   6218    if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
   6219       return stbi__errpf("not HDR", "Corrupt HDR image");
   6220 
   6221    // Parse header
   6222    for(;;) {
   6223       token = stbi__hdr_gettoken(s,buffer);
   6224       if (token[0] == 0) break;
   6225       if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
   6226    }
   6227 
   6228    if (!valid)    return stbi__errpf("unsupported format", "Unsupported HDR format");
   6229 
   6230    // Parse width and height
   6231    // can't use sscanf() if we're not using stdio!
   6232    token = stbi__hdr_gettoken(s,buffer);
   6233    if (strncmp(token, "-Y ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
   6234    token += 3;
   6235    height = (int) strtol(token, &token, 10);
   6236    while (*token == ' ') ++token;
   6237    if (strncmp(token, "+X ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
   6238    token += 3;
   6239    width = (int) strtol(token, NULL, 10);
   6240 
   6241    *x = width;
   6242    *y = height;
   6243 
   6244    if (comp) *comp = 3;
   6245    if (req_comp == 0) req_comp = 3;
   6246 
   6247    if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
   6248       return stbi__errpf("too large", "HDR image is too large");
   6249 
   6250    // Read data
   6251    hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
   6252    if (!hdr_data)
   6253       return stbi__errpf("outofmem", "Out of memory");
   6254 
   6255    // Load image data
   6256    // image data is stored as some number of sca
   6257    if ( width < 8 || width >= 32768) {
   6258       // Read flat data
   6259       for (j=0; j < height; ++j) {
   6260          for (i=0; i < width; ++i) {
   6261             stbi_uc rgbe[4];
   6262            main_decode_loop:
   6263             stbi__getn(s, rgbe, 4);
   6264             stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
   6265          }
   6266       }
   6267    } else {
   6268       // Read RLE-encoded data
   6269       scanline = NULL;
   6270 
   6271       for (j = 0; j < height; ++j) {
   6272          c1 = stbi__get8(s);
   6273          c2 = stbi__get8(s);
   6274          len = stbi__get8(s);
   6275          if (c1 != 2 || c2 != 2 || (len & 0x80)) {
   6276             // not run-length encoded, so we have to actually use THIS data as a decoded
   6277             // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
   6278             stbi_uc rgbe[4];
   6279             rgbe[0] = (stbi_uc) c1;
   6280             rgbe[1] = (stbi_uc) c2;
   6281             rgbe[2] = (stbi_uc) len;
   6282             rgbe[3] = (stbi_uc) stbi__get8(s);
   6283             stbi__hdr_convert(hdr_data, rgbe, req_comp);
   6284             i = 1;
   6285             j = 0;
   6286             STBI_FREE(scanline);
   6287             goto main_decode_loop; // yes, this makes no sense
   6288          }
   6289          len <<= 8;
   6290          len |= stbi__get8(s);
   6291          if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
   6292          if (scanline == NULL) {
   6293             scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
   6294             if (!scanline) {
   6295                STBI_FREE(hdr_data);
   6296                return stbi__errpf("outofmem", "Out of memory");
   6297             }
   6298          }
   6299 
   6300          for (k = 0; k < 4; ++k) {
   6301             int nleft;
   6302             i = 0;
   6303             while ((nleft = width - i) > 0) {
   6304                count = stbi__get8(s);
   6305                if (count > 128) {
   6306                   // Run
   6307                   value = stbi__get8(s);
   6308                   count -= 128;
   6309                   if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
   6310                   for (z = 0; z < count; ++z)
   6311                      scanline[i++ * 4 + k] = value;
   6312                } else {
   6313                   // Dump
   6314                   if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
   6315                   for (z = 0; z < count; ++z)
   6316                      scanline[i++ * 4 + k] = stbi__get8(s);
   6317                }
   6318             }
   6319          }
   6320          for (i=0; i < width; ++i)
   6321             stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
   6322       }
   6323       if (scanline)
   6324          STBI_FREE(scanline);
   6325    }
   6326 
   6327    return hdr_data;
   6328 }
   6329 
   6330 static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
   6331 {
   6332    char buffer[STBI__HDR_BUFLEN];
   6333    char *token;
   6334    int valid = 0;
   6335    int dummy;
   6336 
   6337    if (!x) x = &dummy;
   6338    if (!y) y = &dummy;
   6339    if (!comp) comp = &dummy;
   6340 
   6341    if (stbi__hdr_test(s) == 0) {
   6342        stbi__rewind( s );
   6343        return 0;
   6344    }
   6345 
   6346    for(;;) {
   6347       token = stbi__hdr_gettoken(s,buffer);
   6348       if (token[0] == 0) break;
   6349       if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
   6350    }
   6351 
   6352    if (!valid) {
   6353        stbi__rewind( s );
   6354        return 0;
   6355    }
   6356    token = stbi__hdr_gettoken(s,buffer);
   6357    if (strncmp(token, "-Y ", 3)) {
   6358        stbi__rewind( s );
   6359        return 0;
   6360    }
   6361    token += 3;
   6362    *y = (int) strtol(token, &token, 10);
   6363    while (*token == ' ') ++token;
   6364    if (strncmp(token, "+X ", 3)) {
   6365        stbi__rewind( s );
   6366        return 0;
   6367    }
   6368    token += 3;
   6369    *x = (int) strtol(token, NULL, 10);
   6370    *comp = 3;
   6371    return 1;
   6372 }
   6373 #endif // STBI_NO_HDR
   6374 
   6375 #ifndef STBI_NO_BMP
   6376 static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
   6377 {
   6378    void *p;
   6379    stbi__bmp_data info;
   6380 
   6381    info.all_a = 255;
   6382    p = stbi__bmp_parse_header(s, &info);
   6383    stbi__rewind( s );
   6384    if (p == NULL)
   6385       return 0;
   6386    if (x) *x = s->img_x;
   6387    if (y) *y = s->img_y;
   6388    if (comp) *comp = info.ma ? 4 : 3;
   6389    return 1;
   6390 }
   6391 #endif
   6392 
   6393 #ifndef STBI_NO_PSD
   6394 static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
   6395 {
   6396    int channelCount, dummy, depth;
   6397    if (!x) x = &dummy;
   6398    if (!y) y = &dummy;
   6399    if (!comp) comp = &dummy;
   6400    if (stbi__get32be(s) != 0x38425053) {
   6401        stbi__rewind( s );
   6402        return 0;
   6403    }
   6404    if (stbi__get16be(s) != 1) {
   6405        stbi__rewind( s );
   6406        return 0;
   6407    }
   6408    stbi__skip(s, 6);
   6409    channelCount = stbi__get16be(s);
   6410    if (channelCount < 0 || channelCount > 16) {
   6411        stbi__rewind( s );
   6412        return 0;
   6413    }
   6414    *y = stbi__get32be(s);
   6415    *x = stbi__get32be(s);
   6416    depth = stbi__get16be(s);
   6417    if (depth != 8 && depth != 16) {
   6418        stbi__rewind( s );
   6419        return 0;
   6420    }
   6421    if (stbi__get16be(s) != 3) {
   6422        stbi__rewind( s );
   6423        return 0;
   6424    }
   6425    *comp = 4;
   6426    return 1;
   6427 }
   6428 
   6429 static int stbi__psd_is16(stbi__context *s)
   6430 {
   6431    int channelCount, depth;
   6432    if (stbi__get32be(s) != 0x38425053) {
   6433        stbi__rewind( s );
   6434        return 0;
   6435    }
   6436    if (stbi__get16be(s) != 1) {
   6437        stbi__rewind( s );
   6438        return 0;
   6439    }
   6440    stbi__skip(s, 6);
   6441    channelCount = stbi__get16be(s);
   6442    if (channelCount < 0 || channelCount > 16) {
   6443        stbi__rewind( s );
   6444        return 0;
   6445    }
   6446    (void) stbi__get32be(s);
   6447    (void) stbi__get32be(s);
   6448    depth = stbi__get16be(s);
   6449    if (depth != 16) {
   6450        stbi__rewind( s );
   6451        return 0;
   6452    }
   6453    return 1;
   6454 }
   6455 #endif
   6456 
   6457 #ifndef STBI_NO_PIC
   6458 static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
   6459 {
   6460    int act_comp=0,num_packets=0,chained,dummy;
   6461    stbi__pic_packet packets[10];
   6462 
   6463    if (!x) x = &dummy;
   6464    if (!y) y = &dummy;
   6465    if (!comp) comp = &dummy;
   6466 
   6467    if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
   6468       stbi__rewind(s);
   6469       return 0;
   6470    }
   6471 
   6472    stbi__skip(s, 88);
   6473 
   6474    *x = stbi__get16be(s);
   6475    *y = stbi__get16be(s);
   6476    if (stbi__at_eof(s)) {
   6477       stbi__rewind( s);
   6478       return 0;
   6479    }
   6480    if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) {
   6481       stbi__rewind( s );
   6482       return 0;
   6483    }
   6484 
   6485    stbi__skip(s, 8);
   6486 
   6487    do {
   6488       stbi__pic_packet *packet;
   6489 
   6490       if (num_packets==sizeof(packets)/sizeof(packets[0]))
   6491          return 0;
   6492 
   6493       packet = &packets[num_packets++];
   6494       chained = stbi__get8(s);
   6495       packet->size    = stbi__get8(s);
   6496       packet->type    = stbi__get8(s);
   6497       packet->channel = stbi__get8(s);
   6498       act_comp |= packet->channel;
   6499 
   6500       if (stbi__at_eof(s)) {
   6501           stbi__rewind( s );
   6502           return 0;
   6503       }
   6504       if (packet->size != 8) {
   6505           stbi__rewind( s );
   6506           return 0;
   6507       }
   6508    } while (chained);
   6509 
   6510    *comp = (act_comp & 0x10 ? 4 : 3);
   6511 
   6512    return 1;
   6513 }
   6514 #endif
   6515 
   6516 // *************************************************************************************************
   6517 // Portable Gray Map and Portable Pixel Map loader
   6518 // by Ken Miller
   6519 //
   6520 // PGM: http://netpbm.sourceforge.net/doc/pgm.html
   6521 // PPM: http://netpbm.sourceforge.net/doc/ppm.html
   6522 //
   6523 // Known limitations:
   6524 //    Does not support comments in the header section
   6525 //    Does not support ASCII image data (formats P2 and P3)
   6526 //    Does not support 16-bit-per-channel
   6527 
   6528 #ifndef STBI_NO_PNM
   6529 
   6530 static int      stbi__pnm_test(stbi__context *s)
   6531 {
   6532    char p, t;
   6533    p = (char) stbi__get8(s);
   6534    t = (char) stbi__get8(s);
   6535    if (p != 'P' || (t != '5' && t != '6')) {
   6536        stbi__rewind( s );
   6537        return 0;
   6538    }
   6539    return 1;
   6540 }
   6541 
   6542 static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
   6543 {
   6544    stbi_uc *out;
   6545    STBI_NOTUSED(ri);
   6546 
   6547    if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n))
   6548       return 0;
   6549 
   6550    *x = s->img_x;
   6551    *y = s->img_y;
   6552    if (comp) *comp = s->img_n;
   6553 
   6554    if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
   6555       return stbi__errpuc("too large", "PNM too large");
   6556 
   6557    out = (stbi_uc *) stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
   6558    if (!out) return stbi__errpuc("outofmem", "Out of memory");
   6559    stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
   6560 
   6561    if (req_comp && req_comp != s->img_n) {
   6562       out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
   6563       if (out == NULL) return out; // stbi__convert_format frees input on failure
   6564    }
   6565    return out;
   6566 }
   6567 
   // Returns 1 for space, tab, newline, vertical tab, form feed or carriage
   // return; 0 for anything else.
   static int      stbi__pnm_isspace(char c)
   {
      switch (c) {
         case ' ':
         case '\t':
         case '\n':
         case '\v':
         case '\f':
         case '\r':
            return 1;
         default:
            return 0;
      }
   }
   6572 
   6573 static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
   6574 {
   6575    for (;;) {
   6576       while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
   6577          *c = (char) stbi__get8(s);
   6578 
   6579       if (stbi__at_eof(s) || *c != '#')
   6580          break;
   6581 
   6582       while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' )
   6583          *c = (char) stbi__get8(s);
   6584    }
   6585 }
   6586 
   // Returns 1 when c is one of '0'..'9', otherwise 0.
   static int      stbi__pnm_isdigit(char c)
   {
      if (c < '0')
         return 0;
      if (c > '9')
         return 0;
      return 1;
   }
   6591 
   6592 static int      stbi__pnm_getinteger(stbi__context *s, char *c)
   6593 {
   6594    int value = 0;
   6595 
   6596    while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
   6597       value = value*10 + (*c - '0');
   6598       *c = (char) stbi__get8(s);
   6599    }
   6600 
   6601    return value;
   6602 }
   6603 
   6604 static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
   6605 {
   6606    int maxv, dummy;
   6607    char c, p, t;
   6608 
   6609    if (!x) x = &dummy;
   6610    if (!y) y = &dummy;
   6611    if (!comp) comp = &dummy;
   6612 
   6613    stbi__rewind(s);
   6614 
   6615    // Get identifier
   6616    p = (char) stbi__get8(s);
   6617    t = (char) stbi__get8(s);
   6618    if (p != 'P' || (t != '5' && t != '6')) {
   6619        stbi__rewind(s);
   6620        return 0;
   6621    }
   6622 
   6623    *comp = (t == '6') ? 3 : 1;  // '5' is 1-component .pgm; '6' is 3-component .ppm
   6624 
   6625    c = (char) stbi__get8(s);
   6626    stbi__pnm_skip_whitespace(s, &c);
   6627 
   6628    *x = stbi__pnm_getinteger(s, &c); // read width
   6629    stbi__pnm_skip_whitespace(s, &c);
   6630 
   6631    *y = stbi__pnm_getinteger(s, &c); // read height
   6632    stbi__pnm_skip_whitespace(s, &c);
   6633 
   6634    maxv = stbi__pnm_getinteger(s, &c);  // read max value
   6635 
   6636    if (maxv > 255)
   6637       return stbi__err("max value > 255", "PPM image not 8-bit");
   6638    else
   6639       return 1;
   6640 }
   6641 #endif
   6642 
   6643 static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
   6644 {
   6645    #ifndef STBI_NO_JPEG
   6646    if (stbi__jpeg_info(s, x, y, comp)) return 1;
   6647    #endif
   6648 
   6649    #ifndef STBI_NO_PNG
   6650    if (stbi__png_info(s, x, y, comp))  return 1;
   6651    #endif
   6652 
   6653    #ifndef STBI_NO_GIF
   6654    if (stbi__gif_info(s, x, y, comp))  return 1;
   6655    #endif
   6656 
   6657    #ifndef STBI_NO_BMP
   6658    if (stbi__bmp_info(s, x, y, comp))  return 1;
   6659    #endif
   6660 
   6661    #ifndef STBI_NO_PSD
   6662    if (stbi__psd_info(s, x, y, comp))  return 1;
   6663    #endif
   6664 
   6665    #ifndef STBI_NO_PIC
   6666    if (stbi__pic_info(s, x, y, comp))  return 1;
   6667    #endif
   6668 
   6669    #ifndef STBI_NO_PNM
   6670    if (stbi__pnm_info(s, x, y, comp))  return 1;
   6671    #endif
   6672 
   6673    #ifndef STBI_NO_HDR
   6674    if (stbi__hdr_info(s, x, y, comp))  return 1;
   6675    #endif
   6676 
   6677    // test tga last because it's a crappy test!
   6678    #ifndef STBI_NO_TGA
   6679    if (stbi__tga_info(s, x, y, comp))
   6680        return 1;
   6681    #endif
   6682    return stbi__err("unknown image type", "Image not of any known type, or corrupt");
   6683 }
   6684 
   6685 static int stbi__is_16_main(stbi__context *s)
   6686 {
   6687    #ifndef STBI_NO_PNG
   6688    if (stbi__png_is16(s))  return 1;
   6689    #endif
   6690 
   6691    #ifndef STBI_NO_PSD
   6692    if (stbi__psd_is16(s))  return 1;
   6693    #endif
   6694 
   6695    return 0;
   6696 }
   6697 
   6698 #ifndef STBI_NO_STDIO
   6699 STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
   6700 {
   6701     FILE *f = stbi__fopen(filename, "rb");
   6702     int result;
   6703     if (!f) return stbi__err("can't fopen", "Unable to open file");
   6704     result = stbi_info_from_file(f, x, y, comp);
   6705     fclose(f);
   6706     return result;
   6707 }
   6708 
   6709 STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
   6710 {
   6711    int r;
   6712    stbi__context s;
   6713    long pos = ftell(f);
   6714    stbi__start_file(&s, f);
   6715    r = stbi__info_main(&s,x,y,comp);
   6716    fseek(f,pos,SEEK_SET);
   6717    return r;
   6718 }
   6719 
   6720 STBIDEF int stbi_is_16_bit(char const *filename)
   6721 {
   6722     FILE *f = stbi__fopen(filename, "rb");
   6723     int result;
   6724     if (!f) return stbi__err("can't fopen", "Unable to open file");
   6725     result = stbi_is_16_bit_from_file(f);
   6726     fclose(f);
   6727     return result;
   6728 }
   6729 
   6730 STBIDEF int stbi_is_16_bit_from_file(FILE *f)
   6731 {
   6732    int r;
   6733    stbi__context s;
   6734    long pos = ftell(f);
   6735    stbi__start_file(&s, f);
   6736    r = stbi__is_16_main(&s);
   6737    fseek(f,pos,SEEK_SET);
   6738    return r;
   6739 }
   6740 #endif // !STBI_NO_STDIO
   6741 
   6742 STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
   6743 {
   6744    stbi__context s;
   6745    stbi__start_mem(&s,buffer,len);
   6746    return stbi__info_main(&s,x,y,comp);
   6747 }
   6748 
   6749 STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp)
   6750 {
   6751    stbi__context s;
   6752    stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
   6753    return stbi__info_main(&s,x,y,comp);
   6754 }
   6755 
   6756 STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len)
   6757 {
   6758    stbi__context s;
   6759    stbi__start_mem(&s,buffer,len);
   6760    return stbi__is_16_main(&s);
   6761 }
   6762 
   6763 STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user)
   6764 {
   6765    stbi__context s;
   6766    stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
   6767    return stbi__is_16_main(&s);
   6768 }
   6769