From 029dc67524ad1d2fc9737de4824810e61ce201b0 Mon Sep 17 00:00:00 2001 From: leblane Date: Fri, 28 Apr 2023 15:47:06 +0300 Subject: [PATCH] Initial code commit --- LICENCE.txt | 109 + README.md | 14 + Source/HalfFloat.cpp | 1039 +++ Source/HalfFloat.h | 8 + Source/KHR/khr_df.h | 619 ++ Source/Main.cpp | 619 ++ Source/createdfd.cpp | 659 ++ Source/dfd.h | 173 + Source/ispc_texcomp/ispc_texcomp.cpp | 557 ++ Source/ispc_texcomp/ispc_texcomp.def | 30 + Source/ispc_texcomp/ispc_texcomp.h | 128 + Source/ispc_texcomp/ispc_texcomp.vcxproj | 177 + .../ispc_texcomp/ispc_texcomp.vcxproj.filters | 62 + Source/ispc_texcomp/ispc_texcomp_astc.cpp | 564 ++ Source/ispc_texcomp/kernel.ispc | 3798 ++++++++ Source/ispc_texcomp/kernel_astc.ispc | 2272 +++++ Source/meson.build | 26 + Source/stb_image.cpp | 2 + Source/stb_image.h | 7987 +++++++++++++++++ Source/stb_image_resize.cpp | 2 + Source/stb_image_resize.h | 2634 ++++++ Source/vk2dfd.cpp | 33 + Source/vk2dfd.inl | 294 + build/.keep | 0 meson.build | 3 + 25 files changed, 21809 insertions(+) create mode 100644 LICENCE.txt create mode 100644 README.md create mode 100644 Source/HalfFloat.cpp create mode 100644 Source/HalfFloat.h create mode 100644 Source/KHR/khr_df.h create mode 100644 Source/Main.cpp create mode 100644 Source/createdfd.cpp create mode 100644 Source/dfd.h create mode 100644 Source/ispc_texcomp/ispc_texcomp.cpp create mode 100644 Source/ispc_texcomp/ispc_texcomp.def create mode 100644 Source/ispc_texcomp/ispc_texcomp.h create mode 100644 Source/ispc_texcomp/ispc_texcomp.vcxproj create mode 100644 Source/ispc_texcomp/ispc_texcomp.vcxproj.filters create mode 100644 Source/ispc_texcomp/ispc_texcomp_astc.cpp create mode 100644 Source/ispc_texcomp/kernel.ispc create mode 100644 Source/ispc_texcomp/kernel_astc.ispc create mode 100644 Source/meson.build create mode 100644 Source/stb_image.cpp create mode 100644 Source/stb_image.h create mode 100644 Source/stb_image_resize.cpp create mode 100644 Source/stb_image_resize.h 
create mode 100644 Source/vk2dfd.cpp create mode 100644 Source/vk2dfd.inl create mode 100644 build/.keep create mode 100644 meson.build diff --git a/LICENCE.txt b/LICENCE.txt new file mode 100644 index 0000000..7ebf941 --- /dev/null +++ b/LICENCE.txt @@ -0,0 +1,109 @@ +TextureTaffy +============ + +MIT License + +Copyright (c) 2023 leblane + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +stb_image and stb_image_resize +------------------------------ +[stb_image and stb_image_resize] are available under 2 licenses -- choose whichever you prefer. 
+ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +------------------------------------------------------------------------------ + + +ISPC Texture Compressor +----------------------- + +Copyright 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +Khronos DFD Components +---------------------- + +** Copyright 2015-2020 The Khronos Group Inc. +** SPDX-License-Identifier: Apache-2.0 + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..e74b8a1 --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +# TextureTaffy + +A utility to create compressed textures, in BC1 (DXT1), BC3 (DXT5), BC4, BC5, BC6(U)H and BC7 compression formats, with the [KTX File Format Version 2.0](https://registry.khronos.org/KTX/specs/2.0/ktxspec.v2.html) (KTX2). + +## Requirements + +* [The Meson build system](https://mesonbuild.com/) +* [IntelĀ® Implicit SPMD Program Compiler](https://ispc.github.io/) + +## Building + + + +## Notes and limitations \ No newline at end of file diff --git a/Source/HalfFloat.cpp b/Source/HalfFloat.cpp new file mode 100644 index 0000000..658e984 --- /dev/null +++ b/Source/HalfFloat.cpp @@ -0,0 +1,1039 @@ +#include "HalfFloat.h" + +namespace HalfFloat { + static const uint16_t basetable[512] = { + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 
0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0001, + 0x0002, + 0x0004, + 0x0008, + 0x0010, + 0x0020, + 0x0040, + 0x0080, + 0x0100, + 0x0200, + 0x0400, + 0x0800, + 0x0c00, + 0x1000, + 0x1400, + 0x1800, + 0x1c00, + 0x2000, + 0x2400, + 0x2800, + 0x2c00, + 0x3000, + 0x3400, + 0x3800, + 0x3c00, + 0x4000, + 0x4400, + 0x4800, + 0x4c00, + 0x5000, + 0x5400, + 0x5800, + 0x5c00, + 0x6000, + 0x6400, + 0x6800, + 0x6c00, + 0x7000, + 0x7400, + 0x7800, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x7c00, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 
0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8000, + 0x8001, + 0x8002, + 0x8004, + 0x8008, + 0x8010, + 0x8020, + 0x8040, + 0x8080, + 0x8100, + 0x8200, + 0x8400, + 0x8800, + 0x8c00, + 0x9000, + 0x9400, + 0x9800, + 0x9c00, + 0xa000, + 0xa400, + 0xa800, + 0xac00, + 0xb000, + 0xb400, + 0xb800, + 0xbc00, + 0xc000, + 0xc400, + 0xc800, + 0xcc00, + 0xd000, + 0xd400, + 0xd800, + 0xdc00, + 0xe000, + 0xe400, + 0xe800, + 0xec00, + 0xf000, + 0xf400, + 0xf800, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 
0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00, + 0xfc00 + }; + + static const uint8_t shifttable[512] = { + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x17, + 0x16, + 0x15, + 0x14, + 0x13, + 0x12, + 0x11, + 0x10, + 0x0f, + 0x0e, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 
0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x0d, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x17, + 0x16, + 0x15, + 0x14, + 0x13, + 0x12, + 0x11, + 0x10, + 0x0f, + 0x0e, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x0d, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 
0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x18, + 0x0d + }; + + uint16_t FromFloat(float x) + { + uint32_t f = *reinterpret_cast(&x); + return basetable[(f >> 23) & 0x1ff] + ((f & 0x007fffff) >> shifttable[(f >> 23) & 0x1ff]); + } +} diff --git a/Source/HalfFloat.h b/Source/HalfFloat.h new file mode 100644 index 0000000..85a7303 --- /dev/null +++ b/Source/HalfFloat.h @@ -0,0 +1,8 @@ +#pragma once + +#include + +namespace HalfFloat +{ + uint16_t FromFloat(float x); +}; \ No newline at end of file diff --git a/Source/KHR/khr_df.h b/Source/KHR/khr_df.h new file mode 100644 index 0000000..bbd0d14 --- /dev/null +++ b/Source/KHR/khr_df.h @@ -0,0 +1,619 @@ +/* The Khronos Data Format Specification (version 1.3) */ +/* +** Copyright 2015-2020 The Khronos Group Inc. +** SPDX-License-Identifier: Apache-2.0 +*/ + +/* This header defines a structure that can describe the layout of image + formats in memory. This means that the data format is transparent to + the application, and the expectation is that this should be used when + the layout is defined external to the API. Many Khronos APIs deliberately + keep the internal layout of images opaque, to allow proprietary layouts + and optimisations. This structure is not appropriate for describing + opaque layouts. */ + +/* We stick to standard C89 constructs for simplicity and portability. 
*/ + +#ifndef _KHR_DATA_FORMAT_H_ +#define _KHR_DATA_FORMAT_H_ + +/* Accessors */ +typedef enum _khr_word_e { + KHR_DF_WORD_VENDORID = 0U, + KHR_DF_WORD_DESCRIPTORTYPE = 0U, + KHR_DF_WORD_VERSIONNUMBER = 1U, + KHR_DF_WORD_DESCRIPTORBLOCKSIZE = 1U, + KHR_DF_WORD_MODEL = 2U, + KHR_DF_WORD_PRIMARIES = 2U, + KHR_DF_WORD_TRANSFER = 2U, + KHR_DF_WORD_FLAGS = 2U, + KHR_DF_WORD_TEXELBLOCKDIMENSION0 = 3U, + KHR_DF_WORD_TEXELBLOCKDIMENSION1 = 3U, + KHR_DF_WORD_TEXELBLOCKDIMENSION2 = 3U, + KHR_DF_WORD_TEXELBLOCKDIMENSION3 = 3U, + KHR_DF_WORD_BYTESPLANE0 = 4U, + KHR_DF_WORD_BYTESPLANE1 = 4U, + KHR_DF_WORD_BYTESPLANE2 = 4U, + KHR_DF_WORD_BYTESPLANE3 = 4U, + KHR_DF_WORD_BYTESPLANE4 = 5U, + KHR_DF_WORD_BYTESPLANE5 = 5U, + KHR_DF_WORD_BYTESPLANE6 = 5U, + KHR_DF_WORD_BYTESPLANE7 = 5U, + KHR_DF_WORD_SAMPLESTART = 6U, + KHR_DF_WORD_SAMPLEWORDS = 4U +} khr_df_word_e; + +typedef enum _khr_df_shift_e { + KHR_DF_SHIFT_VENDORID = 0U, + KHR_DF_SHIFT_DESCRIPTORTYPE = 17U, + KHR_DF_SHIFT_VERSIONNUMBER = 0U, + KHR_DF_SHIFT_DESCRIPTORBLOCKSIZE = 16U, + KHR_DF_SHIFT_MODEL = 0U, + KHR_DF_SHIFT_PRIMARIES = 8U, + KHR_DF_SHIFT_TRANSFER = 16U, + KHR_DF_SHIFT_FLAGS = 24U, + KHR_DF_SHIFT_TEXELBLOCKDIMENSION0 = 0U, + KHR_DF_SHIFT_TEXELBLOCKDIMENSION1 = 8U, + KHR_DF_SHIFT_TEXELBLOCKDIMENSION2 = 16U, + KHR_DF_SHIFT_TEXELBLOCKDIMENSION3 = 24U, + KHR_DF_SHIFT_BYTESPLANE0 = 0U, + KHR_DF_SHIFT_BYTESPLANE1 = 8U, + KHR_DF_SHIFT_BYTESPLANE2 = 16U, + KHR_DF_SHIFT_BYTESPLANE3 = 24U, + KHR_DF_SHIFT_BYTESPLANE4 = 0U, + KHR_DF_SHIFT_BYTESPLANE5 = 8U, + KHR_DF_SHIFT_BYTESPLANE6 = 16U, + KHR_DF_SHIFT_BYTESPLANE7 = 24U +} khr_df_shift_e; + +typedef enum _khr_df_mask_e { + KHR_DF_MASK_VENDORID = 0x1FFFFU, + KHR_DF_MASK_DESCRIPTORTYPE = 0x7FFFU, + KHR_DF_MASK_VERSIONNUMBER = 0xFFFFU, + KHR_DF_MASK_DESCRIPTORBLOCKSIZE = 0xFFFFU, + KHR_DF_MASK_MODEL = 0xFFU, + KHR_DF_MASK_PRIMARIES = 0xFFU, + KHR_DF_MASK_TRANSFER = 0xFFU, + KHR_DF_MASK_FLAGS = 0xFFU, + KHR_DF_MASK_TEXELBLOCKDIMENSION0 = 0xFFU, + 
KHR_DF_MASK_TEXELBLOCKDIMENSION1 = 0xFFU, + KHR_DF_MASK_TEXELBLOCKDIMENSION2 = 0xFFU, + KHR_DF_MASK_TEXELBLOCKDIMENSION3 = 0xFFU, + KHR_DF_MASK_BYTESPLANE0 = 0xFFU, + KHR_DF_MASK_BYTESPLANE1 = 0xFFU, + KHR_DF_MASK_BYTESPLANE2 = 0xFFU, + KHR_DF_MASK_BYTESPLANE3 = 0xFFU, + KHR_DF_MASK_BYTESPLANE4 = 0xFFU, + KHR_DF_MASK_BYTESPLANE5 = 0xFFU, + KHR_DF_MASK_BYTESPLANE6 = 0xFFU, + KHR_DF_MASK_BYTESPLANE7 = 0xFFU +} khr_df_mask_e; + +/* Helper macro: + Extract field X from basic descriptor block BDB */ +#define KHR_DFDVAL(BDB, X) \ + (((BDB)[KHR_DF_WORD_ ## X] >> (KHR_DF_SHIFT_ ## X)) \ + & (KHR_DF_MASK_ ## X)) + +/* Helper macro: + Set field X of basic descriptor block BDB */ +#define KHR_DFDSETVAL(BDB, X, val) \ + ((BDB)[KHR_DF_WORD_ ## X] = \ + ((BDB)[KHR_DF_WORD_ ## X] & \ + ~((KHR_DF_MASK_ ## X) << (KHR_DF_SHIFT_ ## X))) | \ + (((val) & (KHR_DF_MASK_ ## X)) << (KHR_DF_SHIFT_ ## X))) + +/* Offsets relative to the start of a sample */ +typedef enum _khr_df_sampleword_e { + KHR_DF_SAMPLEWORD_BITOFFSET = 0U, + KHR_DF_SAMPLEWORD_BITLENGTH = 0U, + KHR_DF_SAMPLEWORD_CHANNELID = 0U, + KHR_DF_SAMPLEWORD_QUALIFIERS = 0U, + KHR_DF_SAMPLEWORD_SAMPLEPOSITION0 = 1U, + KHR_DF_SAMPLEWORD_SAMPLEPOSITION1 = 1U, + KHR_DF_SAMPLEWORD_SAMPLEPOSITION2 = 1U, + KHR_DF_SAMPLEWORD_SAMPLEPOSITION3 = 1U, + KHR_DF_SAMPLEWORD_SAMPLEPOSITION_ALL = 1U, + KHR_DF_SAMPLEWORD_SAMPLELOWER = 2U, + KHR_DF_SAMPLEWORD_SAMPLEUPPER = 3U +} khr_df_sampleword_e; + +typedef enum _khr_df_sampleshift_e { + KHR_DF_SAMPLESHIFT_BITOFFSET = 0U, + KHR_DF_SAMPLESHIFT_BITLENGTH = 16U, + KHR_DF_SAMPLESHIFT_CHANNELID = 24U, + /* N.B. 
Qualifiers are defined as an offset into a byte */ + KHR_DF_SAMPLESHIFT_QUALIFIERS = 24U, + KHR_DF_SAMPLESHIFT_SAMPLEPOSITION0 = 0U, + KHR_DF_SAMPLESHIFT_SAMPLEPOSITION1 = 8U, + KHR_DF_SAMPLESHIFT_SAMPLEPOSITION2 = 16U, + KHR_DF_SAMPLESHIFT_SAMPLEPOSITION3 = 24U, + KHR_DF_SAMPLESHIFT_SAMPLEPOSITION_ALL = 0U, + KHR_DF_SAMPLESHIFT_SAMPLELOWER = 0U, + KHR_DF_SAMPLESHIFT_SAMPLEUPPER = 0U +} khr_df_sampleshift_e; + +typedef enum _khr_df_samplemask_e { + KHR_DF_SAMPLEMASK_BITOFFSET = 0xFFFFU, + KHR_DF_SAMPLEMASK_BITLENGTH = 0xFF, + KHR_DF_SAMPLEMASK_CHANNELID = 0xF, + /* N.B. Qualifiers are defined as an offset into a byte */ + KHR_DF_SAMPLEMASK_QUALIFIERS = 0xF0, + KHR_DF_SAMPLEMASK_SAMPLEPOSITION0 = 0xFF, + KHR_DF_SAMPLEMASK_SAMPLEPOSITION1 = 0xFF, + KHR_DF_SAMPLEMASK_SAMPLEPOSITION2 = 0xFF, + KHR_DF_SAMPLEMASK_SAMPLEPOSITION3 = 0xFF, + /* ISO C restricts enum values to range of int hence the + cast. We do it verbosely instead of using -1 to ensure + it is a 32-bit value even if int is 64 bits. 
*/ + KHR_DF_SAMPLEMASK_SAMPLEPOSITION_ALL = (int) 0xFFFFFFFFU, + KHR_DF_SAMPLEMASK_SAMPLELOWER = (int) 0xFFFFFFFFU, + KHR_DF_SAMPLEMASK_SAMPLEUPPER = (int) 0xFFFFFFFFU +} khr_df_samplemask_e; + +/* Helper macro: + Extract field X of sample S from basic descriptor block BDB */ +#define KHR_DFDSVAL(BDB, S, X) \ + (((BDB)[KHR_DF_WORD_SAMPLESTART + \ + ((S) * KHR_DF_WORD_SAMPLEWORDS) + \ + KHR_DF_SAMPLEWORD_ ## X] >> (KHR_DF_SAMPLESHIFT_ ## X)) \ + & (KHR_DF_SAMPLEMASK_ ## X)) + +/* Helper macro: + Set field X of sample S of basic descriptor block BDB */ +#define KHR_DFDSETSVAL(BDB, S, X, val) \ + ((BDB)[KHR_DF_WORD_SAMPLESTART + \ + ((S) * KHR_DF_WORD_SAMPLEWORDS) + \ + KHR_DF_SAMPLEWORD_ ## X] = \ + ((BDB)[KHR_DF_WORD_SAMPLESTART + \ + ((S) * KHR_DF_WORD_SAMPLEWORDS) + \ + KHR_DF_SAMPLEWORD_ ## X] & \ + ~((uint32_t)(KHR_DF_SAMPLEMASK_ ## X) << (KHR_DF_SAMPLESHIFT_ ## X))) | \ + (((val) & (uint32_t)(KHR_DF_SAMPLEMASK_ ## X)) << (KHR_DF_SAMPLESHIFT_ ## X))) + +/* Helper macro: + Number of samples in basic descriptor block BDB */ +#define KHR_DFDSAMPLECOUNT(BDB) \ + (((KHR_DFDVAL(BDB, DESCRIPTORBLOCKSIZE) >> 2) - \ + KHR_DF_WORD_SAMPLESTART) \ + / KHR_DF_WORD_SAMPLEWORDS) + +/* Helper macro: + Size in words of basic descriptor block for S samples */ +#define KHR_DFDSIZEWORDS(S) \ + (KHR_DF_WORD_SAMPLESTART + \ + (S) * KHR_DF_WORD_SAMPLEWORDS) + +/* Vendor ids */ +typedef enum _khr_df_vendorid_e { + /* Standard Khronos descriptor */ + KHR_DF_VENDORID_KHRONOS = 0U, + KHR_DF_VENDORID_MAX = 0x1FFFFU +} khr_df_vendorid_e; + +/* Descriptor types */ +typedef enum _khr_df_khr_descriptortype_e { + /* Default Khronos basic descriptor block */ + KHR_DF_KHR_DESCRIPTORTYPE_BASICFORMAT = 0U, + /* Extension descriptor block for additional planes */ + KHR_DF_KHR_DESCRIPTORTYPE_ADDITIONAL_PLANES = 0x6001U, + /* Extension descriptor block for additional dimensions */ + KHR_DF_KHR_DESCRIPTORTYPE_ADDITIONAL_DIMENSIONS = 0x6002U, + /* Bit indicates modifying requires understanding this 
extension */ + KHR_DF_KHR_DESCRIPTORTYPE_NEEDED_FOR_WRITE_BIT = 0x2000U, + /* Bit indicates processing requires understanding this extension */ + KHR_DF_KHR_DESCRIPTORTYPE_NEEDED_FOR_DECODE_BIT = 0x4000U, + KHR_DF_KHR_DESCRIPTORTYPE_MAX = 0x7FFFU +} khr_df_khr_descriptortype_e; + +/* Descriptor block version */ +typedef enum _khr_df_versionnumber_e { + /* Standard Khronos descriptor */ + KHR_DF_VERSIONNUMBER_1_0 = 0U, /* Version 1.0 of the specification */ + KHR_DF_VERSIONNUMBER_1_1 = 0U, /* Version 1.1 did not bump the version number */ + KHR_DF_VERSIONNUMBER_1_2 = 1U, /* Version 1.2 increased the version number */ + KHR_DF_VERSIONNUMBER_1_3 = 2U, /* Version 1.3 increased the version number */ + KHR_DF_VERSIONNUMBER_LATEST = KHR_DF_VERSIONNUMBER_1_3, + KHR_DF_VERSIONNUMBER_MAX = 0xFFFFU +} khr_df_versionnumber_e; + +/* Model in which the color coordinate space is defined. + There is no requirement that a color format use all the + channel types that are defined in the color model. */ +typedef enum _khr_df_model_e { + /* No interpretation of color channels defined */ + KHR_DF_MODEL_UNSPECIFIED = 0U, + /* Color primaries (red, green, blue) + alpha, depth and stencil */ + KHR_DF_MODEL_RGBSDA = 1U, + /* Color differences (Y', Cb, Cr) + alpha, depth and stencil */ + KHR_DF_MODEL_YUVSDA = 2U, + /* Color differences (Y', I, Q) + alpha, depth and stencil */ + KHR_DF_MODEL_YIQSDA = 3U, + /* Perceptual color (CIE L*a*b*) + alpha, depth and stencil */ + KHR_DF_MODEL_LABSDA = 4U, + /* Subtractive colors (cyan, magenta, yellow, black) + alpha */ + KHR_DF_MODEL_CMYKA = 5U, + /* Non-color coordinate data (X, Y, Z, W) */ + KHR_DF_MODEL_XYZW = 6U, + /* Hue, saturation, value, hue angle on color circle, plus alpha */ + KHR_DF_MODEL_HSVA_ANG = 7U, + /* Hue, saturation, lightness, hue angle on color circle, plus alpha */ + KHR_DF_MODEL_HSLA_ANG = 8U, + /* Hue, saturation, value, hue on color hexagon, plus alpha */ + KHR_DF_MODEL_HSVA_HEX = 9U, + /* Hue, saturation, lightness, hue on 
color hexagon, plus alpha */ + KHR_DF_MODEL_HSLA_HEX = 10U, + /* Lightweight approximate color difference (luma, orange, green) */ + KHR_DF_MODEL_YCGCOA = 11U, + /* ITU BT.2020 constant luminance YcCbcCrc */ + KHR_DF_MODEL_YCCBCCRC = 12U, + /* ITU BT.2100 constant intensity ICtCp */ + KHR_DF_MODEL_ICTCP = 13U, + /* CIE 1931 XYZ color coordinates (X, Y, Z) */ + KHR_DF_MODEL_CIEXYZ = 14U, + /* CIE 1931 xyY color coordinates (X, Y, Y) */ + KHR_DF_MODEL_CIEXYY = 15U, + + /* Compressed formats start at 128. */ + /* These compressed formats should generally have a single sample, + sited at the 0,0 position of the texel block. Where multiple + channels are used to distinguish formats, these should be cosited. */ + /* Direct3D (and S3) compressed formats */ + /* Note that premultiplied status is recorded separately */ + /* DXT1 "channels" are RGB (0), Alpha (1) */ + /* DXT1/BC1 with one channel is opaque */ + /* DXT1/BC1 with a cosited alpha sample is transparent */ + KHR_DF_MODEL_DXT1A = 128U, + KHR_DF_MODEL_BC1A = 128U, + /* DXT2/DXT3/BC2, with explicit 4-bit alpha */ + KHR_DF_MODEL_DXT2 = 129U, + KHR_DF_MODEL_DXT3 = 129U, + KHR_DF_MODEL_BC2 = 129U, + /* DXT4/DXT5/BC3, with interpolated alpha */ + KHR_DF_MODEL_DXT4 = 130U, + KHR_DF_MODEL_DXT5 = 130U, + KHR_DF_MODEL_BC3 = 130U, + /* BC4 - single channel interpolated 8-bit data */ + /* (The UNORM/SNORM variation is recorded in the channel data) */ + KHR_DF_MODEL_BC4 = 131U, + /* BC5 - two channel interpolated 8-bit data */ + /* (The UNORM/SNORM variation is recorded in the channel data) */ + KHR_DF_MODEL_BC5 = 132U, + /* BC6H - DX11 format for 16-bit float channels */ + KHR_DF_MODEL_BC6H = 133U, + /* BC7 - DX11 format */ + KHR_DF_MODEL_BC7 = 134U, + /* Gap left for future desktop expansion */ + + /* Mobile compressed formats follow */ + /* A format of ETC1 indicates that the format shall be decodable + by an ETC1-compliant decoder and not rely on ETC2 features */ + KHR_DF_MODEL_ETC1 = 160U, + /* A format of ETC2 is 
permitted to use ETC2 encodings on top of + the baseline ETC1 specification */ + /* The ETC2 format has channels "red", "green", "RGB" and "alpha", + which should be cosited samples */ + /* Punch-through alpha can be distinguished from full alpha by + the plane size in bytes required for the texel block */ + KHR_DF_MODEL_ETC2 = 161U, + /* Adaptive Scalable Texture Compression */ + /* ASTC HDR vs LDR is determined by the float flag in the channel */ + /* ASTC block size can be distinguished by texel block size */ + KHR_DF_MODEL_ASTC = 162U, + /* ETC1S is a simplified subset of ETC1 */ + KHR_DF_MODEL_ETC1S = 163U, + /* PowerVR Texture Compression */ + KHR_DF_MODEL_PVRTC = 164U, + KHR_DF_MODEL_PVRTC2 = 165U, + KHR_DF_MODEL_UASTC = 166U, + /* Proprietary formats (ATITC, etc.) should follow */ + KHR_DF_MODEL_MAX = 0xFFU +} khr_df_model_e; + +/* Definition of channel names for each color model */ +typedef enum _khr_df_model_channels_e { + /* Unspecified format with nominal channel numbering */ + KHR_DF_CHANNEL_UNSPECIFIED_0 = 0U, + KHR_DF_CHANNEL_UNSPECIFIED_1 = 1U, + KHR_DF_CHANNEL_UNSPECIFIED_2 = 2U, + KHR_DF_CHANNEL_UNSPECIFIED_3 = 3U, + KHR_DF_CHANNEL_UNSPECIFIED_4 = 4U, + KHR_DF_CHANNEL_UNSPECIFIED_5 = 5U, + KHR_DF_CHANNEL_UNSPECIFIED_6 = 6U, + KHR_DF_CHANNEL_UNSPECIFIED_7 = 7U, + KHR_DF_CHANNEL_UNSPECIFIED_8 = 8U, + KHR_DF_CHANNEL_UNSPECIFIED_9 = 9U, + KHR_DF_CHANNEL_UNSPECIFIED_10 = 10U, + KHR_DF_CHANNEL_UNSPECIFIED_11 = 11U, + KHR_DF_CHANNEL_UNSPECIFIED_12 = 12U, + KHR_DF_CHANNEL_UNSPECIFIED_13 = 13U, + KHR_DF_CHANNEL_UNSPECIFIED_14 = 14U, + KHR_DF_CHANNEL_UNSPECIFIED_15 = 15U, + /* MODEL_RGBSDA - red, green, blue, stencil, depth, alpha */ + KHR_DF_CHANNEL_RGBSDA_RED = 0U, + KHR_DF_CHANNEL_RGBSDA_R = 0U, + KHR_DF_CHANNEL_RGBSDA_GREEN = 1U, + KHR_DF_CHANNEL_RGBSDA_G = 1U, + KHR_DF_CHANNEL_RGBSDA_BLUE = 2U, + KHR_DF_CHANNEL_RGBSDA_B = 2U, + KHR_DF_CHANNEL_RGBSDA_STENCIL = 13U, + KHR_DF_CHANNEL_RGBSDA_S = 13U, + KHR_DF_CHANNEL_RGBSDA_DEPTH = 14U, + 
KHR_DF_CHANNEL_RGBSDA_D = 14U, + KHR_DF_CHANNEL_RGBSDA_ALPHA = 15U, + KHR_DF_CHANNEL_RGBSDA_A = 15U, + /* MODEL_YUVSDA - luma, Cb, Cr, stencil, depth, alpha */ + KHR_DF_CHANNEL_YUVSDA_Y = 0U, + KHR_DF_CHANNEL_YUVSDA_CB = 1U, + KHR_DF_CHANNEL_YUVSDA_U = 1U, + KHR_DF_CHANNEL_YUVSDA_CR = 2U, + KHR_DF_CHANNEL_YUVSDA_V = 2U, + KHR_DF_CHANNEL_YUVSDA_STENCIL = 13U, + KHR_DF_CHANNEL_YUVSDA_S = 13U, + KHR_DF_CHANNEL_YUVSDA_DEPTH = 14U, + KHR_DF_CHANNEL_YUVSDA_D = 14U, + KHR_DF_CHANNEL_YUVSDA_ALPHA = 15U, + KHR_DF_CHANNEL_YUVSDA_A = 15U, + /* MODEL_YIQSDA - luma, in-phase, quadrature, stencil, depth, alpha */ + KHR_DF_CHANNEL_YIQSDA_Y = 0U, + KHR_DF_CHANNEL_YIQSDA_I = 1U, + KHR_DF_CHANNEL_YIQSDA_Q = 2U, + KHR_DF_CHANNEL_YIQSDA_STENCIL = 13U, + KHR_DF_CHANNEL_YIQSDA_S = 13U, + KHR_DF_CHANNEL_YIQSDA_DEPTH = 14U, + KHR_DF_CHANNEL_YIQSDA_D = 14U, + KHR_DF_CHANNEL_YIQSDA_ALPHA = 15U, + KHR_DF_CHANNEL_YIQSDA_A = 15U, + /* MODEL_LABSDA - CIELAB/L*a*b* luma, red-green, blue-yellow, stencil, depth, alpha */ + KHR_DF_CHANNEL_LABSDA_L = 0U, + KHR_DF_CHANNEL_LABSDA_A = 1U, + KHR_DF_CHANNEL_LABSDA_B = 2U, + KHR_DF_CHANNEL_LABSDA_STENCIL = 13U, + KHR_DF_CHANNEL_LABSDA_S = 13U, + KHR_DF_CHANNEL_LABSDA_DEPTH = 14U, + KHR_DF_CHANNEL_LABSDA_D = 14U, + KHR_DF_CHANNEL_LABSDA_ALPHA = 15U, + /* NOTE: KHR_DF_CHANNEL_LABSDA_A is not a synonym for alpha! 
*/ + /* MODEL_CMYKA - cyan, magenta, yellow, key/blacK, alpha */ + KHR_DF_CHANNEL_CMYKSDA_CYAN = 0U, + KHR_DF_CHANNEL_CMYKSDA_C = 0U, + KHR_DF_CHANNEL_CMYKSDA_MAGENTA = 1U, + KHR_DF_CHANNEL_CMYKSDA_M = 1U, + KHR_DF_CHANNEL_CMYKSDA_YELLOW = 2U, + KHR_DF_CHANNEL_CMYKSDA_Y = 2U, + KHR_DF_CHANNEL_CMYKSDA_KEY = 3U, + KHR_DF_CHANNEL_CMYKSDA_BLACK = 3U, + KHR_DF_CHANNEL_CMYKSDA_K = 3U, + KHR_DF_CHANNEL_CMYKSDA_ALPHA = 15U, + KHR_DF_CHANNEL_CMYKSDA_A = 15U, + /* MODEL_XYZW - coordinates x, y, z, w */ + KHR_DF_CHANNEL_XYZW_X = 0U, + KHR_DF_CHANNEL_XYZW_Y = 1U, + KHR_DF_CHANNEL_XYZW_Z = 2U, + KHR_DF_CHANNEL_XYZW_W = 3U, + /* MODEL_HSVA_ANG - value (luma), saturation, hue, alpha, angular projection, conical space */ + KHR_DF_CHANNEL_HSVA_ANG_VALUE = 0U, + KHR_DF_CHANNEL_HSVA_ANG_V = 0U, + KHR_DF_CHANNEL_HSVA_ANG_SATURATION = 1U, + KHR_DF_CHANNEL_HSVA_ANG_S = 1U, + KHR_DF_CHANNEL_HSVA_ANG_HUE = 2U, + KHR_DF_CHANNEL_HSVA_ANG_H = 2U, + KHR_DF_CHANNEL_HSVA_ANG_ALPHA = 15U, + KHR_DF_CHANNEL_HSVA_ANG_A = 15U, + /* MODEL_HSLA_ANG - lightness (luma), saturation, hue, alpha, angular projection, double conical space */ + KHR_DF_CHANNEL_HSLA_ANG_LIGHTNESS = 0U, + KHR_DF_CHANNEL_HSLA_ANG_L = 0U, + KHR_DF_CHANNEL_HSLA_ANG_SATURATION = 1U, + KHR_DF_CHANNEL_HSLA_ANG_S = 1U, + KHR_DF_CHANNEL_HSLA_ANG_HUE = 2U, + KHR_DF_CHANNEL_HSLA_ANG_H = 2U, + KHR_DF_CHANNEL_HSLA_ANG_ALPHA = 15U, + KHR_DF_CHANNEL_HSLA_ANG_A = 15U, + /* MODEL_HSVA_HEX - value (luma), saturation, hue, alpha, hexagonal projection, conical space */ + KHR_DF_CHANNEL_HSVA_HEX_VALUE = 0U, + KHR_DF_CHANNEL_HSVA_HEX_V = 0U, + KHR_DF_CHANNEL_HSVA_HEX_SATURATION = 1U, + KHR_DF_CHANNEL_HSVA_HEX_S = 1U, + KHR_DF_CHANNEL_HSVA_HEX_HUE = 2U, + KHR_DF_CHANNEL_HSVA_HEX_H = 2U, + KHR_DF_CHANNEL_HSVA_HEX_ALPHA = 15U, + KHR_DF_CHANNEL_HSVA_HEX_A = 15U, + /* MODEL_HSLA_HEX - lightness (luma), saturation, hue, alpha, hexagonal projection, double conical space */ + KHR_DF_CHANNEL_HSLA_HEX_LIGHTNESS = 0U, + KHR_DF_CHANNEL_HSLA_HEX_L = 0U, + 
KHR_DF_CHANNEL_HSLA_HEX_SATURATION = 1U, + KHR_DF_CHANNEL_HSLA_HEX_S = 1U, + KHR_DF_CHANNEL_HSLA_HEX_HUE = 2U, + KHR_DF_CHANNEL_HSLA_HEX_H = 2U, + KHR_DF_CHANNEL_HSLA_HEX_ALPHA = 15U, + KHR_DF_CHANNEL_HSLA_HEX_A = 15U, + /* MODEL_YCGCOA - luma, green delta, orange delta, alpha */ + KHR_DF_CHANNEL_YCGCOA_Y = 0U, + KHR_DF_CHANNEL_YCGCOA_CG = 1U, + KHR_DF_CHANNEL_YCGCOA_CO = 2U, + KHR_DF_CHANNEL_YCGCOA_ALPHA = 15U, + KHR_DF_CHANNEL_YCGCOA_A = 15U, + /* MODEL_CIEXYZ - CIE 1931 X, Y, Z */ + KHR_DF_CHANNEL_CIEXYZ_X = 0U, + KHR_DF_CHANNEL_CIEXYZ_Y = 1U, + KHR_DF_CHANNEL_CIEXYZ_Z = 2U, + /* MODEL_CIEXYY - CIE 1931 x, y, Y */ + KHR_DF_CHANNEL_CIEXYY_X = 0U, + KHR_DF_CHANNEL_CIEXYY_YCHROMA = 1U, + KHR_DF_CHANNEL_CIEXYY_YLUMA = 2U, + + /* Compressed formats */ + /* MODEL_DXT1A/MODEL_BC1A */ + KHR_DF_CHANNEL_DXT1A_COLOR = 0U, + KHR_DF_CHANNEL_BC1A_COLOR = 0U, + KHR_DF_CHANNEL_DXT1A_ALPHAPRESENT = 1U, + KHR_DF_CHANNEL_DXT1A_ALPHA = 1U, + KHR_DF_CHANNEL_BC1A_ALPHAPRESENT = 1U, + KHR_DF_CHANNEL_BC1A_ALPHA = 1U, + /* MODEL_DXT2/3/MODEL_BC2 */ + KHR_DF_CHANNEL_DXT2_COLOR = 0U, + KHR_DF_CHANNEL_DXT3_COLOR = 0U, + KHR_DF_CHANNEL_BC2_COLOR = 0U, + KHR_DF_CHANNEL_DXT2_ALPHA = 15U, + KHR_DF_CHANNEL_DXT3_ALPHA = 15U, + KHR_DF_CHANNEL_BC2_ALPHA = 15U, + /* MODEL_DXT4/5/MODEL_BC3 */ + KHR_DF_CHANNEL_DXT4_COLOR = 0U, + KHR_DF_CHANNEL_DXT5_COLOR = 0U, + KHR_DF_CHANNEL_BC3_COLOR = 0U, + KHR_DF_CHANNEL_DXT4_ALPHA = 15U, + KHR_DF_CHANNEL_DXT5_ALPHA = 15U, + KHR_DF_CHANNEL_BC3_ALPHA = 15U, + /* MODEL_BC4 */ + KHR_DF_CHANNEL_BC4_DATA = 0U, + /* MODEL_BC5 */ + KHR_DF_CHANNEL_BC5_RED = 0U, + KHR_DF_CHANNEL_BC5_R = 0U, + KHR_DF_CHANNEL_BC5_GREEN = 1U, + KHR_DF_CHANNEL_BC5_G = 1U, + /* MODEL_BC6H */ + KHR_DF_CHANNEL_BC6H_COLOR = 0U, + KHR_DF_CHANNEL_BC6H_DATA = 0U, + /* MODEL_BC7 */ + KHR_DF_CHANNEL_BC7_DATA = 0U, + KHR_DF_CHANNEL_BC7_COLOR = 0U, + /* MODEL_ETC1 */ + KHR_DF_CHANNEL_ETC1_DATA = 0U, + KHR_DF_CHANNEL_ETC1_COLOR = 0U, + /* MODEL_ETC2 */ + KHR_DF_CHANNEL_ETC2_RED = 0U, + 
KHR_DF_CHANNEL_ETC2_R = 0U, + KHR_DF_CHANNEL_ETC2_GREEN = 1U, + KHR_DF_CHANNEL_ETC2_G = 1U, + KHR_DF_CHANNEL_ETC2_COLOR = 2U, + KHR_DF_CHANNEL_ETC2_ALPHA = 15U, + KHR_DF_CHANNEL_ETC2_A = 15U, + /* MODEL_ASTC */ + KHR_DF_CHANNEL_ASTC_DATA = 0U, + /* MODEL_ETC1S */ + KHR_DF_CHANNEL_ETC1S_RGB = 0U, + KHR_DF_CHANNEL_ETC1S_RRR = 3U, + KHR_DF_CHANNEL_ETC1S_GGG = 4U, + KHR_DF_CHANNEL_ETC1S_AAA = 15U, + /* MODEL_PVRTC */ + KHR_DF_CHANNEL_PVRTC_DATA = 0U, + KHR_DF_CHANNEL_PVRTC_COLOR = 0U, + /* MODEL_PVRTC2 */ + KHR_DF_CHANNEL_PVRTC2_DATA = 0U, + KHR_DF_CHANNEL_PVRTC2_COLOR = 0U, + /* MODEL UASTC */ + KHR_DF_CHANNEL_UASTC_DATA = 0U, + KHR_DF_CHANNEL_UASTC_RGB = 0U, + KHR_DF_CHANNEL_UASTC_RGBA = 3U, + KHR_DF_CHANNEL_UASTC_RRR = 4U, + KHR_DF_CHANNEL_UASTC_RRRG = 5U, + KHR_DF_CHANNEL_UASTC_RG = 6U, + + /* Common channel names shared by multiple formats */ + KHR_DF_CHANNEL_COMMON_LUMA = 0U, + KHR_DF_CHANNEL_COMMON_L = 0U, + KHR_DF_CHANNEL_COMMON_STENCIL = 13U, + KHR_DF_CHANNEL_COMMON_S = 13U, + KHR_DF_CHANNEL_COMMON_DEPTH = 14U, + KHR_DF_CHANNEL_COMMON_D = 14U, + KHR_DF_CHANNEL_COMMON_ALPHA = 15U, + KHR_DF_CHANNEL_COMMON_A = 15U +} khr_df_model_channels_e; + +/* Definition of the primary colors in color coordinates. + This is implicitly responsible for defining the conversion + between RGB an YUV color spaces. + LAB and related absolute color models should use + KHR_DF_PRIMARIES_CIEXYZ. 
*/ +typedef enum _khr_df_primaries_e { + /* No color primaries defined */ + KHR_DF_PRIMARIES_UNSPECIFIED = 0U, + /* Color primaries of ITU-R BT.709 and sRGB */ + KHR_DF_PRIMARIES_BT709 = 1U, + /* Synonym for KHR_DF_PRIMARIES_BT709 */ + KHR_DF_PRIMARIES_SRGB = 1U, + /* Color primaries of ITU-R BT.601 (625-line EBU variant) */ + KHR_DF_PRIMARIES_BT601_EBU = 2U, + /* Color primaries of ITU-R BT.601 (525-line SMPTE C variant) */ + KHR_DF_PRIMARIES_BT601_SMPTE = 3U, + /* Color primaries of ITU-R BT.2020 */ + KHR_DF_PRIMARIES_BT2020 = 4U, + /* CIE theoretical color coordinate space */ + KHR_DF_PRIMARIES_CIEXYZ = 5U, + /* Academy Color Encoding System primaries */ + KHR_DF_PRIMARIES_ACES = 6U, + /* Color primaries of ACEScc */ + KHR_DF_PRIMARIES_ACESCC = 7U, + /* Legacy NTSC 1953 primaries */ + KHR_DF_PRIMARIES_NTSC1953 = 8U, + /* Legacy PAL 525-line primaries */ + KHR_DF_PRIMARIES_PAL525 = 9U, + /* Color primaries of Display P3 */ + KHR_DF_PRIMARIES_DISPLAYP3 = 10U, + /* Color primaries of Adobe RGB (1998) */ + KHR_DF_PRIMARIES_ADOBERGB = 11U, + KHR_DF_PRIMARIES_MAX = 0xFFU +} khr_df_primaries_e; + +/* Definition of the optical to digital transfer function + ("gamma correction"). Most transfer functions are not a pure + power function and also include a linear element. + LAB and related absolute color representations should use + KHR_DF_TRANSFER_UNSPECIFIED. 
*/ +typedef enum _khr_df_transfer_e { + /* No transfer function defined */ + KHR_DF_TRANSFER_UNSPECIFIED = 0U, + /* Linear transfer function (value proportional to intensity) */ + KHR_DF_TRANSFER_LINEAR = 1U, + /* Perceptually-linear transfer function of sRGB (~2.4) */ + KHR_DF_TRANSFER_SRGB = 2U, + /* Perceptually-linear transfer function of ITU BT.601, BT.709 and BT.2020 (~1/.45) */ + KHR_DF_TRANSFER_ITU = 3U, + /* SMTPE170M (digital NTSC) defines an alias for the ITU transfer function (~1/.45) */ + KHR_DF_TRANSFER_SMTPE170M = 3U, + /* Perceptually-linear gamma function of original NTSC (simple 2.2 gamma) */ + KHR_DF_TRANSFER_NTSC = 4U, + /* Sony S-log used by Sony video cameras */ + KHR_DF_TRANSFER_SLOG = 5U, + /* Sony S-log 2 used by Sony video cameras */ + KHR_DF_TRANSFER_SLOG2 = 6U, + /* ITU BT.1886 EOTF */ + KHR_DF_TRANSFER_BT1886 = 7U, + /* ITU BT.2100 HLG OETF */ + KHR_DF_TRANSFER_HLG_OETF = 8U, + /* ITU BT.2100 HLG EOTF */ + KHR_DF_TRANSFER_HLG_EOTF = 9U, + /* ITU BT.2100 PQ EOTF */ + KHR_DF_TRANSFER_PQ_EOTF = 10U, + /* ITU BT.2100 PQ OETF */ + KHR_DF_TRANSFER_PQ_OETF = 11U, + /* DCI P3 transfer function */ + KHR_DF_TRANSFER_DCIP3 = 12U, + /* Legacy PAL OETF */ + KHR_DF_TRANSFER_PAL_OETF = 13U, + /* Legacy PAL 625-line EOTF */ + KHR_DF_TRANSFER_PAL625_EOTF = 14U, + /* Legacy ST240 transfer function */ + KHR_DF_TRANSFER_ST240 = 15U, + /* ACEScc transfer function */ + KHR_DF_TRANSFER_ACESCC = 16U, + /* ACEScct transfer function */ + KHR_DF_TRANSFER_ACESCCT = 17U, + /* Adobe RGB (1998) transfer function */ + KHR_DF_TRANSFER_ADOBERGB = 18U, + KHR_DF_TRANSFER_MAX = 0xFFU +} khr_df_transfer_e; + +typedef enum _khr_df_flags_e { + KHR_DF_FLAG_ALPHA_STRAIGHT = 0U, + KHR_DF_FLAG_ALPHA_PREMULTIPLIED = 1U +} khr_df_flags_e; + +typedef enum _khr_df_sample_datatype_qualifiers_e { + KHR_DF_SAMPLE_DATATYPE_LINEAR = 1U << 4U, + KHR_DF_SAMPLE_DATATYPE_EXPONENT = 1U << 5U, + KHR_DF_SAMPLE_DATATYPE_SIGNED = 1U << 6U, + KHR_DF_SAMPLE_DATATYPE_FLOAT = 1U << 7U +}
khr_df_sample_datatype_qualifiers_e; + +#endif diff --git a/Source/Main.cpp b/Source/Main.cpp new file mode 100644 index 0000000..6170b1e --- /dev/null +++ b/Source/Main.cpp @@ -0,0 +1,619 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "stb_image_resize.h" +#include "stb_image.h" +#include "dfd.h" +#include "ispc_texcomp/ispc_texcomp.h" +#include "HalfFloat.h" + +const std::vector formatOrder = { + "BC1", + "BC1_SRGB", + "BC4", + "BC5", + "BC3", + "BC3_SRGB", + "BC6H", + "BC7", + "BC7_SRGB" +}; + +const std::map> formats = { + {"BC1", {"(DXT1) 5:6:5 Color, 1 bit alpha. 8 bytes per block.", 8, vk::Format::eBc1RgbUnormBlock}}, + {"BC1_SRGB", {"(DXT1) 5:6:5 Color, 1 bit alpha. 8 bytes per block.", 8, vk::Format::eBc1RgbSrgbBlock}}, + {"BC4", {"Greyscale, 8 bytes per block.", 8, vk::Format::eBc4UnormBlock}}, + {"BC5", {"2x BC4 images. 16 bytes per block.", 16, vk::Format::eBc5UnormBlock}}, + {"BC3", {"(DXT5) BC1 Color, BC4 Alpha, 16 bytes per block.", 16, vk::Format::eBc3UnormBlock}}, + {"BC3_SRGB", {"(DXT5) BC1 Color, BC4 Alpha, 16 bytes per block.", 16, vk::Format::eBc3SrgbBlock}}, + {"BC6H", {"16 bit RGB, no alpha. Signed. 16 bytes per block.", 16, vk::Format::eBc6HUfloatBlock}}, + {"BC7", {"8 bit RGBA - Good general purpose. 16 bytes per block.", 16, vk::Format::eBc7UnormBlock}}, + {"BC7_SRGB", {"8 bit RGBA - Good general purpose. 16 bytes per block.", 16, vk::Format::eBc7SrgbBlock}} +}; + +const std::string usage = "Usage: TextureConverter [cube|array] [input2, input3...] 
[fast|slow|veryslow]"; + +int main(int argc, char ** argv) +{ + ISPCInit(); + + if (argc < 4) { + std::cout << usage << std::endl; + std::cout << "Formats:" << std::endl; + for (auto & formatName : formatOrder) { + auto format = formats.at(formatName); + std::cout << " " << formatName << " - " << std::get<0>(format) << std::endl; + } + return 1; + } + + int numInputs = argc - 3; + unsigned int inputsStart = 1; + std::vector inputs; + std::string output; + std::string option(argv[1]); + if (option == "cube" || option == "array") { + numInputs -= 1; + inputsStart += 1; + } else { + option = "none"; + } + + std::string speed(argv[argc - 1]); + std::string formatString; + bool fastMode = true; + bool verySlow = false; + if (speed == "fast" || speed == "slow" || speed == "veryslow") { + formatString = std::string(argv[argc - 2]); + numInputs -= 1; + + if (speed == "slow") { + fastMode = false; + } else if (speed == "veryslow") { + fastMode = false; + verySlow = true; + } + } else { + formatString = std::string(argv[argc - 1]); + } + + if (numInputs < 1) { + std::cout << usage << std::endl; + return 1; + } + + if (option == "cube" && numInputs != 6) { + std::cout << "Cube maps must have 6 inputs." << std::endl; + return 1; + } + + if (option == "array" && numInputs < 2) { + std::cout << "Array maps must have at least 2 inputs." 
<< std::endl; + return 1; + } + + if (formats.find(formatString) == formats.end()) { + std::cout << "Invalid format: " << formatString << std::endl; + std::cout << usage << std::endl; + std::cout << "Formats:" << std::endl; + for (auto & formatName : formatOrder) { + auto format = formats.at(formatName); + std::cout << " " << formatName << " - " << std::get<0>(format) << std::endl; + } + return 1; + } + + /* Check if it ends in SRGB */ + bool srgb = false; + if (formatString.length() > 5 && formatString.substr(formatString.length() - 5, 5) == "_SRGB") { + srgb = true; + } + + bool hdr = false; + if (formatString.substr(0, 3) == "BC6") { + hdr = true; + } + + for (int i = inputsStart; i < (int)(inputsStart + numInputs); i++) { + inputs.push_back(std::string(argv[i])); + } + + output = std::string(argv[inputsStart + numInputs]); + + /* Print inputs and output */ + std::cout << "Inputs: " << std::endl; + for (auto & input : inputs) { + std::cout << " " << input << std::endl; + } + std::cout << "Output: " << output << std::endl; + std::cout << "Format: " << formatString << std::endl; + std::cout << "Speed: " << (fastMode ? "Fast" : verySlow ? 
"Very slow" : "Slow") << std::endl; + + int isa; + isa = ISPCIsa(); + + std::string isaName; + switch(isa) { + case 0: + isaName = "SSE2"; + break; + case 1: + isaName = "SSE4"; + break; + case 2: + isaName = "AVX2"; + break; + default: + isaName = "Unknown"; + }; + + std::cout << "ISPC ISA: " << isaName << std::endl; + + unsigned char * ldrBufferA, * ldrBufferB, * ldrBufferMain, * ldrBufferOther; + float * hdrBufferA, * hdrBufferB, * hdrBufferMain, * hdrBufferOther; + int width, height, channels; + + int copyChannels = 4; + int forcedChannels = 4; + if (formatString == "BC4") { + copyChannels = 1; + } else if (formatString == "BC5") { + copyChannels = 2; + } + + std::vector>> ldrLevels; + std::vector>> hdrLevels; + + std::vector>>> ldrLevelBlocks; + std::vector>>> hdrLevelBlocks; + + if (hdr) { + hdrLevels.resize(numInputs); + hdrLevelBlocks.resize(numInputs); + } else { + ldrLevels.resize(numInputs); + ldrLevelBlocks.resize(numInputs); + } + + uint32_t levelCount; + + for (int input = 0; input < numInputs; input++) { + int level = 0; + + std::cout << "Loading/scaling " << input << ": " << inputs[input] << std::endl; + + if (hdr) { + hdrBufferA = stbi_loadf(inputs[input].c_str(), &width, &height, &channels, forcedChannels); + if (hdrBufferA == nullptr) { + std::cout << "Failed to load image: " << inputs[input] << std::endl; + return 1; + } + + hdrBufferB = new float[width * height * forcedChannels]; + hdrBufferMain = hdrBufferA; + hdrBufferOther = hdrBufferB; + } else { + ldrBufferA = stbi_load(inputs[input].c_str(), &width, &height, &channels, forcedChannels); + if (ldrBufferA == nullptr) { + std::cout << "Failed to load image: " << inputs[input] << std::endl; + return 1; + } + + ldrBufferB = new unsigned char[width * height * forcedChannels]; + ldrBufferMain = ldrBufferA; + ldrBufferOther = ldrBufferB; + } + + int oldWidth = width; + int oldHeight = height; + + levelCount = 1; + { + int levelWidth = width; + int levelHeight = height; + while (levelWidth > 1 || 
levelHeight > 1) { + levelWidth = std::max(1, (int)floorf((float)levelWidth / 2)); + levelHeight = std::max(1, (int)floorf((float)levelHeight / 2)); + levelCount++; + } + } + + while(1) { + if (hdr) { + hdrLevels[input].push_back(std::vector(hdrBufferMain, hdrBufferMain + oldWidth * oldHeight * forcedChannels)); + } else { + ldrLevels[input].push_back(std::vector(ldrBufferMain, ldrBufferMain + oldWidth * oldHeight * forcedChannels)); + } + + if (oldWidth == 1 && oldHeight == 1) { + break; + } + + int newWidth = std::max(1, (int)floorf((float)oldWidth / 2)); + int newHeight = std::max(1, (int)floorf((float)oldHeight / 2)); + + stbir_colorspace colorspace = srgb ? STBIR_COLORSPACE_SRGB : STBIR_COLORSPACE_LINEAR; + int alphaChannel = channels == 4 ? 3 : STBIR_ALPHA_CHANNEL_NONE; + + if (hdr) { + int rv = stbir_resize_float_generic(hdrBufferMain, oldWidth, oldHeight, 0, hdrBufferOther, newWidth, newHeight, 0, forcedChannels, alphaChannel, 0, STBIR_EDGE_CLAMP, STBIR_FILTER_MITCHELL, colorspace, nullptr); + if (rv != 1) { + std::cerr << "Error resizing" << std::endl; + } + std::swap(hdrBufferMain, hdrBufferOther); + } else { + int rv = stbir_resize_uint8_generic(ldrBufferMain, oldWidth, oldHeight, 0, ldrBufferOther, newWidth, newHeight, 0, forcedChannels, alphaChannel, 0, STBIR_EDGE_CLAMP, STBIR_FILTER_MITCHELL, colorspace, nullptr); + if (rv != 1) { + std::cerr << "Error resizing" << std::endl; + } + std::swap(ldrBufferMain, ldrBufferOther); + } + + oldWidth = newWidth; + oldHeight = newHeight; + level++; + } + + oldWidth = width; + oldHeight = height; + level = 0; + if (hdr) { + free(hdrBufferA); + delete[] hdrBufferB; + } else { + free(ldrBufferA); + delete[] ldrBufferB; + } + } + + bc6h_enc_settings bc6henc; + bc7_enc_settings bc7enc; + + if (formatString == "BC6H") { + if (fastMode) { + GetProfile_bc6h_basic(&bc6henc); + } else { + if (verySlow) { + GetProfile_bc6h_veryslow(&bc6henc); + } else { + GetProfile_bc6h_slow(&bc6henc); + } + } + } else { + if (channels == 
3) { + if (fastMode) { + GetProfile_basic(&bc7enc); + } else { + GetProfile_slow(&bc7enc); + } + } else { + if (fastMode) { + GetProfile_alpha_basic(&bc7enc); + } else { + GetProfile_alpha_slow(&bc7enc); + } + } + } + + std::vector>> levelBlocksCompressed(numInputs); + std::tuple format = formats.find(formatString)->second; + size_t blockSize = std::get<1>(format); + + for (int input = 0; input < numInputs; input++) { + if (numInputs > 1) { + if (option == "cube") { + std::cout << "Face " << input << std::endl; + } else { + std::cout << "Layer " << input << std::endl; + } + } + + if (hdr) { + hdrLevelBlocks[input].resize(levelCount); + } else { + ldrLevelBlocks[input].resize(levelCount); + } + int level = 0; + + int oldWidth = width; + int oldHeight = height; + + while(1) { + unsigned int blocksWidth = (oldWidth + 3) / 4; + unsigned int blocksHeight = (oldHeight + 3) / 4; + + if (hdr) { + hdrLevelBlocks[input][level].resize(blocksWidth * blocksHeight); + + for (unsigned int y = 0; y < blocksHeight; y++) { + for (unsigned int x = 0; x < blocksWidth; x++) { + std::vector block(16 * copyChannels); + + for (unsigned int pixelY = y * 4; pixelY < y * 4 + 4; pixelY++) { + for (unsigned int pixelX = x * 4; pixelX < x * 4 + 4; pixelX++) { + unsigned int clampedY = std::min(pixelY, (unsigned int)oldHeight - 1); + unsigned int clampedX = std::min(pixelX, (unsigned int)oldWidth - 1); + + for (int channel = 0; channel < copyChannels; channel++) { + float value = hdrLevels[input][level][(clampedY * oldWidth + clampedX) * forcedChannels + channel]; + if (value < 0.0f) { + value = 0.0f; + } + + if (value > 65504.0f) { + value = 65504.0f; + } + + block[((pixelY % 4) * 4 + (pixelX % 4)) * copyChannels + channel] = HalfFloat::FromFloat(value); + } + } + } + + hdrLevelBlocks[input][level][blocksWidth * y + x] = block; + } + } + } else { + ldrLevelBlocks[input][level].resize(blocksWidth * blocksHeight); + + for (unsigned int y = 0; y < blocksHeight; y++) { + for (unsigned int x = 0; x 
< blocksWidth; x++) { + std::vector block(16 * copyChannels); + + for (unsigned int pixelY = y * 4; pixelY < y * 4 + 4; pixelY++) { + for (unsigned int pixelX = x * 4; pixelX < x * 4 + 4; pixelX++) { + unsigned int clampedY = std::min(pixelY, (unsigned int)oldHeight - 1); + unsigned int clampedX = std::min(pixelX, (unsigned int)oldWidth - 1); + + for (int channel = 0; channel < copyChannels; channel++) { + block[((pixelY % 4) * 4 + (pixelX % 4)) * copyChannels + channel] = ldrLevels[input][level][(clampedY * oldWidth + clampedX) * forcedChannels + channel]; + } + } + } + + ldrLevelBlocks[input][level][blocksWidth * y + x] = block; + } + } + } + + if (oldWidth == 1 && oldHeight == 1) { + break; + } + + oldWidth = std::max(1, (int)floorf((float)oldWidth / 2)); + oldHeight = std::max(1, (int)floorf((float)oldHeight / 2)); + level++; + } + + + /* Compress */ + levelBlocksCompressed[input].resize(levelCount); + for (unsigned int l = 0; l < levelCount; l++) { + if (hdr) { + levelBlocksCompressed[input][l].resize(hdrLevelBlocks[input][l].size() * blockSize); + } else { + levelBlocksCompressed[input][l].resize(ldrLevelBlocks[input][l].size() * blockSize); + } + } + + std::mutex mutex; + std::vector completedBlocks(levelCount, 0); + + unsigned int numThreads = std::thread::hardware_concurrency(); + std::vector threads; + unsigned int maxLevel = 0; + for (unsigned t = 0; t < numThreads; t++) { + threads.push_back(std::thread([&, t](){ + for (unsigned int l = 0; l < levelCount; l++) { + unsigned int blocksPerThread; + if (hdr) { + blocksPerThread = hdrLevelBlocks[input][l].size() / numThreads; + } else { + blocksPerThread = ldrLevelBlocks[input][l].size() / numThreads; + } + unsigned int startBlock = t * blocksPerThread; + unsigned int endBlock = startBlock + blocksPerThread; + + if (hdr) { + if (t == numThreads - 1) { + endBlock = hdrLevelBlocks[input][l].size(); + } + } else { + if (t == numThreads - 1) { + endBlock = ldrLevelBlocks[input][l].size(); + } + } + + for 
(unsigned int b = startBlock; b < endBlock; b++) { + if (formatString == "BC6H") { + rgba_surface surface; + surface.ptr = (uint8_t *)hdrLevelBlocks[input][l][b].data(); + surface.width = 4; + surface.height = 4; + surface.stride = copyChannels * 4 * 2; + + CompressBlocksBC6H(&surface, &levelBlocksCompressed[input][l][b * blockSize], &bc6henc); + } else { + rgba_surface surface; + surface.ptr = ldrLevelBlocks[input][l][b].data(); + surface.width = 4; + surface.height = 4; + surface.stride = copyChannels * 4; + + if (formatString == "BC1" || formatString == "BC1_SRGB") { + CompressBlocksBC1(&surface, &levelBlocksCompressed[input][l][b * blockSize]); + } else if (formatString == "BC3" || formatString == "BC3_SRGB") { + CompressBlocksBC3(&surface, &levelBlocksCompressed[input][l][b * blockSize]); + } else if (formatString == "BC4") { + CompressBlocksBC4(&surface, &levelBlocksCompressed[input][l][b * blockSize]); + } else if (formatString == "BC5") { + CompressBlocksBC5(&surface, &levelBlocksCompressed[input][l][b * blockSize]); + } else if (formatString == "BC7" || formatString == "BC7_SRGB") { + CompressBlocksBC7(&surface, &levelBlocksCompressed[input][l][b * blockSize], &bc7enc); + } + } + + std::lock_guard lock(mutex); + completedBlocks[l]++; + if (completedBlocks[l] % 100 == 0) { + maxLevel = std::max(l, maxLevel); + if (l == maxLevel) { + float progress; + if (hdr) { + progress = (float)completedBlocks[l] / hdrLevelBlocks[input][l].size(); + } else { + progress = (float)completedBlocks[l] / ldrLevelBlocks[input][l].size(); + } + int barWidth = 70; + + std::cout << std::setw(2) << l << " ["; + int pos = barWidth * progress; + for (int i = 0; i < barWidth; ++i) { + if (i < pos) std::cout << "="; + else if (i == pos) std::cout << ">"; + else std::cout << " "; + } + std::cout << "] " << std::setw(2) << int(progress * 100.0) << " %\r"; + std::cout.flush(); + } + } + } + } + })); + } + + for (unsigned t = 0; t < numThreads; t++) { + threads[t].join(); + } + + if (hdr) 
{ + hdrLevelBlocks[input].clear(); + } else { + ldrLevelBlocks[input].clear(); + } + + int barWidth = 70; + std::cout << std::setw(2) << (levelCount - 1) << " ["; + int pos = barWidth; + for (int i = 0; i < barWidth; ++i) { + if (i < pos) std::cout << "="; + else if (i == pos) std::cout << ">"; + else std::cout << " "; + } + std::cout << "] " << std::setw(2) << 100 << " %\r"; + std::cout.flush(); + + std::cout << std::endl; + } + + /* Write KTX2 */ + std::ofstream fh (output, std::ios::out | std::ios::binary); + if (!fh.is_open()) { + std::cout << "Failed to open output file: " << output << std::endl; + return 1; + } + + const uint8_t identifier[] = {0xAB, 0x4B, 0x54, 0x58, 0x20, 0x32, 0x30, 0xBB, 0x0D, 0x0A, 0x1A, 0x0A}; + fh.write((char *)identifier, sizeof(identifier)); + vk::Format vkformat = std::get<2>(formats.at(formatString)); + fh.write((char *)&vkformat, sizeof(vkformat)); + uint32_t typeSize = 1; // Fix for uncompressed vkformats, size of an individual component + fh.write((char *)&typeSize, sizeof(typeSize)); + uint32_t pixelWidth = width; + uint32_t pixelHeight = height; + uint32_t pixelDepth = 0; + + uint32_t layerCount; + uint32_t faceCount; + if (numInputs > 1) { + if (option == "cube") { + layerCount = 0; + faceCount = 6; + } else { + layerCount = numInputs; + faceCount = 1; + } + } else { + layerCount = 0; + faceCount = 1; + } + uint32_t supercompressionScheme = 0; + fh.write((char *)&pixelWidth, sizeof(pixelWidth)); + fh.write((char *)&pixelHeight, sizeof(pixelHeight)); + fh.write((char *)&pixelDepth, sizeof(pixelDepth)); + fh.write((char *)&layerCount, sizeof(layerCount)); + fh.write((char *)&faceCount, sizeof(faceCount)); + fh.write((char *)&levelCount, sizeof(levelCount)); + fh.write((char *)&supercompressionScheme, sizeof(supercompressionScheme)); + + uint32_t * dfd = vk2dfd(*(VkFormat *)&vkformat); + + uint32_t dfdByteOffset = 0; + uint32_t dfdByteLength = dfd[0]; + uint32_t kvdByteOffset = 0; + uint32_t kvdByteLength = 0; + uint64_t 
sgdByteOffset = 0; + uint64_t sgdByteLength = 0; + + auto dfdByteOffsetPosition = fh.tellp(); + + fh.write((char *)&dfdByteOffset, sizeof(dfdByteOffset)); + fh.write((char *)&dfdByteLength, sizeof(dfdByteLength)); + fh.write((char *)&kvdByteOffset, sizeof(kvdByteOffset)); + fh.write((char *)&kvdByteLength, sizeof(kvdByteLength)); + fh.write((char *)&sgdByteOffset, sizeof(sgdByteOffset)); + fh.write((char *)&sgdByteLength, sizeof(sgdByteLength)); + + auto levelOffsetBytePosition = fh.tellp(); + for (unsigned int i = 0; i < levelCount; i++) { + uint64_t byteOffset = 0; + uint64_t byteLength = 0; + uint64_t uncompressedByteLength = 0; + fh.write((char *)&byteOffset, sizeof(byteOffset)); + fh.write((char *)&byteLength, sizeof(byteLength)); + fh.write((char *)&uncompressedByteLength, sizeof(uncompressedByteLength)); + } + + dfdByteOffset = fh.tellp(); + fh.write((char *)dfd, dfdByteLength); + free(dfd); + + kvdByteOffset = fh.tellp(); + fh.seekp(dfdByteOffsetPosition); + fh.write((char *)&dfdByteOffset, sizeof(dfdByteOffset)); + fh.seekp(kvdByteOffset); + + size_t alignment = std::lcm((size_t)4, (size_t)std::get<1>(formats.at(formatString))); + + for (int level = levelCount - 1; level >= 0; level--) { + // Alignment + while (fh.tellp() % alignment != 0) { + uint8_t padding = 0; + fh.write((char *)&padding, sizeof(padding)); + } + + auto levelBytePosition = fh.tellp(); + fh.seekp(levelOffsetBytePosition + (std::ofstream::pos_type)(level * 24)); + uint64_t byteOffset = levelBytePosition; + fh.write((char *)&byteOffset, sizeof(byteOffset)); + uint64_t byteLength = levelBlocksCompressed[0][level].size() * numInputs; + fh.write((char *)&byteLength, sizeof(byteLength)); + fh.write((char *)&byteLength, sizeof(byteLength)); + fh.seekp(levelBytePosition); + + for (int input = 0; input < numInputs; input++) { + fh.write((char *)levelBlocksCompressed[input][level].data(), levelBlocksCompressed[input][level].size()); + } + } + + return 0; +} \ No newline at end of file diff --git 
a/Source/createdfd.cpp b/Source/createdfd.cpp new file mode 100644 index 0000000..ea00d8d --- /dev/null +++ b/Source/createdfd.cpp @@ -0,0 +1,659 @@ +/* -*- tab-width: 4; -*- */ +/* vi: set sw=2 ts=4 expandtab: */ + +/* Copyright 2019-2020 The Khronos Group Inc. + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * @file + * @~English + * @brief Utilities for creating data format descriptors. + */ + +/* + * Author: Andrew Garrard + */ + +#include +#include + +#include "dfd.h" + +typedef enum { i_COLOR, i_NON_COLOR } channels_infotype; + +static uint32_t *writeHeader(int numSamples, int bytes, int suffix, + channels_infotype infotype) +{ + uint32_t *DFD = (uint32_t *) malloc(sizeof(uint32_t) * + (1 + KHR_DF_WORD_SAMPLESTART + + numSamples * KHR_DF_WORD_SAMPLEWORDS)); + uint32_t* BDFD = DFD+1; + DFD[0] = sizeof(uint32_t) * + (1 + KHR_DF_WORD_SAMPLESTART + + numSamples * KHR_DF_WORD_SAMPLEWORDS); + BDFD[KHR_DF_WORD_VENDORID] = + (KHR_DF_VENDORID_KHRONOS << KHR_DF_SHIFT_VENDORID) | + (KHR_DF_KHR_DESCRIPTORTYPE_BASICFORMAT << KHR_DF_SHIFT_DESCRIPTORTYPE); + BDFD[KHR_DF_WORD_VERSIONNUMBER] = + (KHR_DF_VERSIONNUMBER_LATEST << KHR_DF_SHIFT_VERSIONNUMBER) | + (((uint32_t)sizeof(uint32_t) * + (KHR_DF_WORD_SAMPLESTART + + numSamples * KHR_DF_WORD_SAMPLEWORDS) + << KHR_DF_SHIFT_DESCRIPTORBLOCKSIZE)); + BDFD[KHR_DF_WORD_MODEL] = + ((KHR_DF_MODEL_RGBSDA << KHR_DF_SHIFT_MODEL) | /* Only supported model */ + (KHR_DF_FLAG_ALPHA_STRAIGHT << KHR_DF_SHIFT_FLAGS)); + if (infotype == i_COLOR) { + BDFD[KHR_DF_WORD_PRIMARIES] |= KHR_DF_PRIMARIES_BT709 << KHR_DF_SHIFT_PRIMARIES; /* Assumed */ + } else { + BDFD[KHR_DF_WORD_PRIMARIES] |= KHR_DF_PRIMARIES_UNSPECIFIED << KHR_DF_SHIFT_PRIMARIES; + } + if (suffix == s_SRGB) { + BDFD[KHR_DF_WORD_TRANSFER] |= KHR_DF_TRANSFER_SRGB << KHR_DF_SHIFT_TRANSFER; + } else { + BDFD[KHR_DF_WORD_TRANSFER] |= KHR_DF_TRANSFER_LINEAR << KHR_DF_SHIFT_TRANSFER; + } + BDFD[KHR_DF_WORD_TEXELBLOCKDIMENSION0] = 0; /* Only 1x1x1x1 texel blocks supported */ + 
BDFD[KHR_DF_WORD_BYTESPLANE0] = bytes; /* bytesPlane0 = bytes, bytesPlane3..1 = 0 */ + BDFD[KHR_DF_WORD_BYTESPLANE4] = 0; /* bytesPlane7..5 = 0 */ + return DFD; +} + +static uint32_t setChannelFlags(uint32_t channel, enum VkSuffix suffix) +{ + switch (suffix) { + case s_UNORM: break; + case s_SNORM: + channel |= + KHR_DF_SAMPLE_DATATYPE_SIGNED; + break; + case s_USCALED: break; + case s_SSCALED: + channel |= + KHR_DF_SAMPLE_DATATYPE_SIGNED; + break; + case s_UINT: break; + case s_SINT: + channel |= + KHR_DF_SAMPLE_DATATYPE_SIGNED; + break; + case s_SFLOAT: + channel |= + KHR_DF_SAMPLE_DATATYPE_FLOAT | + KHR_DF_SAMPLE_DATATYPE_SIGNED; + break; + case s_UFLOAT: + channel |= + KHR_DF_SAMPLE_DATATYPE_FLOAT; + break; + case s_SRGB: + if (channel == KHR_DF_CHANNEL_RGBSDA_ALPHA) { + channel |= KHR_DF_SAMPLE_DATATYPE_LINEAR; + } + break; + } + return channel; +} + +static void writeSample(uint32_t *DFD, int sampleNo, int channel, + int bits, int offset, + int topSample, int bottomSample, enum VkSuffix suffix) +{ + // Use this to avoid type-punning complaints from the gcc optimizer + // with -Wall. + union { + uint32_t i; + float f; + } lower, upper; + uint32_t *sample = DFD + 1 + KHR_DF_WORD_SAMPLESTART + sampleNo * KHR_DF_WORD_SAMPLEWORDS; + if (channel == 3) channel = KHR_DF_CHANNEL_RGBSDA_ALPHA; + + if (channel == 3) channel = KHR_DF_CHANNEL_RGBSDA_ALPHA; + channel = setChannelFlags(channel, suffix); + + sample[KHR_DF_SAMPLEWORD_BITOFFSET] = + (offset << KHR_DF_SAMPLESHIFT_BITOFFSET) | + ((bits - 1) << KHR_DF_SAMPLESHIFT_BITLENGTH) | + (channel << KHR_DF_SAMPLESHIFT_CHANNELID); + + sample[KHR_DF_SAMPLEWORD_SAMPLEPOSITION_ALL] = 0; + + switch (suffix) { + case s_UNORM: + case s_SRGB: + default: + if (bits > 32) { + upper.i = 0xFFFFFFFFU; + } else { + upper.i = (uint32_t)((1U << bits) - 1U); + } + lower.i = 0U; + break; + case s_SNORM: + if (bits > 32) { + upper.i = 0x7FFFFFFF; + } else { + upper.i = topSample ? 
(1U << (bits - 1)) - 1 : (1U << bits) - 1; + } + lower.i = ~upper.i; + if (bottomSample) lower.i += 1; + break; + case s_USCALED: + case s_UINT: + upper.i = bottomSample ? 1U : 0U; + lower.i = 0U; + break; + case s_SSCALED: + case s_SINT: + upper.i = bottomSample ? 1U : 0U; + lower.i = ~0U; + break; + case s_SFLOAT: + upper.f = 1.0f; + lower.f = -1.0f; + break; + case s_UFLOAT: + upper.f = 1.0f; + lower.f = 0.0f; + break; + } + sample[KHR_DF_SAMPLEWORD_SAMPLELOWER] = lower.i; + sample[KHR_DF_SAMPLEWORD_SAMPLEUPPER] = upper.i; +} + +/** + * @~English + * @brief Create a Data Format Descriptor for an unpacked format. + * + * @param bigEndian Set to 1 for big-endian byte ordering and + 0 for little-endian byte ordering. + * @param numChannels The number of color channels. + * @param bytes The number of bytes per channel. + * @param redBlueSwap Normally channels appear in consecutive R, G, B, A order + * in memory; redBlueSwap inverts red and blue, allowing + * B, G, R, A. + * @param suffix Indicates the format suffix for the type. + * + * @return A data format descriptor in malloc'd data. The caller is responsible + * for freeing the descriptor. 
+ **/ +uint32_t *createDFDUnpacked(int bigEndian, int numChannels, int bytes, + int redBlueSwap, enum VkSuffix suffix) +{ + uint32_t *DFD; + if (bigEndian) { + int channelCounter, channelByte; + /* Number of samples = number of channels * bytes per channel */ + DFD = writeHeader(numChannels * bytes, numChannels * bytes, suffix, i_COLOR); + /* First loop over the channels */ + for (channelCounter = 0; channelCounter < numChannels; ++channelCounter) { + int channel = channelCounter; + if (redBlueSwap && (channel == 0 || channel == 2)) { + channel ^= 2; + } + /* Loop over the bytes that constitute a channel */ + for (channelByte = 0; channelByte < bytes; ++channelByte) { + writeSample(DFD, channelCounter * bytes + channelByte, channel, + 8, 8 * (channelCounter * bytes + bytes - channelByte - 1), + channelByte == bytes-1, channelByte == 0, suffix); + } + } + + } else { /* Little-endian */ + + int sampleCounter; + /* One sample per channel */ + DFD = writeHeader(numChannels, numChannels * bytes, suffix, i_COLOR); + for (sampleCounter = 0; sampleCounter < numChannels; ++sampleCounter) { + int channel = sampleCounter; + if (redBlueSwap && (channel == 0 || channel == 2)) { + channel ^= 2; + } + writeSample(DFD, sampleCounter, channel, + 8 * bytes, 8 * sampleCounter * bytes, + 1, 1, suffix); + } + } + return DFD; +} + +/** + * @~English + * @brief Create a Data Format Descriptor for a packed format. + * + * @param bigEndian Big-endian flag: Set to 1 for big-endian byte ordering and + * 0 for little-endian byte ordering. + * @param numChannels The number of color channels. + * @param bits[] An array of length numChannels. + * Each entry is the number of bits composing the channel, in + * order starting at bit 0 of the packed type. + * @param channels[] An array of length numChannels. + * Each entry enumerates the channel type: 0 = red, 1 = green, + * 2 = blue, 15 = alpha, in order starting at bit 0 of the + * packed type. 
These values match channel IDs for RGBSDA in + * the Khronos Data Format header. To simplify iteration + * through channels, channel id 3 is a synonym for alpha. + * @param suffix Indicates the format suffix for the type. + * + * @return A data format descriptor in malloc'd data. The caller is responsible + * for freeing the descriptor. + **/ +uint32_t *createDFDPacked(int bigEndian, int numChannels, + int bits[], int channels[], + enum VkSuffix suffix) +{ + uint32_t *DFD = 0; + if (numChannels == 6) { + /* Special case E5B9G9R9 */ + DFD = writeHeader(numChannels, 4, s_UFLOAT, i_COLOR); + writeSample(DFD, 0, 0, + 9, 0, + 1, 1, s_UNORM); + KHR_DFDSETSVAL((DFD+1), 0, SAMPLEUPPER, 8448); + writeSample(DFD, 1, 0 | KHR_DF_SAMPLE_DATATYPE_EXPONENT, + 5, 27, + 1, 1, s_UNORM); + KHR_DFDSETSVAL((DFD+1), 1, SAMPLELOWER, 15); + KHR_DFDSETSVAL((DFD+1), 1, SAMPLEUPPER, 31); + writeSample(DFD, 2, 1, + 9, 9, + 1, 1, s_UNORM); + KHR_DFDSETSVAL((DFD+1), 2, SAMPLEUPPER, 8448); + writeSample(DFD, 3, 1 | KHR_DF_SAMPLE_DATATYPE_EXPONENT, + 5, 27, + 1, 1, s_UNORM); + KHR_DFDSETSVAL((DFD+1), 3, SAMPLELOWER, 15); + KHR_DFDSETSVAL((DFD+1), 3, SAMPLEUPPER, 31); + writeSample(DFD, 4, 2, + 9, 18, + 1, 1, s_UNORM); + KHR_DFDSETSVAL((DFD+1), 4, SAMPLEUPPER, 8448); + writeSample(DFD, 5, 2 | KHR_DF_SAMPLE_DATATYPE_EXPONENT, + 5, 27, + 1, 1, s_UNORM); + KHR_DFDSETSVAL((DFD+1), 5, SAMPLELOWER, 15); + KHR_DFDSETSVAL((DFD+1), 5, SAMPLEUPPER, 31); + } else if (bigEndian) { + /* No packed format is larger than 32 bits. */ + /* No packed channel crosses more than two bytes. 
*/ + int totalBits = 0; + int bitChannel[32]; + int beChannelStart[4]; + int channelCounter; + int bitOffset = 0; + int BEMask; + int numSamples = numChannels; + int sampleCounter; + for (channelCounter = 0; channelCounter < numChannels; ++channelCounter) { + beChannelStart[channelCounter] = totalBits; + totalBits += bits[channelCounter]; + } + BEMask = (totalBits - 1) & 0x18; + for (channelCounter = 0; channelCounter < numChannels; ++channelCounter) { + bitChannel[bitOffset ^ BEMask] = channelCounter; + if (((bitOffset + bits[channelCounter] - 1) & ~7) != (bitOffset & ~7)) { + /* Continuation sample */ + bitChannel[((bitOffset + bits[channelCounter] - 1) & ~7) ^ BEMask] = channelCounter; + numSamples++; + } + bitOffset += bits[channelCounter]; + } + DFD = writeHeader(numSamples, totalBits >> 3, suffix, i_COLOR); + + sampleCounter = 0; + for (bitOffset = 0; bitOffset < totalBits;) { + if (bitChannel[bitOffset] == -1) { + /* Done this bit, so this is the lower half of something. */ + /* We must therefore jump to the end of the byte and continue. */ + bitOffset = (bitOffset + 8) & ~7; + } else { + /* Start of a channel? */ + int thisChannel = bitChannel[bitOffset]; + if ((beChannelStart[thisChannel] ^ BEMask) == bitOffset) { + /* Must be just one sample if we hit it first. */ + writeSample(DFD, sampleCounter++, channels[thisChannel], + bits[thisChannel], bitOffset, + 1, 1, suffix); + bitOffset += bits[thisChannel]; + } else { + /* Two samples. Move to the end of the first one we hit when we're done. 
*/ + int firstSampleBits = 8 - (beChannelStart[thisChannel] & 0x7); /* Rest of the byte */ + int secondSampleBits = bits[thisChannel] - firstSampleBits; /* Rest of the bits */ + writeSample(DFD, sampleCounter++, channels[thisChannel], + firstSampleBits, beChannelStart[thisChannel] ^ BEMask, + 0, 1, suffix); + /* Mark that we've already handled this sample */ + bitChannel[beChannelStart[thisChannel] ^ BEMask] = -1; + writeSample(DFD, sampleCounter++, channels[thisChannel], + secondSampleBits, bitOffset, + 1, 0, suffix); + bitOffset += secondSampleBits; + } + } + } + + } else { /* Little-endian */ + + int sampleCounter; + int totalBits = 0; + int bitOffset = 0; + for (sampleCounter = 0; sampleCounter < numChannels; ++sampleCounter) { + totalBits += bits[sampleCounter]; + } + + /* One sample per channel */ + DFD = writeHeader(numChannels, totalBits >> 3, suffix, i_COLOR); + for (sampleCounter = 0; sampleCounter < numChannels; ++sampleCounter) { + writeSample(DFD, sampleCounter, channels[sampleCounter], + bits[sampleCounter], bitOffset, + 1, 1, suffix); + bitOffset += bits[sampleCounter]; + } + } + return DFD; +} + +static khr_df_model_e compModelMapping[] = { + KHR_DF_MODEL_BC1A, /*!< BC1, aka DXT1, no alpha. */ + KHR_DF_MODEL_BC1A, /*!< BC1, aka DXT1, punch-through alpha. */ + KHR_DF_MODEL_BC2, /*!< BC2, aka DXT2 and DXT3. */ + KHR_DF_MODEL_BC3, /*!< BC3, aka DXT4 and DXT5. */ + KHR_DF_MODEL_BC4, /*!< BC4. */ + KHR_DF_MODEL_BC5, /*!< BC5. */ + KHR_DF_MODEL_BC6H, /*!< BC6h HDR format. */ + KHR_DF_MODEL_BC7, /*!< BC7. */ + KHR_DF_MODEL_ETC2, /*!< ETC2 no alpha. */ + KHR_DF_MODEL_ETC2, /*!< ETC2 punch-through alpha. */ + KHR_DF_MODEL_ETC2, /*!< ETC2 independent alpha. */ + KHR_DF_MODEL_ETC2, /*!< R11 ETC2 single-channel. */ + KHR_DF_MODEL_ETC2, /*!< R11G11 ETC2 dual-channel. */ + KHR_DF_MODEL_ASTC, /*!< ASTC. */ + KHR_DF_MODEL_ETC1S, /*!< ETC1S. */ + KHR_DF_MODEL_PVRTC, /*!< PVRTC(1). */ + KHR_DF_MODEL_PVRTC2 /*!< PVRTC2. 
*/ +}; + +static uint32_t compSampleCount[] = { + 1U, /*!< BC1, aka DXT1, no alpha. */ + 1U, /*!< BC1, aka DXT1, punch-through alpha. */ + 2U, /*!< BC2, aka DXT2 and DXT3. */ + 2U, /*!< BC3, aka DXT4 and DXT5. */ + 1U, /*!< BC4. */ + 2U, /*!< BC5. */ + 1U, /*!< BC6h HDR format. */ + 1U, /*!< BC7. */ + 1U, /*!< ETC2 no alpha. */ + 2U, /*!< ETC2 punch-through alpha. */ + 2U, /*!< ETC2 independent alpha. */ + 1U, /*!< R11 ETC2 single-channel. */ + 2U, /*!< R11G11 ETC2 dual-channel. */ + 1U, /*!< ASTC. */ + 1U, /*!< ETC1S. */ + 1U, /*!< PVRTC. */ + 1U /*!< PVRTC2. */ +}; + +static khr_df_model_channels_e compFirstChannel[] = { + KHR_DF_CHANNEL_BC1A_COLOR, /*!< BC1, aka DXT1, no alpha. */ + KHR_DF_CHANNEL_BC1A_ALPHAPRESENT, /*!< BC1, aka DXT1, punch-through alpha. */ + KHR_DF_CHANNEL_BC2_ALPHA, /*!< BC2, aka DXT2 and DXT3. */ + KHR_DF_CHANNEL_BC3_ALPHA, /*!< BC3, aka DXT4 and DXT5. */ + KHR_DF_CHANNEL_BC4_DATA, /*!< BC4. */ + KHR_DF_CHANNEL_BC5_RED, /*!< BC5. */ + KHR_DF_CHANNEL_BC6H_COLOR, /*!< BC6h HDR format. */ + KHR_DF_CHANNEL_BC7_COLOR, /*!< BC7. */ + KHR_DF_CHANNEL_ETC2_COLOR, /*!< ETC2 no alpha. */ + KHR_DF_CHANNEL_ETC2_COLOR, /*!< ETC2 punch-through alpha. */ + KHR_DF_CHANNEL_ETC2_ALPHA, /*!< ETC2 independent alpha. */ + KHR_DF_CHANNEL_ETC2_RED, /*!< R11 ETC2 single-channel. */ + KHR_DF_CHANNEL_ETC2_RED, /*!< R11G11 ETC2 dual-channel. */ + KHR_DF_CHANNEL_ASTC_DATA, /*!< ASTC. */ + KHR_DF_CHANNEL_ETC1S_RGB, /*!< ETC1S. */ + KHR_DF_CHANNEL_PVRTC_COLOR, /*!< PVRTC. */ + KHR_DF_CHANNEL_PVRTC2_COLOR /*!< PVRTC2. */ +}; + +static khr_df_model_channels_e compSecondChannel[] = { + KHR_DF_CHANNEL_BC1A_COLOR, /*!< BC1, aka DXT1, no alpha. */ + KHR_DF_CHANNEL_BC1A_ALPHAPRESENT, /*!< BC1, aka DXT1, punch-through alpha. */ + KHR_DF_CHANNEL_BC2_COLOR, /*!< BC2, aka DXT2 and DXT3. */ + KHR_DF_CHANNEL_BC3_COLOR, /*!< BC3, aka DXT4 and DXT5. */ + KHR_DF_CHANNEL_BC4_DATA, /*!< BC4. */ + KHR_DF_CHANNEL_BC5_GREEN, /*!< BC5. */ + KHR_DF_CHANNEL_BC6H_COLOR, /*!< BC6h HDR format. 
*/ + KHR_DF_CHANNEL_BC7_COLOR, /*!< BC7. */ + KHR_DF_CHANNEL_ETC2_COLOR, /*!< ETC2 no alpha. */ + KHR_DF_CHANNEL_ETC2_ALPHA, /*!< ETC2 punch-through alpha. */ + KHR_DF_CHANNEL_ETC2_COLOR, /*!< ETC2 independent alpha. */ + KHR_DF_CHANNEL_ETC2_RED, /*!< R11 ETC2 single-channel. */ + KHR_DF_CHANNEL_ETC2_GREEN, /*!< R11G11 ETC2 dual-channel. */ + KHR_DF_CHANNEL_ASTC_DATA, /*!< ASTC. */ + KHR_DF_CHANNEL_ETC1S_RGB, /*!< ETC1S. */ + KHR_DF_CHANNEL_PVRTC_COLOR, /*!< PVRTC. */ + KHR_DF_CHANNEL_PVRTC2_COLOR /*!< PVRTC2. */ +}; + +static uint32_t compSecondChannelOffset[] = { + 0U, /*!< BC1, aka DXT1, no alpha. */ + 0U, /*!< BC1, aka DXT1, punch-through alpha. */ + 64U, /*!< BC2, aka DXT2 and DXT3. */ + 64U, /*!< BC3, aka DXT4 and DXT5. */ + 0U, /*!< BC4. */ + 64U, /*!< BC5. */ + 0U, /*!< BC6h HDR format. */ + 0U, /*!< BC7. */ + 0U, /*!< ETC2 no alpha. */ + 0U, /*!< ETC2 punch-through alpha. */ + 64U, /*!< ETC2 independent alpha. */ + 0U, /*!< R11 ETC2 single-channel. */ + 64U, /*!< R11G11 ETC2 dual-channel. */ + 0U, /*!< ASTC. */ + 0U, /*!< ETC1S. */ + 0U, /*!< PVRTC. */ + 0U /*!< PVRTC2. */ +}; + +static uint32_t compChannelBits[] = { + 64U, /*!< BC1, aka DXT1, no alpha. */ + 64U, /*!< BC1, aka DXT1, punch-through alpha. */ + 64U, /*!< BC2, aka DXT2 and DXT3. */ + 64U, /*!< BC3, aka DXT4 and DXT5. */ + 64U, /*!< BC4. */ + 64U, /*!< BC5. */ + 128U, /*!< BC6h HDR format. */ + 128U, /*!< BC7. */ + 64U, /*!< ETC2 no alpha. */ + 64U, /*!< ETC2 punch-through alpha. */ + 64U, /*!< ETC2 independent alpha. */ + 64U, /*!< R11 ETC2 single-channel. */ + 64U, /*!< R11G11 ETC2 dual-channel. */ + 128U, /*!< ASTC. */ + 64U, /*!< ETC1S. */ + 64U, /*!< PVRTC. */ + 64U /*!< PVRTC2. */ +}; + +static uint32_t compBytes[] = { + 8U, /*!< BC1, aka DXT1, no alpha. */ + 8U, /*!< BC1, aka DXT1, punch-through alpha. */ + 16U, /*!< BC2, aka DXT2 and DXT3. */ + 16U, /*!< BC3, aka DXT4 and DXT5. */ + 8U, /*!< BC4. */ + 16U, /*!< BC5. */ + 16U, /*!< BC6h HDR format. */ + 16U, /*!< BC7. 
*/ + 8U, /*!< ETC2 no alpha. */ + 8U, /*!< ETC2 punch-through alpha. */ + 16U, /*!< ETC2 independent alpha. */ + 8U, /*!< R11 ETC2 single-channel. */ + 16U, /*!< R11G11 ETC2 dual-channel. */ + 16U, /*!< ASTC. */ + 8U, /*!< ETC1S. */ + 8U, /*!< PVRTC. */ + 8U /*!< PVRTC2. */ +}; + +/** + * @~English + * @brief Create a Data Format Descriptor for a compressed format. + * + * @param compScheme Vulkan-style compression scheme enumeration. + * @param bwidth Block width in texel coordinates. + * @param bheight Block height in texel coordinates. + * @param bdepth Block depth in texel coordinates. + * @author Mark Callow, Edgewise Consulting. + * @param suffix Indicates the format suffix for the type. + * + * @return A data format descriptor in malloc'd data. The caller is responsible + * for freeing the descriptor. + **/ +uint32_t *createDFDCompressed(enum VkCompScheme compScheme, int bwidth, int bheight, int bdepth, + enum VkSuffix suffix) +{ + uint32_t *DFD = 0; + uint32_t numSamples = compSampleCount[compScheme]; + uint32_t* BDFD; + uint32_t *sample; + uint32_t channel; + // Use union to avoid type-punning complaints from gcc optimizer + // with -Wall. 
+ union { + uint32_t i; + float f; + } lower, upper; + + DFD = (uint32_t *) malloc(sizeof(uint32_t) * + (1 + KHR_DF_WORD_SAMPLESTART + + numSamples * KHR_DF_WORD_SAMPLEWORDS)); + BDFD = DFD+1; + DFD[0] = sizeof(uint32_t) * + (1 + KHR_DF_WORD_SAMPLESTART + + numSamples * KHR_DF_WORD_SAMPLEWORDS); + BDFD[KHR_DF_WORD_VENDORID] = + (KHR_DF_VENDORID_KHRONOS << KHR_DF_SHIFT_VENDORID) | + (KHR_DF_KHR_DESCRIPTORTYPE_BASICFORMAT << KHR_DF_SHIFT_DESCRIPTORTYPE); + BDFD[KHR_DF_WORD_VERSIONNUMBER] = + (KHR_DF_VERSIONNUMBER_LATEST << KHR_DF_SHIFT_VERSIONNUMBER) | + (((uint32_t)sizeof(uint32_t) * + (KHR_DF_WORD_SAMPLESTART + + numSamples * KHR_DF_WORD_SAMPLEWORDS) + << KHR_DF_SHIFT_DESCRIPTORBLOCKSIZE)); + BDFD[KHR_DF_WORD_MODEL] = + ((compModelMapping[compScheme] << KHR_DF_SHIFT_MODEL) | + (KHR_DF_PRIMARIES_BT709 << KHR_DF_SHIFT_PRIMARIES) | /* Assumed */ + (KHR_DF_FLAG_ALPHA_STRAIGHT << KHR_DF_SHIFT_FLAGS)); + + if (suffix == s_SRGB) { + BDFD[KHR_DF_WORD_TRANSFER] |= KHR_DF_TRANSFER_SRGB << KHR_DF_SHIFT_TRANSFER; + } else { + BDFD[KHR_DF_WORD_TRANSFER] |= KHR_DF_TRANSFER_LINEAR << KHR_DF_SHIFT_TRANSFER; + } + BDFD[KHR_DF_WORD_TEXELBLOCKDIMENSION0] = + (bwidth - 1) | ((bheight - 1) << KHR_DF_SHIFT_TEXELBLOCKDIMENSION1) | ((bdepth - 1) << KHR_DF_SHIFT_TEXELBLOCKDIMENSION2); + /* bytesPlane0 = bytes, bytesPlane3..1 = 0 */ + BDFD[KHR_DF_WORD_BYTESPLANE0] = compBytes[compScheme]; + BDFD[KHR_DF_WORD_BYTESPLANE4] = 0; /* bytesPlane7..5 = 0 */ + + sample = BDFD + KHR_DF_WORD_SAMPLESTART; + channel = compFirstChannel[compScheme]; + channel = setChannelFlags(channel, suffix); + + sample[KHR_DF_SAMPLEWORD_BITOFFSET] = + (0 << KHR_DF_SAMPLESHIFT_BITOFFSET) | + ((compChannelBits[compScheme] - 1) << KHR_DF_SAMPLESHIFT_BITLENGTH) | + (channel << KHR_DF_SAMPLESHIFT_CHANNELID); + + sample[KHR_DF_SAMPLEWORD_SAMPLEPOSITION_ALL] = 0; + switch (suffix) { + case s_UNORM: + case s_SRGB: + default: + upper.i = 0xFFFFFFFFU; + lower.i = 0U; + break; + case s_SNORM: + upper.i = 0x7FFFFFFF; + lower.i = 
~upper.i; + break; + case s_USCALED: + case s_UINT: + upper.i = 1U; + lower.i = 0U; + break; + case s_SSCALED: + case s_SINT: + upper.i = 1U; + lower.i = ~0U; + break; + case s_SFLOAT: + upper.f = 1.0f; + lower.f = -1.0f; + break; + case s_UFLOAT: + upper.f = 1.0f; + lower.f = 0.0f; + break; + } + sample[KHR_DF_SAMPLEWORD_SAMPLELOWER] = lower.i; + sample[KHR_DF_SAMPLEWORD_SAMPLEUPPER] = upper.i; + + if (compSampleCount[compScheme] > 1) { + sample += KHR_DF_WORD_SAMPLEWORDS; + channel = compSecondChannel[compScheme]; + channel = setChannelFlags(channel, suffix); + + sample[KHR_DF_SAMPLEWORD_BITOFFSET] = + (compSecondChannelOffset[compScheme] << KHR_DF_SAMPLESHIFT_BITOFFSET) | + ((compChannelBits[compScheme] - 1) << KHR_DF_SAMPLESHIFT_BITLENGTH) | + (channel << KHR_DF_SAMPLESHIFT_CHANNELID); + + sample[KHR_DF_SAMPLEWORD_SAMPLEPOSITION_ALL] = 0; + + sample[KHR_DF_SAMPLEWORD_SAMPLELOWER] = lower.i; + sample[KHR_DF_SAMPLEWORD_SAMPLEUPPER] = upper.i; + } + return DFD; +} + +/** + * @~English + * @brief Create a Data Format Descriptor for a depth-stencil format. + * + * @param depthBits The numeber of bits in the depth channel. + * @param stencilBits The numeber of bits in the stencil channel. + * @param sizeBytes The total byte size of the texel. + * + * @return A data format descriptor in malloc'd data. The caller is responsible + * for freeing the descriptor. + **/ +uint32_t *createDFDDepthStencil(int depthBits, + int stencilBits, + int sizeBytes) +{ + /* N.B. Little-endian is assumed. 
*/ + uint32_t *DFD = 0; + DFD = writeHeader((depthBits > 0) + (stencilBits > 0), + sizeBytes, s_UNORM, i_NON_COLOR); + if (depthBits == 32) { + writeSample(DFD, 0, KHR_DF_CHANNEL_RGBSDA_DEPTH, + 32, 0, + 1, 1, s_SFLOAT); + } else if (depthBits > 0) { + writeSample(DFD, 0, KHR_DF_CHANNEL_RGBSDA_DEPTH, + depthBits, 0, + 1, 1, s_UNORM); + } + if (stencilBits > 0) { + if (depthBits > 0) { + writeSample(DFD, 1, KHR_DF_CHANNEL_RGBSDA_STENCIL, + stencilBits, depthBits, + 1, 1, s_UINT); + } else { + writeSample(DFD, 0, KHR_DF_CHANNEL_RGBSDA_STENCIL, + stencilBits, 0, + 1, 1, s_UINT); + } + } + return DFD; +} diff --git a/Source/dfd.h b/Source/dfd.h new file mode 100644 index 0000000..633b475 --- /dev/null +++ b/Source/dfd.h @@ -0,0 +1,173 @@ +/* -*- tab-width: 4; -*- */ +/* vi: set sw=2 ts=4 expandtab: */ + +/* Copyright 2019-2020 The Khronos Group Inc. + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * @file + * @~English + * @brief Header file defining the data format descriptor utilities API. + */ + +/* + * Author: Andrew Garrard + */ + +#ifndef _DFD_H_ +#define _DFD_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** Qualifier suffix to the format, in Vulkan terms. */ +enum VkSuffix { + s_UNORM, /*!< Unsigned normalized format. */ + s_SNORM, /*!< Signed normalized format. */ + s_USCALED, /*!< Unsigned scaled format. */ + s_SSCALED, /*!< Signed scaled format. */ + s_UINT, /*!< Unsigned integer format. */ + s_SINT, /*!< Signed integer format. */ + s_SFLOAT, /*!< Signed float format. */ + s_UFLOAT, /*!< Unsigned float format. */ + s_SRGB /*!< sRGB normalized format. */ +}; + +/** Compression scheme, in Vulkan terms. */ +enum VkCompScheme { + c_BC1_RGB, /*!< BC1, aka DXT1, no alpha. */ + c_BC1_RGBA, /*!< BC1, aka DXT1, punch-through alpha. */ + c_BC2, /*!< BC2, aka DXT2 and DXT3. */ + c_BC3, /*!< BC3, aka DXT4 and DXT5. */ + c_BC4, /*!< BC4. */ + c_BC5, /*!< BC5. */ + c_BC6H, /*!< BC6h HDR format. */ + c_BC7, /*!< BC7. 
*/ + c_ETC2_R8G8B8, /*!< ETC2 no alpha. */ + c_ETC2_R8G8B8A1, /*!< ETC2 punch-through alpha. */ + c_ETC2_R8G8B8A8, /*!< ETC2 independent alpha. */ + c_EAC_R11, /*!< R11 ETC2 single-channel. */ + c_EAC_R11G11, /*!< R11G11 ETC2 dual-channel. */ + c_ASTC, /*!< ASTC. */ + c_ETC1S, /*!< ETC1S. */ + c_PVRTC, /*!< PVRTC(1). */ + c_PVRTC2 /*!< PVRTC2. */ +}; + +#if !defined(uint32_t) +typedef unsigned int uint32_t; +#endif + +#if !defined(LIBKTX) +#include +#else +#include "../vkformat_enum.h" +#endif + +uint32_t* vk2dfd(enum VkFormat format); + +/* Create a Data Format Descriptor for an unpacked format. */ +uint32_t *createDFDUnpacked(int bigEndian, int numChannels, int bytes, + int redBlueSwap, enum VkSuffix suffix); + +/* Create a Data Format Descriptor for a packed format. */ +uint32_t *createDFDPacked(int bigEndian, int numChannels, + int bits[], int channels[], + enum VkSuffix suffix); + +/* Create a Data Format Descriptor for a compressed format. */ +uint32_t *createDFDCompressed(enum VkCompScheme compScheme, + int bwidth, int bheight, int bdepth, + enum VkSuffix suffix); + +/* Create a Data Format Descriptor for a depth/stencil format. */ +uint32_t *createDFDDepthStencil(int depthBits, + int stencilBits, + int sizeBytes); + +/** @brief Result of interpreting the data format descriptor. */ +enum InterpretDFDResult { + i_LITTLE_ENDIAN_FORMAT_BIT = 0, /*!< Confirmed little-endian (default for 8bpc). */ + i_BIG_ENDIAN_FORMAT_BIT = 1, /*!< Confirmed big-endian. */ + i_PACKED_FORMAT_BIT = 2, /*!< Packed format. */ + i_SRGB_FORMAT_BIT = 4, /*!< sRGB transfer function. */ + i_NORMALIZED_FORMAT_BIT = 8, /*!< Normalized (UNORM or SNORM). */ + i_SIGNED_FORMAT_BIT = 16, /*!< Format is signed. */ + i_FLOAT_FORMAT_BIT = 32, /*!< Format is floating point. */ + i_UNSUPPORTED_ERROR_BIT = 64, /*!< Format not successfully interpreted. */ + /** "NONTRIVIAL_ENDIANNESS" means not big-endian, not little-endian + * (a channel has bits that are not consecutive in either order). 
**/ + i_UNSUPPORTED_NONTRIVIAL_ENDIANNESS = i_UNSUPPORTED_ERROR_BIT, + /** "MULTIPLE_SAMPLE_LOCATIONS" is an error because only single-sample + * texel blocks (with coordinates 0,0,0,0 for all samples) are supported. **/ + i_UNSUPPORTED_MULTIPLE_SAMPLE_LOCATIONS = i_UNSUPPORTED_ERROR_BIT + 1, + /** "MULTIPLE_PLANES" is an error because only contiguous data is supported. */ + i_UNSUPPORTED_MULTIPLE_PLANES = i_UNSUPPORTED_ERROR_BIT + 2, + /** Only channels R, G, B and A are supported. */ + i_UNSUPPORTED_CHANNEL_TYPES = i_UNSUPPORTED_ERROR_BIT + 3, + /** Only channels with the same flags are supported + * (e.g. we don't support float red with integer green). */ + i_UNSUPPORTED_MIXED_CHANNELS = i_UNSUPPORTED_ERROR_BIT + 4 +}; + +/** @brief Interpretation of a channel from the data format descriptor. */ +typedef struct _InterpretedDFDChannel { + uint32_t offset; /*!< Offset in bits for packed, bytes for unpacked. */ + uint32_t size; /*!< Size in bits for packed, bytes for unpacked. */ +} InterpretedDFDChannel; + +/* Interpret a Data Format Descriptor. */ +enum InterpretDFDResult interpretDFD(const uint32_t *DFD, + InterpretedDFDChannel *R, + InterpretedDFDChannel *G, + InterpretedDFDChannel *B, + InterpretedDFDChannel *A, + uint32_t *wordBytes); + +/* Print a human-readable interpretation of a data format descriptor. */ +void printDFD(uint32_t *DFD); + +/* Get the number of components & component size from a DFD for an + * unpacked format. + */ +void +getDFDComponentInfoUnpacked(const uint32_t* DFD, uint32_t* numComponents, + uint32_t* componentByteLength); + +/* Return the number of components described by a DFD. */ +uint32_t getDFDNumComponents(const uint32_t* DFD); + +/* Recreate and return the value of bytesPlane0 as it should be for the data + * post-inflation from variable-rate compression. + */ +void +recreateBytesPlane0FromSampleInfo(const uint32_t* DFD, uint32_t* bytesPlane0); + +/** @brief Colourspace primaries information. 
+ * + * Structure to store the 1931 CIE x,y chromaticities of the red, green, and blue + * display primaries and the reference white point of a colourspace. + */ +typedef struct _Primaries { + float Rx; /*!< Red x. */ + float Ry; /*!< Red y. */ + float Gx; /*!< Green x. */ + float Gy; /*!< Green y. */ + float Bx; /*!< Blue x. */ + float By; /*!< Blue y. */ + float Wx; /*!< White x. */ + float Wy; /*!< White y. */ +} Primaries; + +khr_df_primaries_e findMapping(Primaries *p, float latitude); + +#ifdef __cplusplus +} +#endif + +#endif /* _DFD_H_ */ diff --git a/Source/ispc_texcomp/ispc_texcomp.cpp b/Source/ispc_texcomp/ispc_texcomp.cpp new file mode 100644 index 0000000..3824020 --- /dev/null +++ b/Source/ispc_texcomp/ispc_texcomp.cpp @@ -0,0 +1,557 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-2019, Intel Corporation +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. 
+//////////////////////////////////////////////////////////////////////////////// + +#include "ispc_texcomp.h" +#include "kernel_ispc.h" +#include // memcpy + +namespace ispc { +extern "C" { + extern int32_t ISPCIsa_ispc_sse4(); + extern "C" void CompressBlocksBC1_ispc_sse4(const rgba_surface* src, uint8_t* dst); + extern "C" void CompressBlocksBC3_ispc_sse4(const rgba_surface* src, uint8_t* dst); + extern "C" void CompressBlocksBC4_ispc_sse4(const rgba_surface* src, uint8_t* dst); + extern "C" void CompressBlocksBC5_ispc_sse4(const rgba_surface* src, uint8_t* dst); + extern "C" void CompressBlocksBC6H_ispc_sse4(const rgba_surface* src, uint8_t* dst, bc6h_enc_settings* settings); + extern "C" void CompressBlocksBC7_ispc_sse4(const rgba_surface* src, uint8_t* dst, bc7_enc_settings* settings); + extern "C" void CompressBlocksETC1_ispc_sse4(const rgba_surface* src, uint8_t* dst, etc_enc_settings* settings); + extern "C" void CompressBlocksASTC_ispc_sse4(const rgba_surface* src, uint8_t* dst, astc_enc_settings* settings); +} +} + +static bool isAmd = false; + +void ISPCInit() +{ +#if defined(_MSC_VER) +{ + int CPUInfo[4]; + __cpuid(CPUInfo, 0x80000001); + isAmd = (CPUInfo[2] & (1 << 6)) != 0; +} +#else +{ + unsigned int eax = 0x80000001; + unsigned int ebx = 0; + unsigned int ecx = 0; + unsigned int edx = 0; + asm volatile("cpuid" + : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) + : "a"(eax)); + isAmd = (ecx & (1 << 6)) != 0; +} +#endif +} + +void GetProfile_ultrafast(bc7_enc_settings* settings) +{ + settings->channels = 3; + + // mode02 + settings->mode_selection[0] = false; + settings->skip_mode2 = true; + + settings->refineIterations[0] = 2; + settings->refineIterations[2] = 2; + + // mode13 + settings->mode_selection[1] = false; + settings->fastSkipTreshold_mode1 = 3; + settings->fastSkipTreshold_mode3 = 1; + settings->fastSkipTreshold_mode7 = 0; + + settings->refineIterations[1] = 2; + settings->refineIterations[3] = 1; + + // mode45 + settings->mode_selection[2] = 
false; + + settings->mode45_channel0 = 0; + settings->refineIterations_channel = 0; + settings->refineIterations[4] = 2; + settings->refineIterations[5] = 2; + + // mode6 + settings->mode_selection[3] = true; + + settings->refineIterations[6] = 1; +} + +void GetProfile_veryfast(bc7_enc_settings* settings) +{ + settings->channels = 3; + + // mode02 + settings->mode_selection[0] = false; + settings->skip_mode2 = true; + + settings->refineIterations[0] = 2; + settings->refineIterations[2] = 2; + + // mode13 + settings->mode_selection[1] = true; + settings->fastSkipTreshold_mode1 = 3; + settings->fastSkipTreshold_mode3 = 1; + settings->fastSkipTreshold_mode7 = 0; + + settings->refineIterations[1] = 2; + settings->refineIterations[3] = 1; + + // mode45 + settings->mode_selection[2] = false; + + settings->mode45_channel0 = 0; + settings->refineIterations_channel = 0; + settings->refineIterations[4] = 2; + settings->refineIterations[5] = 2; + + // mode6 + settings->mode_selection[3] = true; + + settings->refineIterations[6] = 1; +} + +void GetProfile_fast(bc7_enc_settings* settings) +{ + settings->channels = 3; + + // mode02 + settings->mode_selection[0] = false; + settings->skip_mode2 = true; + + settings->refineIterations[0] = 2; + settings->refineIterations[2] = 2; + + // mode13 + settings->mode_selection[1] = true; + settings->fastSkipTreshold_mode1 = 12; + settings->fastSkipTreshold_mode3 = 4; + settings->fastSkipTreshold_mode7 = 0; + + settings->refineIterations[1] = 2; + settings->refineIterations[3] = 1; + + // mode45 + settings->mode_selection[2] = false; + + settings->mode45_channel0 = 0; + settings->refineIterations_channel = 0; + settings->refineIterations[4] = 2; + settings->refineIterations[5] = 2; + + // mode6 + settings->mode_selection[3] = true; + + settings->refineIterations[6] = 2; +} + +void GetProfile_basic(bc7_enc_settings* settings) +{ + settings->channels = 3; + + // mode02 + settings->mode_selection[0] = true; + settings->skip_mode2 = true; + + 
settings->refineIterations[0] = 2; + settings->refineIterations[2] = 2; + + // mode13 + settings->mode_selection[1] = true; + settings->fastSkipTreshold_mode1 = 8+4; + settings->fastSkipTreshold_mode3 = 8; + settings->fastSkipTreshold_mode7 = 0; + + settings->refineIterations[1] = 2; + settings->refineIterations[3] = 2; + + // mode45 + settings->mode_selection[2] = true; + + settings->mode45_channel0 = 0; + settings->refineIterations_channel = 2; + settings->refineIterations[4] = 2; + settings->refineIterations[5] = 2; + + // mode6 + settings->mode_selection[3] = true; + + settings->refineIterations[6] = 2; +} + +void GetProfile_slow(bc7_enc_settings* settings) +{ + settings->channels = 3; + + int moreRefine = 2; + // mode02 + settings->mode_selection[0] = true; + settings->skip_mode2 = false; + + settings->refineIterations[0] = 2+moreRefine; + settings->refineIterations[2] = 2+moreRefine; + + // mode13 + settings->mode_selection[1] = true; + settings->fastSkipTreshold_mode1 = 64; + settings->fastSkipTreshold_mode3 = 64; + settings->fastSkipTreshold_mode7 = 0; + + settings->refineIterations[1] = 2+moreRefine; + settings->refineIterations[3] = 2+moreRefine; + + // mode45 + settings->mode_selection[2] = true; + + settings->mode45_channel0 = 0; + settings->refineIterations_channel = 2+moreRefine; + settings->refineIterations[4] = 2+moreRefine; + settings->refineIterations[5] = 2+moreRefine; + + // mode6 + settings->mode_selection[3] = true; + + settings->refineIterations[6] = 2+moreRefine; +} + +void GetProfile_alpha_ultrafast(bc7_enc_settings* settings) +{ + settings->channels = 4; + + // mode02 + settings->mode_selection[0] = false; + settings->skip_mode2 = true; + + settings->refineIterations[0] = 2; + settings->refineIterations[2] = 2; + + // mode137 + settings->mode_selection[1] = false; + settings->fastSkipTreshold_mode1 = 0; + settings->fastSkipTreshold_mode3 = 0; + settings->fastSkipTreshold_mode7 = 4; + + settings->refineIterations[1] = 1; + 
settings->refineIterations[3] = 1; + settings->refineIterations[7] = 2; + + // mode45 + settings->mode_selection[2] = true; + + settings->mode45_channel0 = 3; + settings->refineIterations_channel = 1; + settings->refineIterations[4] = 1; + settings->refineIterations[5] = 1; + + // mode6 + settings->mode_selection[3] = true; + + settings->refineIterations[6] = 2; +} + +void GetProfile_alpha_veryfast(bc7_enc_settings* settings) +{ + settings->channels = 4; + + // mode02 + settings->mode_selection[0] = false; + settings->skip_mode2 = true; + + settings->refineIterations[0] = 2; + settings->refineIterations[2] = 2; + + // mode137 + settings->mode_selection[1] = true; + settings->fastSkipTreshold_mode1 = 0; + settings->fastSkipTreshold_mode3 = 0; + settings->fastSkipTreshold_mode7 = 4; + + settings->refineIterations[1] = 1; + settings->refineIterations[3] = 1; + settings->refineIterations[7] = 2; + + // mode45 + settings->mode_selection[2] = true; + + settings->mode45_channel0 = 3; + settings->refineIterations_channel = 2; + settings->refineIterations[4] = 2; + settings->refineIterations[5] = 2; + + // mode6 + settings->mode_selection[3] = true; + + settings->refineIterations[6] = 2; +} + +void GetProfile_alpha_fast(bc7_enc_settings* settings) +{ + settings->channels = 4; + + // mode02 + settings->mode_selection[0] = false; + settings->skip_mode2 = true; + + settings->refineIterations[0] = 2; + settings->refineIterations[2] = 2; + + // mode137 + settings->mode_selection[1] = true; + settings->fastSkipTreshold_mode1 = 4; + settings->fastSkipTreshold_mode3 = 4; + settings->fastSkipTreshold_mode7 = 8; + + settings->refineIterations[1] = 1; + settings->refineIterations[3] = 1; + settings->refineIterations[7] = 2; + + // mode45 + settings->mode_selection[2] = true; + + settings->mode45_channel0 = 3; + settings->refineIterations_channel = 2; + settings->refineIterations[4] = 2; + settings->refineIterations[5] = 2; + + // mode6 + settings->mode_selection[3] = true; + + 
settings->refineIterations[6] = 2; +} + +void GetProfile_alpha_basic(bc7_enc_settings* settings) +{ + settings->channels = 4; + + // mode02 + settings->mode_selection[0] = true; + settings->skip_mode2 = true; + + settings->refineIterations[0] = 2; + settings->refineIterations[2] = 2; + + // mode137 + settings->mode_selection[1] = true; + settings->fastSkipTreshold_mode1 = 8+4; + settings->fastSkipTreshold_mode3 = 8; + settings->fastSkipTreshold_mode7 = 8; + + settings->refineIterations[1] = 2; + settings->refineIterations[3] = 2; + settings->refineIterations[7] = 2; + + // mode45 + settings->mode_selection[2] = true; + + settings->mode45_channel0 = 0; + settings->refineIterations_channel = 2; + settings->refineIterations[4] = 2; + settings->refineIterations[5] = 2; + + // mode6 + settings->mode_selection[3] = true; + + settings->refineIterations[6] = 2; +} + +void GetProfile_alpha_slow(bc7_enc_settings* settings) +{ + settings->channels = 4; + + int moreRefine = 2; + // mode02 + settings->mode_selection[0] = true; + settings->skip_mode2 = false; + + settings->refineIterations[0] = 2+moreRefine; + settings->refineIterations[2] = 2+moreRefine; + + // mode137 + settings->mode_selection[1] = true; + settings->fastSkipTreshold_mode1 = 64; + settings->fastSkipTreshold_mode3 = 64; + settings->fastSkipTreshold_mode7 = 64; + + settings->refineIterations[1] = 2+moreRefine; + settings->refineIterations[3] = 2+moreRefine; + settings->refineIterations[7] = 2+moreRefine; + + // mode45 + settings->mode_selection[2] = true; + + settings->mode45_channel0 = 0; + settings->refineIterations_channel = 2+moreRefine; + settings->refineIterations[4] = 2+moreRefine; + settings->refineIterations[5] = 2+moreRefine; + + // mode6 + settings->mode_selection[3] = true; + + settings->refineIterations[6] = 2+moreRefine; +} + +void GetProfile_bc6h_veryfast(bc6h_enc_settings* settings) +{ + settings->slow_mode = false; + settings->fast_mode = true; + settings->fastSkipTreshold = 0; + 
settings->refineIterations_1p = 0; + settings->refineIterations_2p = 0; +} + +void GetProfile_bc6h_fast(bc6h_enc_settings* settings) +{ + settings->slow_mode = false; + settings->fast_mode = true; + settings->fastSkipTreshold = 2; + settings->refineIterations_1p = 0; + settings->refineIterations_2p = 1; +} + +void GetProfile_bc6h_basic(bc6h_enc_settings* settings) +{ + settings->slow_mode = false; + settings->fast_mode = false; + settings->fastSkipTreshold = 4; + settings->refineIterations_1p = 2; + settings->refineIterations_2p = 2; +} + +void GetProfile_bc6h_slow(bc6h_enc_settings* settings) +{ + settings->slow_mode = true; + settings->fast_mode = false; + settings->fastSkipTreshold = 10; + settings->refineIterations_1p = 2; + settings->refineIterations_2p = 2; +} + +void GetProfile_bc6h_veryslow(bc6h_enc_settings* settings) +{ + settings->slow_mode = true; + settings->fast_mode = false; + settings->fastSkipTreshold = 32; + settings->refineIterations_1p = 2; + settings->refineIterations_2p = 2; +} + +void GetProfile_etc_slow(etc_enc_settings* settings) +{ + settings->fastSkipTreshold = 6; +} + +void ReplicateBorders(rgba_surface* dst_slice, const rgba_surface* src_tex, int start_x, int start_y, int bpp) +{ + int bytes_per_pixel = bpp >> 3; + + bool aliasing = false; + if (&src_tex->ptr[src_tex->stride * start_y + bytes_per_pixel * start_x] == dst_slice->ptr) aliasing = true; + + for (int y = 0; y < dst_slice->height; y++) + for (int x = 0; x < dst_slice->width; x++) + { + int xx = start_x + x; + int yy = start_y + y; + + if (aliasing && xx < src_tex->width && yy < src_tex->height) continue; + + if (xx >= src_tex->width) xx = src_tex->width - 1; + if (yy >= src_tex->height) yy = src_tex->height - 1; + + void* dst = &dst_slice->ptr[dst_slice->stride * y + bytes_per_pixel * x]; + void* src = &src_tex->ptr[src_tex->stride * yy + bytes_per_pixel * xx]; + + memcpy(dst, src, bytes_per_pixel); + } +} + +void CompressBlocksBC1(const rgba_surface* src, uint8_t* dst) +{ + 
if (isAmd) { + ispc::CompressBlocksBC1_ispc_sse4((ispc::rgba_surface*)src, dst); + } else { + ispc::CompressBlocksBC1_ispc((ispc::rgba_surface*)src, dst); + } +} + +void CompressBlocksBC3(const rgba_surface* src, uint8_t* dst) +{ + if (isAmd) { + ispc::CompressBlocksBC3_ispc_sse4((ispc::rgba_surface*)src, dst); + } else { + ispc::CompressBlocksBC3_ispc((ispc::rgba_surface*)src, dst); + } +} + +void CompressBlocksBC4(const rgba_surface* src, uint8_t* dst) +{ + if (isAmd) { + ispc::CompressBlocksBC4_ispc_sse4((ispc::rgba_surface*)src, dst); + } else { + ispc::CompressBlocksBC4_ispc((ispc::rgba_surface*)src, dst); + } +} + +void CompressBlocksBC5(const rgba_surface* src, uint8_t* dst) +{ + if (isAmd) { + ispc::CompressBlocksBC5_ispc_sse4((ispc::rgba_surface*)src, dst); + } else { + ispc::CompressBlocksBC5_ispc((ispc::rgba_surface*)src, dst); + } +} + +void CompressBlocksBC7(const rgba_surface* src, uint8_t* dst, bc7_enc_settings* settings) +{ + if (isAmd) { + ispc::CompressBlocksBC7_ispc_sse4((ispc::rgba_surface*)src, dst, (ispc::bc7_enc_settings*)settings); + } else { + ispc::CompressBlocksBC7_ispc((ispc::rgba_surface*)src, dst, (ispc::bc7_enc_settings*)settings); + } +} + +void CompressBlocksBC6H(const rgba_surface* src, uint8_t* dst, bc6h_enc_settings* settings) +{ + if (isAmd) { + ispc::CompressBlocksBC6H_ispc_sse4((ispc::rgba_surface*)src, dst, (ispc::bc6h_enc_settings*)settings); + } else { + ispc::CompressBlocksBC6H_ispc((ispc::rgba_surface*)src, dst, (ispc::bc6h_enc_settings*)settings); + } +} + +void CompressBlocksETC1(const rgba_surface* src, uint8_t* dst, etc_enc_settings* settings) +{ + if (isAmd) { + ispc::CompressBlocksETC1_ispc_sse4((ispc::rgba_surface*)src, dst, (ispc::etc_enc_settings*)settings); + } else { + ispc::CompressBlocksETC1_ispc((ispc::rgba_surface*)src, dst, (ispc::etc_enc_settings*)settings); + } +} + +int ISPCIsa() +{ + if (isAmd) { + return ispc::ISPCIsa_ispc_sse4(); + } else { + return ispc::ISPCIsa_ispc(); + } +} \ No newline at end of 
file diff --git a/Source/ispc_texcomp/ispc_texcomp.def b/Source/ispc_texcomp/ispc_texcomp.def new file mode 100644 index 0000000..8f4c26c --- /dev/null +++ b/Source/ispc_texcomp/ispc_texcomp.def @@ -0,0 +1,30 @@ +LIBRARY ispc_texcomp +EXPORTS + CompressBlocksBC1 + CompressBlocksBC3 + CompressBlocksBC4 + CompressBlocksBC5 + CompressBlocksBC6H + CompressBlocksBC7 + CompressBlocksETC1 + CompressBlocksASTC + GetProfile_ultrafast + GetProfile_veryfast + GetProfile_fast + GetProfile_basic + GetProfile_slow + GetProfile_alpha_ultrafast + GetProfile_alpha_veryfast + GetProfile_alpha_fast + GetProfile_alpha_basic + GetProfile_alpha_slow + GetProfile_bc6h_veryfast + GetProfile_bc6h_fast + GetProfile_bc6h_basic + GetProfile_bc6h_slow + GetProfile_bc6h_veryslow + GetProfile_etc_slow + GetProfile_astc_fast + GetProfile_astc_alpha_fast + GetProfile_astc_alpha_slow + ReplicateBorders diff --git a/Source/ispc_texcomp/ispc_texcomp.h b/Source/ispc_texcomp/ispc_texcomp.h new file mode 100644 index 0000000..6caf828 --- /dev/null +++ b/Source/ispc_texcomp/ispc_texcomp.h @@ -0,0 +1,128 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-2019, Intel Corporation +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. +//////////////////////////////////////////////////////////////////////////////// + +#include + +struct rgba_surface +{ + uint8_t* ptr; + int32_t width; + int32_t height; + int32_t stride; // in bytes +}; + +struct bc7_enc_settings +{ + bool mode_selection[4]; + int refineIterations[8]; + + bool skip_mode2; + int fastSkipTreshold_mode1; + int fastSkipTreshold_mode3; + int fastSkipTreshold_mode7; + + int mode45_channel0; + int refineIterations_channel; + + int channels; +}; + +struct bc6h_enc_settings +{ + bool slow_mode; + bool fast_mode; + int refineIterations_1p; + int refineIterations_2p; + int fastSkipTreshold; +}; + +struct etc_enc_settings +{ + int fastSkipTreshold; +}; + +struct astc_enc_settings +{ + int block_width; + int block_height; + int channels; + + int fastSkipTreshold; + int refineIterations; +}; + +// profiles for RGB data (alpha channel will be ignored) +extern "C" void GetProfile_ultrafast(bc7_enc_settings* settings); +extern "C" void GetProfile_veryfast(bc7_enc_settings* settings); +extern "C" void GetProfile_fast(bc7_enc_settings* settings); +extern "C" void GetProfile_basic(bc7_enc_settings* settings); +extern "C" void GetProfile_slow(bc7_enc_settings* settings); + +// profiles for RGBA inputs +extern "C" void GetProfile_alpha_ultrafast(bc7_enc_settings* settings); +extern "C" void GetProfile_alpha_veryfast(bc7_enc_settings* settings); +extern "C" void GetProfile_alpha_fast(bc7_enc_settings* settings); +extern "C" void GetProfile_alpha_basic(bc7_enc_settings* 
settings); +extern "C" void GetProfile_alpha_slow(bc7_enc_settings* settings); + +// profiles for BC6H (RGB HDR) +extern "C" void GetProfile_bc6h_veryfast(bc6h_enc_settings* settings); +extern "C" void GetProfile_bc6h_fast(bc6h_enc_settings* settings); +extern "C" void GetProfile_bc6h_basic(bc6h_enc_settings* settings); +extern "C" void GetProfile_bc6h_slow(bc6h_enc_settings* settings); +extern "C" void GetProfile_bc6h_veryslow(bc6h_enc_settings* settings); + +// profiles for ETC +extern "C" void GetProfile_etc_slow(etc_enc_settings* settings); + +// profiles for ASTC +extern "C" void GetProfile_astc_fast(astc_enc_settings* settings, int block_width, int block_height); +extern "C" void GetProfile_astc_alpha_fast(astc_enc_settings* settings, int block_width, int block_height); +extern "C" void GetProfile_astc_alpha_slow(astc_enc_settings* settings, int block_width, int block_height); + +// helper function to replicate border pixels for the desired block sizes (bpp = 32 or 64) +extern "C" void ReplicateBorders(rgba_surface* dst_slice, const rgba_surface* src_tex, int x, int y, int bpp); + +/* +Notes: + - input width and height need to be a multiple of block size + - LDR input is 32 bit/pixel (sRGB), HDR is 64 bit/pixel (half float) + - for BC4 input is 8bit/pixel (R8), for BC5 input is 16bit/pixel (RG8) + - dst buffer must be allocated with enough space for the compressed texture: + - 8 bytes/block for BC1/BC4/ETC1, + - 16 bytes/block for BC3/BC5/BC6H/BC7/ASTC + - the blocks are stored in raster scan order (natural CPU texture layout) + - use the GetProfile_* functions to select various speed/quality tradeoffs + - the RGB profiles are slightly faster as they ignore the alpha channel +*/ + +extern "C" void CompressBlocksBC1(const rgba_surface* src, uint8_t* dst); +extern "C" void CompressBlocksBC3(const rgba_surface* src, uint8_t* dst); +extern "C" void CompressBlocksBC4(const rgba_surface* src, uint8_t* dst); +extern "C" void CompressBlocksBC5(const rgba_surface* 
src, uint8_t* dst); +extern "C" void CompressBlocksBC6H(const rgba_surface* src, uint8_t* dst, bc6h_enc_settings* settings); +extern "C" void CompressBlocksBC7(const rgba_surface* src, uint8_t* dst, bc7_enc_settings* settings); +extern "C" void CompressBlocksETC1(const rgba_surface* src, uint8_t* dst, etc_enc_settings* settings); +extern "C" void CompressBlocksASTC(const rgba_surface* src, uint8_t* dst, astc_enc_settings* settings); + +extern "C" void ISPCInit(); +extern "C" int ISPCIsa(); \ No newline at end of file diff --git a/Source/ispc_texcomp/ispc_texcomp.vcxproj b/Source/ispc_texcomp/ispc_texcomp.vcxproj new file mode 100644 index 0000000..0713ed1 --- /dev/null +++ b/Source/ispc_texcomp/ispc_texcomp.vcxproj @@ -0,0 +1,177 @@ +ļ»æ + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {9B44F7B9-A9AF-45A4-8695-96792A18B052} + ispc_texcomp + 10.0 + + + + DynamicLibrary + true + MultiByte + v142 + + + DynamicLibrary + true + MultiByte + v142 + + + DynamicLibrary + false + true + MultiByte + v142 + + + DynamicLibrary + false + true + MultiByte + v142 + + + + + + + + + + + + + + + + + + + $(SolutionDir);$(ExecutablePath) + + + $(SolutionDir);$(ExecutablePath) + + + $(SolutionDir);$(ExecutablePath) + + + $(SolutionDir);$(ExecutablePath) + + + + Level3 + Disabled + + + true + Windows + ispc_texcomp.def + + + + + Level3 + Disabled + + + true + ispc_texcomp.def + + + + + Level3 + MaxSpeed + true + true + + + true + true + true + ispc_texcomp.def + false + + + + + Level3 + MaxSpeed + true + true + + + true + true + true + ispc_texcomp.def + + + + + + + + + + + + + + + + + + + + + + Document + "$(ProjectDir)..\ISPC\win\ispc.exe" -O2 "%(Filename).ispc" -o "$(TargetDir)%(Filename).obj" -h "$(ProjectDir)%(Filename)_ispc.h" --arch=x86 --target=sse2,sse4,avx,avx2 --opt=fast-math + "$(ProjectDir)..\ISPC\win\ispc.exe" -O2 "%(Filename).ispc" -o "$(TargetDir)%(Filename).obj" -h "$(ProjectDir)%(Filename)_ispc.h" --target=sse2,sse4,avx,avx2 
--opt=fast-math + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_avx2.obj; + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_avx2.obj; + "$(ProjectDir)..\ISPC\win\ispc.exe" -O2 "%(Filename).ispc" -o "$(TargetDir)%(Filename).obj" -h "$(ProjectDir)%(Filename)_ispc.h" --arch=x86 --target=sse2,sse4,avx,avx2 --opt=fast-math + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_avx2.obj; + "$(ProjectDir)..\ISPC\win\ispc.exe" -O2 "%(Filename).ispc" -o "$(TargetDir)%(Filename).obj" -h "$(ProjectDir)%(Filename)_ispc.h" --target=sse2,sse4,avx,avx2 --opt=fast-math + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_avx2.obj; + + + + + Document + "$(ProjectDir)..\ISPC\win\ispc.exe" -O2 "%(Filename).ispc" -o "$(TargetDir)%(Filename).obj" -h "$(ProjectDir)%(Filename)_ispc.h" --target=sse2,sse4,avx,avx2 --opt=fast-math + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_avx2.obj; + "$(ProjectDir)..\ISPC\win\ispc.exe" -O2 "%(Filename).ispc" -o "$(TargetDir)%(Filename).obj" -h "$(ProjectDir)%(Filename)_ispc.h" --target=avx --opt=fast-math + $(TargetDir)%(Filename).obj; + "$(ProjectDir)..\ISPC\win\ispc.exe" -O2 "%(Filename).ispc" -o "$(TargetDir)%(Filename).obj" -h "$(ProjectDir)%(Filename)_ispc.h" --arch=x86 --target=sse2,sse4,avx,avx2 --opt=fast-math + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_avx2.obj; + "$(ProjectDir)..\ISPC\win\ispc.exe" -O2 "%(Filename).ispc" -o 
"$(TargetDir)%(Filename).obj" -h "$(ProjectDir)%(Filename)_ispc.h" --arch=x86 --target=avx --opt=fast-math + $(TargetDir)%(Filename).obj; + + + + + + \ No newline at end of file diff --git a/Source/ispc_texcomp/ispc_texcomp.vcxproj.filters b/Source/ispc_texcomp/ispc_texcomp.vcxproj.filters new file mode 100644 index 0000000..7a98ac4 --- /dev/null +++ b/Source/ispc_texcomp/ispc_texcomp.vcxproj.filters @@ -0,0 +1,62 @@ +ļ»æ + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + + + + + Source Files + + + Source Files + + + + + Source Files + + + Source Files + + + + + Source Files + + + Generated Header Files + + + Generated Header Files + + + Generated Header Files + + + Generated Header Files + + + Generated Header Files + + + Generated Header Files + + + Generated Header Files + + + Generated Header Files + + + Generated Header Files + + + Generated Header Files + + + diff --git a/Source/ispc_texcomp/ispc_texcomp_astc.cpp b/Source/ispc_texcomp/ispc_texcomp_astc.cpp new file mode 100644 index 0000000..045651a --- /dev/null +++ b/Source/ispc_texcomp/ispc_texcomp_astc.cpp @@ -0,0 +1,564 @@ +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016, Intel Corporation +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of +// the Software. 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "ispc_texcomp.h" +#include "kernel_astc_ispc.h" +#include +#include +#include +#include +#include + +void GetProfile_astc_fast(astc_enc_settings* settings, int block_width, int block_height) +{ + settings->block_width = block_width; + settings->block_height = block_height; + settings->channels = 3; + + settings->fastSkipTreshold = 5; + settings->refineIterations = 2; +} + +void GetProfile_astc_alpha_fast(astc_enc_settings* settings, int block_width, int block_height) +{ + settings->block_width = block_width; + settings->block_height = block_height; + settings->channels = 4; + + settings->fastSkipTreshold = 5; + settings->refineIterations = 2; +} + +void GetProfile_astc_alpha_slow(astc_enc_settings* settings, int block_width, int block_height) +{ + settings->block_width = block_width; + settings->block_height = block_height; + settings->channels = 4; + + settings->fastSkipTreshold = 64; + settings->refineIterations = 2; +} + +struct astc_block +{ + int width; + int height; + uint8_t dual_plane; + int weight_range; + uint8_t weights[64]; + int color_component_selector; + + int partitions; + int partition_id; + int color_endpoint_pairs; + int channels; + int color_endpoint_modes[4]; + int endpoint_range; + uint8_t endpoints[18]; +}; + +bool can_store(int value, int bits) +{ + if (value < 0) return false; + if (value >= 1 << bits) return false; + return true; +} + 
+int pack_block_mode(astc_block* block) +{ + int block_mode = 0; + + int D = block->dual_plane; + int H = block->weight_range >= 6; + int DH = D * 2 + H; + int R = block->weight_range + 2 - ((H > 0) ? 6 : 0); + R = R / 2 + R % 2 * 4; + + if (can_store(block->width - 4, 2) && can_store(block->height - 2, 2)) + { + int B = block->width - 4; + int A = block->height - 2; + + block_mode = (DH << 9) | (B << 7) | (A << 5) | ((R & 4) << 2) | (R & 3); + } + + if (can_store(block->width - 8, 2) && can_store(block->height - 2, 2)) + { + int B = block->width - 8; + int A = block->height - 2; + + block_mode = (DH << 9) | (B << 7) | (A << 5) | ((R & 4) << 2) | 4 | (R & 3); + } + + if (can_store(block->width - 2, 2) && can_store(block->height - 8, 2)) + { + int A = block->width - 2; + int B = block->height - 8; + + block_mode = (DH << 9) | (B << 7) | (A << 5) | ((R & 4) << 2) | 8 | (R & 3); + } + + if (can_store(block->width - 2, 2) && can_store(block->height - 6, 1)) + { + int A = block->width - 2; + int B = block->height - 6; + + block_mode = (DH << 9) | (B << 7) | (A << 5) | ((R & 4) << 2) | 12 | (R & 3); + } + + if (can_store(block->width - 2, 1) && can_store(block->height - 2, 2)) + { + int B = block->width; + int A = block->height - 2; + + block_mode = (DH << 9) | (B << 7) | (A << 5) | ((R & 4) << 2) | 12 | (R & 3); + } + + if (DH == 0 && can_store(block->width - 6, 2) && can_store(block->height - 6, 2)) + { + int A = block->width - 6; + int B = block->height - 6; + + block_mode = (B << 9) | 256 | (A << 5) | (R << 2); + } + + return block_mode; +} + +int range_table[][3] = +{ + //2^ 3^ 5^ + { 1, 0, 0 }, // 0..1 + { 0, 1, 0 }, // 0..2 + { 2, 0, 0 }, // 0..3 + + { 0, 0, 1 }, // 0..4 + { 1, 1, 0 }, // 0..5 + { 3, 0, 0 }, // 0..7 + + { 1, 0, 1 }, // 0..9 + { 2, 1, 0 }, // 0..11 + { 4, 0, 0 }, // 0..15 + + { 2, 0, 1 }, // 0..19 + { 3, 1, 0 }, // 0..23 + { 5, 0, 0 }, // 0..31 + + { 3, 0, 1 }, // 0..39 + { 4, 1, 0 }, // 0..47 + { 6, 0, 0 }, // 0..63 + + { 4, 0, 1 }, // 0..79 + { 
5, 1, 0 }, // 0..95 + { 7, 0, 0 }, // 0..127 + + { 5, 0, 1 }, // 0..159 + { 6, 1, 0 }, // 0..191 + { 8, 0, 0 }, // 0..255 +}; + +int get_levels(int range) +{ + return (1 + 2 * range_table[range][1] + 4 * range_table[range][2]) << range_table[range][0]; +} + +int sequence_bits(int count, int range) +{ + int bits = count * range_table[range][0]; + bits += (count * range_table[range][1] * 8 + 4) / 5; + bits += (count * range_table[range][2] * 7 + 2) / 3; + return bits; +} + +void set_bits(uint32_t data[4], int* pos, int bits, uint32_t value) +{ + assert(bits <= 25); + uint32_t word = *(uint32_t*)(((uint8_t*)data) + *pos / 8); + + uint32_t mask = (1 << bits) - 1; + word |= value << (*pos % 8); + + *(uint32_t*)(((uint8_t*)data) + *pos / 8) = word; + *pos += bits; +} + +uint32_t get_field(uint32_t input, int a, int b) +{ + assert(a >= b); + return (input >> b) & ((1 << (a - b + 1)) - 1); +} + +uint32_t get_bit(uint32_t input, int a) +{ + return get_field(input, a, a); +} + +void pack_five_trits(uint32_t data[4], int sequence[5], int* pos, int n) +{ + int t[5]; + int m[5]; + + for (int i = 0; i < 5; i++) + { + t[i] = sequence[i] >> n; + m[i] = sequence[i] - (t[i] << n); + } + + int C; + + if (t[1] == 2 && t[2] == 2) + { + C = 3 * 4 + t[0]; + } + else if (t[2] == 2) + { + C = t[1] * 16 + t[0] * 4 + 3; + } + else + { + C = t[2] * 16 + t[1] * 4 + t[0]; + } + + int T; + + if (t[3] == 2 && t[4] == 2) + { + T = get_field(C, 4, 2) * 32 + 7 * 4 + get_field(C, 1, 0); + } + else + { + T = get_field(C, 4, 0); + if (t[4] == 2) + { + T += t[3] * 128 + 3 * 32; + } + else + { + T += t[4] * 128 + t[3] * 32; + } + } + + uint32_t pack1 = 0; + pack1 |= m[0]; + pack1 |= get_field(T, 1, 0) << n; + pack1 |= m[1] << (2 + n); + + uint32_t pack2 = 0; + pack2 |= get_field(T, 3, 2); + pack2 |= m[2] << 2; + pack2 |= get_field(T, 4, 4) << (2 + n); + pack2 |= m[3] << (3 + n); + pack2 |= get_field(T, 6, 5) << (3 + n * 2); + pack2 |= m[4] << (5 + n * 2); + pack2 |= get_field(T, 7, 7) << (5 + n * 3); + + 
set_bits(data, pos, 2 + n * 2, pack1); + set_bits(data, pos, 6 + n * 3, pack2); +} + +void pack_three_quint(uint32_t data[4], int sequence[3], int* pos, int n) +{ + int q[3]; + int m[3]; + + for (int i = 0; i < 3; i++) + { + q[i] = sequence[i] >> n; + m[i] = sequence[i] - (q[i] << n); + } + + int Q; + + if (q[0] == 4 && q[1] == 4) + { + Q = get_field(q[2], 1, 0) * 8 + 3 * 2 + get_bit(q[2], 2); + } + else + { + int C; + if (q[1] == 4) + { + C = (q[0] << 3) + 5; + } + else + { + C = (q[1] << 3) + q[0]; + } + + if (q[2] == 4) + { + Q = get_field(~C, 2, 1) * 32 + get_field(C, 4, 3) * 8 + 3 * 2 + get_bit(C, 0); + } + else + { + Q = q[2] * 32 + get_field(C, 4, 0); + } + } + + uint32_t pack = 0; + pack |= m[0]; + pack |= get_field(Q, 2, 0) << n; + pack |= m[1] << (3 + n); + pack |= get_field(Q, 4, 3) << (3 + n * 2); + pack |= m[2] << (5 + n * 2); + pack |= get_field(Q, 6, 5) << (5 + n * 3); + + set_bits(data, pos, 7 + n * 3, pack); +} + +void pack_integer_sequence(uint32_t output_data[4], uint8_t sequence[], int pos, int count, int range) +{ + int n = range_table[range][0]; + int bits = sequence_bits(count, range); + int pos0 = pos; + + uint32_t data[5] = { 0 }; + if (range_table[range][1] == 1) + { + for (int j = 0; j < (count + 4) / 5; j++) + { + int temp[5] = { 0 }; + for (int i = 0; i < std::min(count - j * 5, 5); i++) temp[i] = sequence[j * 5 + i]; + pack_five_trits(data, temp, &pos, n); + } + } + else if (range_table[range][2] == 1) + { + for (int j = 0; j < (count + 2) / 3; j++) + { + int temp[3] = { 0 }; + for (int i = 0; i < std::min(count - j * 3, 3); i++) temp[i] = sequence[j * 3 + i]; + pack_three_quint(data, temp, &pos, n); + } + } + else + { + for (int i = 0; i < count; i++) + { + set_bits(data, &pos, n, sequence[i]); + } + } + + if (pos0 + bits < 96) data[3] = 0; + if (pos0 + bits < 64) data[2] = 0; + if (pos0 + bits < 32) data[1] = 0; + data[(pos0 + bits) / 32] &= (1 << ((pos0 + bits) % 32)) - 1; + + for (int k = 0; k < 4; k++) output_data[k] |= data[k]; 
+} + +uint32_t reverse_bits_32(uint32_t input) +{ + uint32_t t = input; + t = (t << 16) | (t >> 16); + t = ((t & 0x00FF00FF) << 8) | ((t & 0xFF00FF00) >> 8); + t = ((t & 0x0F0F0F0F) << 4) | ((t & 0xF0F0F0F0) >> 4); + t = ((t & 0x33333333) << 2) | ((t & 0xCCCCCCCC) >> 2); + t = ((t & 0x55555555) << 1) | ((t & 0xAAAAAAAA) >> 1); + + return t; +} + +void pack_block(uint32_t data[4], astc_block* block) +{ + memset(data, 0, 16); + + int pos = 0; + set_bits(data, &pos, 11, pack_block_mode(block)); + + int num_weights = block->width * block->height * (block->dual_plane ? 2 : 1); + int weight_bits = sequence_bits(num_weights, block->weight_range); + int extra_bits = 0; + + assert(num_weights <= 64); + assert(24 <= weight_bits && weight_bits <= 96); + + set_bits(data, &pos, 2, block->partitions - 1); + if (block->partitions > 1) + { + set_bits(data, &pos, 10, block->partition_id); + + int min_cem = 16; + int max_cem = 0; + for (int j = 0; j < block->partitions; j++) + { + min_cem = std::min(min_cem, block->color_endpoint_modes[j]); + max_cem = std::max(max_cem, block->color_endpoint_modes[j]); + } + assert(max_cem / 4 <= min_cem / 4 + 1); + + int CEM = block->color_endpoint_modes[0] << 2; + if (max_cem != min_cem) + { + CEM = std::min(3, min_cem / 4 + 1); + for (int j = 0; j < block->partitions; j++) + { + int c = block->color_endpoint_modes[j] / 4 - ((CEM & 3) - 1); + int m = block->color_endpoint_modes[j] % 4; + assert(c == 0 || c == 1); + CEM |= c << (2 + j); + CEM |= m << (2 + block->partitions + 2 * j); + } + extra_bits = 3 * block->partitions - 4; + int pos2 = 128 - weight_bits - extra_bits; + set_bits(data, &pos2, extra_bits, CEM >> 6); + } + + set_bits(data, &pos, 6, CEM & 63); + } + else + { + set_bits(data, &pos, 4, block->color_endpoint_modes[0]); + } + + if (block->dual_plane) + { + assert(block->partitions < 4); + extra_bits += 2; + int pos2 = 128 - weight_bits - extra_bits; + set_bits(data, &pos2, 2, block->color_component_selector); + } + + int config_bits = 
pos + extra_bits; + int remaining_bits = 128 - config_bits - weight_bits; + + int num_cem_pairs = 0; + for (int j = 0; j < block->partitions; j++) num_cem_pairs += 1 + block->color_endpoint_modes[j] / 4; + + assert(num_cem_pairs <= 9); + + int endpoint_range = -1; + for (int range = 20; range>0; range--) + { + int bits = sequence_bits(2 * num_cem_pairs, range); + if (bits <= remaining_bits) + { + endpoint_range = range; + break; + } + } + + assert(endpoint_range >= 4); + assert(block->endpoint_range == endpoint_range); + + pack_integer_sequence(data, block->endpoints, pos, 2 * num_cem_pairs, endpoint_range); + + uint32_t rdata[4] = { 0, 0, 0, 0 }; + pack_integer_sequence(rdata, block->weights, 0, num_weights, block->weight_range); + + for (int i = 0; i < 4; i++) data[i] |= reverse_bits_32(rdata[3 - i]); +} + +void atsc_rank(const rgba_surface* src, int xx, int yy, uint32_t* mode_buffer, astc_enc_settings* settings) +{ + ispc::astc_rank_ispc((ispc::rgba_surface*)src, xx, yy, mode_buffer, (ispc::astc_enc_settings*)settings); +} + +extern "C" void pack_block_c(uint32_t data[4], ispc::astc_block* block) +{ + assert(sizeof(ispc::astc_block) == sizeof(astc_block)); + pack_block(data, (astc_block*)block); +} + +void setup_list_context(ispc::astc_enc_context* ctx, uint32_t packed_mode) +{ + ctx->width = 2 + get_field(packed_mode, 15, 13); // 2..8 <= 2^3 + ctx->height = 2 + get_field(packed_mode, 18, 16); // 2..8 <= 2^3 + ctx->dual_plane = get_field(packed_mode, 19, 19); // 0 or 1 + ctx->partitions = 1; + + int color_endpoint_modes0 = get_field(packed_mode, 7, 6) * 2 + 6; // 6, 8, 10 or 12 + ctx->color_endpoint_pairs = 1 + (color_endpoint_modes0 / 4); + + ctx->channels = (color_endpoint_modes0 > 8) ? 
4 : 3; +} + +void astc_encode(const rgba_surface* src, float* block_scores, uint8_t* dst, uint64_t* list, astc_enc_settings* settings) +{ + ispc::astc_enc_context list_context; + setup_list_context(&list_context, uint32_t(list[1] & 0xFFFFFFFF)); + + assert(sizeof(ispc::rgba_surface) == sizeof(rgba_surface)); + assert(sizeof(ispc::astc_enc_settings) == sizeof(astc_enc_settings)); + ispc::astc_encode_ispc((ispc::rgba_surface*)src, block_scores, dst, list, &list_context, (ispc::astc_enc_settings*)settings); +} + +void CompressBlocksASTC(const rgba_surface* src, uint8_t* dst, astc_enc_settings* settings) +{ + assert(src->height % settings->block_height == 0); + assert(src->width % settings->block_width == 0); + + assert(settings->block_height <= 8); + assert(settings->block_width <= 8); + + int tex_width = src->width / settings->block_width; + int programCount = ispc::get_programCount(); + + std::vector block_scores(tex_width * src->height / settings->block_height); + + for (int yy = 0; yy < src->height / settings->block_height; yy++) + for (int xx = 0; xx < tex_width; xx++) + { + block_scores[yy * tex_width + xx] = std::numeric_limits::infinity(); + } + + int mode_list_size = 3334; + int list_size = programCount; + std::vector mode_lists(list_size * mode_list_size); + std::vector mode_buffer(programCount * settings->fastSkipTreshold); + + for (int yy = 0; yy < src->height / settings->block_height; yy++) + for (int _x = 0; _x < (tex_width + programCount - 1) / programCount; _x++) + { + int xx = _x * programCount; + atsc_rank(src, xx, yy, mode_buffer.data(), settings); + + for (int i = 0; i < settings->fastSkipTreshold; i++) + for (int k = 0; k < programCount; k++) + { + if (xx + k >= tex_width) continue; + + uint32_t offset = (yy << 16) + (xx + k); + uint32_t mode = mode_buffer[programCount * i + k]; + int mode_bin = mode >> 20; + uint64_t* mode_list = &mode_lists[list_size * mode_bin]; + + if (*mode_list < programCount - 1) + { + int index = int(mode_list[0] + 1); + 
mode_list[0] = index; + + mode_list[index] = (uint64_t(offset) << 32) + mode; + } + else + { + mode_list[0] = (uint64_t(offset) << 32) + mode; + + astc_encode(src, block_scores.data(), dst, mode_list, settings); + memset(mode_list, 0, list_size * sizeof(uint64_t)); + } + } + } + + for (int mode_bin = 0; mode_bin < mode_list_size; mode_bin++) + { + uint64_t* mode_list = &mode_lists[list_size * mode_bin]; + if (mode_list[0] == 0) continue; + mode_list[0] = 0; + + astc_encode(src, block_scores.data(), dst, mode_list, settings); + memset(mode_list, 0, list_size * sizeof(uint64_t)); + } +} diff --git a/Source/ispc_texcomp/kernel.ispc b/Source/ispc_texcomp/kernel.ispc new file mode 100644 index 0000000..9a284b5 --- /dev/null +++ b/Source/ispc_texcomp/kernel.ispc @@ -0,0 +1,3798 @@ +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016, Intel Corporation +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of +// the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifndef ISPC_UINT_IS_DEFINED +//these are defined in ISPC version 1.13.0 and later +typedef unsigned int8 uint8; +typedef unsigned int32 uint32; +typedef unsigned int64 uint64; +#endif + +/////////////////////////// +// generic helpers + +inline float RCP(float x) +{ + return 1.0f/x; // uses rcp when compiled with --opt=fast-math + //return rcp(x); + //return rcp_fast(x); +} + +inline float RSQRT(float x) +{ + return 1.0f/sqrt(x); // uses rsqrt when compiled with --opt=fast-math + //return rsqrt(x); + //return rsqrt_fast(x); +} + +inline void swap_ints(int u[], int v[], uniform int n) +{ + for (uniform int i=0; i>bits; // (perf warning expected) +} + +/////////////////////////////////////////////////////////// +// BC1/BC7 shared + +struct rgba_surface +{ + uint8* ptr; + int width, height, stride; +}; + +inline void load_block_interleaved(float block[48], uniform rgba_surface* uniform src, int xx, uniform int yy) +{ + for (uniform int y = 0; y<4; y++) + for (uniform int x = 0; x<4; x++) + { + uniform unsigned int32* uniform src_ptr = (unsigned int32*)&src->ptr[(yy * 4 + y)*src->stride]; + unsigned int32 rgba = gather_uint(src_ptr, xx * 4 + x); + + block[16 * 0 + y * 4 + x] = (int)((rgba >> 0) & 255); + block[16 * 1 + y * 4 + x] = (int)((rgba >> 8) & 255); + block[16 * 2 + y * 4 + x] = (int)((rgba >> 16) & 255); + } +} + +inline void load_block_interleaved_rgba(float block[64], uniform rgba_surface* uniform src, int xx, uniform int yy) +{ + for (uniform int y=0; y<4; y++) + for (uniform int x=0; x<4; x++) + { + uniform unsigned int32* uniform src_ptr = (unsigned 
int32*)&src->ptr[(yy*4+y)*src->stride]; + unsigned int32 rgba = gather_uint(src_ptr, xx*4+x); + + block[16*0+y*4+x] = (int)((rgba>> 0)&255); + block[16*1+y*4+x] = (int)((rgba>> 8)&255); + block[16*2+y*4+x] = (int)((rgba>>16)&255); + block[16*3+y*4+x] = (int)((rgba>>24)&255); + } +} + +inline void load_block_interleaved_16bit(float block[48], uniform rgba_surface* uniform src, int xx, uniform int yy) +{ + for (uniform int y = 0; y<4; y++) + for (uniform int x = 0; x<4; x++) + { + uniform unsigned int32* uniform src_ptr_r = (unsigned int32*)&src->ptr[(yy * 4 + y)*src->stride + 0]; + uniform unsigned int32* uniform src_ptr_g = (unsigned int32*)&src->ptr[(yy * 4 + y)*src->stride + 2]; + uniform unsigned int32* uniform src_ptr_b = (unsigned int32*)&src->ptr[(yy * 4 + y)*src->stride + 4]; + unsigned int32 xr = gather_uint(src_ptr_r, (xx * 4 + x) * 2); + unsigned int32 xg = gather_uint(src_ptr_g, (xx * 4 + x) * 2); + unsigned int32 xb = gather_uint(src_ptr_b, (xx * 4 + x) * 2); + + block[16 * 0 + y * 4 + x] = (int)(xr & 0xFFFF); + block[16 * 1 + y * 4 + x] = (int)(xg & 0xFFFF); + block[16 * 2 + y * 4 + x] = (int)(xb & 0xFFFF); + block[16 * 3 + y * 4 + x] = 0; + } +} + +inline void load_block_r_8bit(float block[16], uniform rgba_surface* uniform src, int xx, uniform int yy) +{ + for (uniform int y=0; y<4; y++) + { + uniform unsigned int32* uniform src_ptr = (unsigned int32*)&src->ptr[(yy*4+y)*src->stride]; + unsigned int32 rrrr = gather_uint(src_ptr, xx); + + block[y*4+0] = (int)((rrrr>> 0)&255); + block[y*4+1] = (int)((rrrr>> 8)&255); + block[y*4+2] = (int)((rrrr>>16)&255); + block[y*4+3] = (int)((rrrr>>24)&255); + } +} + +inline void load_block_interleaved_rg_8bit(float block[32], uniform rgba_surface* uniform src, int xx, uniform int yy) +{ + for (uniform int y=0; y<4; y++) + { + uniform unsigned int32* uniform src_ptr = (unsigned int32*)&src->ptr[(yy*4+y)*src->stride]; + unsigned int32 rgrg0 = gather_uint(src_ptr, xx * 2 + 0); + unsigned int32 rgrg1 = 
gather_uint(src_ptr, xx * 2 + 1); + + // r + block[16*0+y*4+0] = (int)((rgrg0>> 0)&255); + block[16*0+y*4+1] = (int)((rgrg0>>16)&255); + block[16*0+y*4+2] = (int)((rgrg1>> 0)&255); + block[16*0+y*4+3] = (int)((rgrg1>>16)&255); + // g + block[16*1+y*4+0] = (int)((rgrg0>> 8)&255); + block[16*1+y*4+1] = (int)((rgrg0>>24)&255); + block[16*1+y*4+2] = (int)((rgrg1>> 8)&255); + block[16*1+y*4+3] = (int)((rgrg1>>24)&255); + } +} + +inline void store_data(uniform uint8 dst[], int width, int xx, uniform int yy, uint32 data[], int data_size) +{ + for (uniform int k=0; k> 8)) >> 8; +} + +inline unsigned int16 stb__As16Bit(int r, int g, int b) +{ + return (stb__Mul8Bit(r,31) << 11) + (stb__Mul8Bit(g,63) << 5) + stb__Mul8Bit(b,31); +} + +inline unsigned int16 enc_rgb565(float c[3]) +{ + return stb__As16Bit((int)c[0], (int)c[1], (int)c[2]); +} + +inline void dec_rgb565(float c[3], int p) +{ + int c2 = (p>>0)&31; + int c1 = (p>>5)&63; + int c0 = (p>>11)&31; + + c[0] = (c0<<3)+(c0>>2); + c[1] = (c1<<2)+(c1>>4); + c[2] = (c2<<3)+(c2>>2); +} + +inline void pick_endpoints_dc(int c0[3], int c1[3], int block[48], int iaxis[3]) +{ + for (uniform int p=0; p<3; p++) + for (uniform int y=0; y<4; y++) + for (uniform int x=0; x<4; x++) + { + c0[p] += block[p*16+y*4+x]; + } + + for (uniform int p=0; p<3; p++) + c0[p] >>= 4; +} + +inline void pick_endpoints(float c0[3], float c1[3], float block[48], float axis[3], float dc[3]) +{ + float min_dot = 256*256; + float max_dot = 0; + + for (uniform int y=0; y<4; y++) + for (uniform int x=0; x<4; x++) + { + float dot = 0; + for (uniform int p=0; p<3; p++) + dot += (block[p*16+y*4+x]-dc[p])*axis[p]; + + min_dot = min(min_dot, dot); + max_dot = max(max_dot, dot); + } + + if (max_dot-min_dot < 1.0f) + { + min_dot -= 0.5f; + max_dot += 0.5f; + } + + float norm_sq = 0; + for (uniform int p=0; p<3; p++) + norm_sq += axis[p]*axis[p]; + + float rnorm_sq = RCP(norm_sq); + for (uniform int p=0; p<3; p++) + { + c0[p] = clamp(dc[p]+min_dot*rnorm_sq*axis[p], 0, 
255); + c1[p] = clamp(dc[p]+max_dot*rnorm_sq*axis[p], 0, 255); + } +} + +inline uint32 fast_quant(float block[48], int p0, int p1) +{ + float c0[3]; + float c1[3]; + dec_rgb565(c0, p0); + dec_rgb565(c1, p1); + + float dir[3]; + for (uniform int p=0; p<3; p++) dir[p] = c1[p]-c0[p]; + + float sq_norm = 0; + for (uniform int p=0; p<3; p++) sq_norm += sq(dir[p]); + + float rsq_norm = RCP(sq_norm); + + for (uniform int p=0; p<3; p++) dir[p] *= rsq_norm*3; + + float bias = 0.5; + for (uniform int p=0; p<3; p++) bias -= c0[p]*dir[p]; + + uint32 bits = 0; + uint32 scaler = 1; + for (uniform int k=0; k<16; k++) + { + float dot = 0; + for (uniform int p=0; p<3; p++) + dot += block[k+p*16]*dir[p]; + + int q = clamp((int)(dot+bias), 0, 3); + + //bits += q<<(k*2); + bits += q*scaler; + scaler *= 4; + } + + return bits; +} + +inline void compute_covar_dc(float covar[6], float dc[3], float block[48]) +{ + for (uniform int i=0; i<6; i++) covar[i] = 0; + for (uniform int p=0; p<3; p++) dc[p] = 0; + + for (uniform int k=0; k<16; k++) + { + for (uniform int p=0; p<3; p++) + dc[p] += block[k+p*16]; + } + + for (uniform int p=0; p<3; p++) dc[p] /= 16; + + for (uniform int k=0; k<16; k++) + { + float rgb[3]; + for (uniform int p=0; p<3; p++) + rgb[p] = block[k+p*16]-dc[p]; + + covar[0] += rgb[0]*rgb[0]; + covar[1] += rgb[0]*rgb[1]; + covar[2] += rgb[0]*rgb[2]; + + covar[3] += rgb[1]*rgb[1]; + covar[4] += rgb[1]*rgb[2]; + + covar[5] += rgb[2]*rgb[2]; + } +} + +// ugly, but makes BC1 compression 20% faster overall +inline void compute_covar_dc_ugly(float covar[6], float dc[3], float block[48]) +{ + for (uniform int p=0; p<3; p++) + { + float acc = 0; + for (uniform int k=0; k<16; k++) + acc += block[k+p*16]; + dc[p] = acc/16; + } + + float covar0 = 0.0f; + float covar1 = 0.0f; + float covar2 = 0.0f; + float covar3 = 0.0f; + float covar4 = 0.0f; + float covar5 = 0.0f; + + for (uniform int k=0; k<16; k++) + { + float rgb0, rgb1, rgb2; + rgb0 = block[k+0*16]-dc[0]; + rgb1 = 
block[k+1*16]-dc[1]; + rgb2 = block[k+2*16]-dc[2]; + + covar0 += rgb0*rgb0; + covar1 += rgb0*rgb1; + covar2 += rgb0*rgb2; + + covar3 += rgb1*rgb1; + covar4 += rgb1*rgb2; + + covar5 += rgb2*rgb2; + } + + covar[0] = covar0; + covar[1] = covar1; + covar[2] = covar2; + covar[3] = covar3; + covar[4] = covar4; + covar[5] = covar5; +} + +inline void bc1_refine(int pe[2], float block[48], unsigned int32 bits, float dc[3]) +{ + float c0[3]; + float c1[3]; + + if ((bits ^ (bits*4)) < 4) + { + // single color + for (uniform int p=0; p<3; p++) + { + c0[p] = dc[p]; + c1[p] = dc[p]; + } + } + else + { + float Atb1[3] = {0,0,0}; + float sum_q = 0; + float sum_qq = 0; + unsigned int32 shifted_bits = bits; + + for (uniform int k=0; k<16; k++) + { + float q = (int)(shifted_bits&3); + shifted_bits >>= 2; + + float x = 3-q; + float y = q; + + sum_q += q; + sum_qq += q*q; + + for (uniform int p=0; p<3; p++) Atb1[p] += x*block[k+p*16]; + } + + float sum[3]; + float Atb2[3]; + + for (uniform int p=0; p<3; p++) + { + sum[p] = dc[p]*16; + Atb2[p] = 3*sum[p]-Atb1[p]; + } + + float Cxx = 16*sq(3)-2*3*sum_q+sum_qq; + float Cyy = sum_qq; + float Cxy = 3*sum_q-sum_qq; + float scale = 3.0f * RCP(Cxx*Cyy - Cxy*Cxy); + + for (uniform int p=0; p<3; p++) + { + c0[p] = (Atb1[p]*Cyy - Atb2[p]*Cxy)*scale; + c1[p] = (Atb2[p]*Cxx - Atb1[p]*Cxy)*scale; + + c0[p] = clamp(c0[p], 0, 255); + c1[p] = clamp(c1[p], 0, 255); + } + } + + pe[0] = enc_rgb565(c0); + pe[1] = enc_rgb565(c1); +} + +inline uint32 fix_qbits(uint32 qbits) +{ + uniform const uint32 mask_01b = 0x55555555; + uniform const uint32 mask_10b = 0xAAAAAAAA; + + uint32 qbits0 = qbits&mask_01b; + uint32 qbits1 = qbits&mask_10b; + qbits = (qbits1>>1) + (qbits1 ^ (qbits0<<1)); + + return qbits; +} + +inline void CompressBlockBC1_core(float block[48], uint32 data[2]) +{ + uniform const int powerIterations = 4; + uniform const int refineIterations = 1; + + float covar[6]; + float dc[3]; + compute_covar_dc_ugly(covar, dc, block); + + float eps = 0.001; + 
covar[0] += eps; + covar[3] += eps; + covar[5] += eps; + + float axis[3]; + compute_axis3(axis, covar, powerIterations); + + float c0[3]; + float c1[3]; + pick_endpoints(c0, c1, block, axis, dc); + + int p[2]; + p[0] = enc_rgb565(c0); + p[1] = enc_rgb565(c1); + if (p[0] 0) q++; + if (q==8) q = 1; + + qblock[k/8] |= q << ((k%8)*3); + } + + // (could be improved by refinement) + + data[0] = clamp((int)ep[0], 0, 255)*256+clamp((int)ep[1], 0, 255); + data[0] |= qblock[0]<<16; + data[1] = qblock[0]>>16; + data[1] |= qblock[1]<<8; +} + +inline void CompressBlockBC1(uniform rgba_surface src[], int xx, uniform int yy, uniform uint8 dst[]) +{ + float block[48]; + uint32 data[2]; + + load_block_interleaved(block, src, xx, yy); + + CompressBlockBC1_core(block, data); + + store_data(dst, src->width, xx, yy, data, 2); +} + +inline void CompressBlockBC3(uniform rgba_surface src[], int xx, uniform int yy, uniform uint8 dst[]) +{ + float block[64]; + uint32 data[4]; + + load_block_interleaved_rgba(block, src, xx, yy); + + CompressBlockBC3_alpha(&block[48], &data[0]); + CompressBlockBC1_core(block, &data[2]); + + store_data(dst, src->width, xx, yy, data, 4); +} + +inline void CompressBlockBC4(uniform rgba_surface src[], int xx, uniform int yy, uniform uint8 dst[]) +{ + float block[16]; + uint32 data[2]; + + load_block_r_8bit(block, src, xx, yy); + + CompressBlockBC3_alpha(block, data); + + store_data(dst, src->width, xx, yy, data, 2); +} + +inline void CompressBlockBC5(uniform rgba_surface src[], int xx, uniform int yy, uniform uint8 dst[]) +{ + float block[32]; + uint32 data[4]; + + load_block_interleaved_rg_8bit(block, src, xx, yy); + + CompressBlockBC3_alpha(block, data); + CompressBlockBC3_alpha(&block[16], &data[2]); + + store_data(dst, src->width, xx, yy, data, 4); +} + +export void CompressBlocksBC1_ispc(uniform rgba_surface src[], uniform uint8 dst[]) +{ + for (uniform int yy = 0; yyheight/4; yy++) + foreach (xx = 0 ... 
src->width/4) + { + CompressBlockBC1(src, xx, yy, dst); + } +} + +export void CompressBlocksBC3_ispc(uniform rgba_surface src[], uniform uint8 dst[]) +{ + for (uniform int yy = 0; yyheight/4; yy++) + foreach (xx = 0 ... src->width/4) + { + CompressBlockBC3(src, xx, yy, dst); + } +} + +export void CompressBlocksBC4_ispc(uniform rgba_surface src[], uniform uint8 dst[]) +{ + for (uniform int yy = 0; yyheight/4; yy++) + foreach (xx = 0 ... src->width/4) + { + CompressBlockBC4(src, xx, yy, dst); + } +} + +export void CompressBlocksBC5_ispc(uniform rgba_surface src[], uniform uint8 dst[]) +{ + for (uniform int yy = 0; yyheight/4; yy++) + foreach (xx = 0 ... src->width/4) + { + CompressBlockBC5(src, xx, yy, dst); + } +} + +/////////////////////////////////////////////////////////// +// BC7 encoding + +struct bc7_enc_settings +{ + bool mode_selection[4]; + int refineIterations[8]; + + bool skip_mode2; + int fastSkipTreshold_mode1; + int fastSkipTreshold_mode3; + int fastSkipTreshold_mode7; + + int mode45_channel0; + int refineIterations_channel; + + int channels; +}; + +struct bc7_enc_state +{ + float block[64]; + + float opaque_err; // error for coding alpha=255 + float best_err; + uint32 best_data[5]; // 4, +1 margin for skips + + // settings + uniform bool mode_selection[4]; + uniform int refineIterations[8]; + + uniform bool skip_mode2; + uniform int fastSkipTreshold_mode1; + uniform int fastSkipTreshold_mode3; + uniform int fastSkipTreshold_mode7; + + uniform int mode45_channel0; + uniform int refineIterations_channel; + + uniform int channels; +}; + +struct mode45_parameters +{ + int qep[8]; + uint32 qblock[2]; + int aqep[2]; + uint32 aqblock[2]; + int rotation; + int swap; +}; + +void bc7_code_mode01237(uint32 data[5], int qep[6], uint32 qblock[2], int part_id, uniform int mode); +void bc7_code_mode45(uint32 data[5], mode45_parameters params[], uniform int mode); +void bc7_code_mode6(uint32 data[5], int qep[8], uint32 qblock[2]); + +/////////////////////////// +// 
BC7 format data + +inline uniform const int* uniform get_unquant_table(uniform int bits) +{ + assert(bits>=2 && bits<=4); // invalid bit size + + static uniform const int unquant_table_2bits[] = { 0, 21, 43, 64 }; + static uniform const int unquant_table_3bits[] = { 0, 9, 18, 27, 37, 46, 55, 64 }; + static uniform const int unquant_table_4bits[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; + + uniform const int* uniform unquant_tables[] = {unquant_table_2bits, unquant_table_3bits, unquant_table_4bits}; + + return unquant_tables[bits-2]; +} + +inline uint32 get_pattern(int part_id) +{ + static uniform const uint32 pattern_table[] = { + 0x50505050u, 0x40404040u, 0x54545454u, 0x54505040u, 0x50404000u, 0x55545450u, 0x55545040u, 0x54504000u, + 0x50400000u, 0x55555450u, 0x55544000u, 0x54400000u, 0x55555440u, 0x55550000u, 0x55555500u, 0x55000000u, + 0x55150100u, 0x00004054u, 0x15010000u, 0x00405054u, 0x00004050u, 0x15050100u, 0x05010000u, 0x40505054u, + 0x00404050u, 0x05010100u, 0x14141414u, 0x05141450u, 0x01155440u, 0x00555500u, 0x15014054u, 0x05414150u, + 0x44444444u, 0x55005500u, 0x11441144u, 0x05055050u, 0x05500550u, 0x11114444u, 0x41144114u, 0x44111144u, + 0x15055054u, 0x01055040u, 0x05041050u, 0x05455150u, 0x14414114u, 0x50050550u, 0x41411414u, 0x00141400u, + 0x00041504u, 0x00105410u, 0x10541000u, 0x04150400u, 0x50410514u, 0x41051450u, 0x05415014u, 0x14054150u, + 0x41050514u, 0x41505014u, 0x40011554u, 0x54150140u, 0x50505500u, 0x00555050u, 0x15151010u, 0x54540404u, + 0xAA685050u, 0x6A5A5040u, 0x5A5A4200u, 0x5450A0A8u, 0xA5A50000u, 0xA0A05050u, 0x5555A0A0u, 0x5A5A5050u, + 0xAA550000u, 0xAA555500u, 0xAAAA5500u, 0x90909090u, 0x94949494u, 0xA4A4A4A4u, 0xA9A59450u, 0x2A0A4250u, + 0xA5945040u, 0x0A425054u, 0xA5A5A500u, 0x55A0A0A0u, 0xA8A85454u, 0x6A6A4040u, 0xA4A45000u, 0x1A1A0500u, + 0x0050A4A4u, 0xAAA59090u, 0x14696914u, 0x69691400u, 0xA08585A0u, 0xAA821414u, 0x50A4A450u, 0x6A5A0200u, + 0xA9A58000u, 0x5090A0A8u, 0xA8A09050u, 0x24242424u, 
0x00AA5500u, 0x24924924u, 0x24499224u, 0x50A50A50u, + 0x500AA550u, 0xAAAA4444u, 0x66660000u, 0xA5A0A5A0u, 0x50A050A0u, 0x69286928u, 0x44AAAA44u, 0x66666600u, + 0xAA444444u, 0x54A854A8u, 0x95809580u, 0x96969600u, 0xA85454A8u, 0x80959580u, 0xAA141414u, 0x96960000u, + 0xAAAA1414u, 0xA05050A0u, 0xA0A5A5A0u, 0x96000000u, 0x40804080u, 0xA9A8A9A8u, 0xAAAAAA44u, 0x2A4A5254u + }; + + return gather_uint(pattern_table, part_id); +} + +inline int get_pattern_mask(int part_id, int j) +{ + static uniform const uint32 pattern_mask_table[] = { + 0xCCCC3333u, 0x88887777u, 0xEEEE1111u, 0xECC81337u, 0xC880377Fu, 0xFEEC0113u, 0xFEC80137u, 0xEC80137Fu, + 0xC80037FFu, 0xFFEC0013u, 0xFE80017Fu, 0xE80017FFu, 0xFFE80017u, 0xFF0000FFu, 0xFFF0000Fu, 0xF0000FFFu, + 0xF71008EFu, 0x008EFF71u, 0x71008EFFu, 0x08CEF731u, 0x008CFF73u, 0x73108CEFu, 0x3100CEFFu, 0x8CCE7331u, + 0x088CF773u, 0x3110CEEFu, 0x66669999u, 0x366CC993u, 0x17E8E817u, 0x0FF0F00Fu, 0x718E8E71u, 0x399CC663u, + 0xAAAA5555u, 0xF0F00F0Fu, 0x5A5AA5A5u, 0x33CCCC33u, 0x3C3CC3C3u, 0x55AAAA55u, 0x96966969u, 0xA55A5AA5u, + 0x73CE8C31u, 0x13C8EC37u, 0x324CCDB3u, 0x3BDCC423u, 0x69969669u, 0xC33C3CC3u, 0x99666699u, 0x0660F99Fu, + 0x0272FD8Du, 0x04E4FB1Bu, 0x4E40B1BFu, 0x2720D8DFu, 0xC93636C9u, 0x936C6C93u, 0x39C6C639u, 0x639C9C63u, + 0x93366CC9u, 0x9CC66339u, 0x817E7E81u, 0xE71818E7u, 0xCCF0330Fu, 0x0FCCF033u, 0x774488BBu, 0xEE2211DDu, + 0x08CC0133u, 0x8CC80037u, 0xCC80006Fu, 0xEC001331u, 0x330000FFu, 0x00CC3333u, 0xFF000033u, 0xCCCC0033u, + 0x0F0000FFu, 0x0FF0000Fu, 0x00F0000Fu, 0x44443333u, 0x66661111u, 0x22221111u, 0x136C0013u, 0x008C8C63u, + 0x36C80137u, 0x08CEC631u, 0x3330000Fu, 0xF0000333u, 0x00EE1111u, 0x88880077u, 0x22C0113Fu, 0x443088CFu, + 0x0C22F311u, 0x03440033u, 0x69969009u, 0x9960009Fu, 0x03303443u, 0x00660699u, 0xC22C3113u, 0x8C0000EFu, + 0x1300007Fu, 0xC4003331u, 0x004C1333u, 0x22229999u, 0x00F0F00Fu, 0x24929249u, 0x29429429u, 0xC30C30C3u, + 0xC03C3C03u, 0x00AA0055u, 0xAA0000FFu, 0x30300303u, 0xC0C03333u, 0x90900909u, 
0xA00A5005u, 0xAAA0000Fu, + 0x0AAA0555u, 0xE0E01111u, 0x70700707u, 0x6660000Fu, 0x0EE01111u, 0x07707007u, 0x06660999u, 0x660000FFu, + 0x00660099u, 0x0CC03333u, 0x03303003u, 0x60000FFFu, 0x80807777u, 0x10100101u, 0x000A0005u, 0x08CE8421u + }; + + uint32 mask_packed = gather_uint(pattern_mask_table, part_id); + int mask0 = mask_packed&0xFFFF; + int mask1 = mask_packed>>16; + + int mask = (j==2) ? (~mask0)&(~mask1) : ( (j==0) ? mask0 : mask1 ); + return mask; +} + +inline void get_skips(int skips[3], int part_id) +{ + static uniform const int skip_table[] = { + 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, + 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0x20u, 0x20u, + 0xf0u, 0xf0u, 0x60u, 0x80u, 0x20u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0x60u, + 0x60u, 0x20u, 0x60u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u, + 0x3fu, 0x38u, 0xf8u, 0xf3u, 0x8fu, 0x3fu, 0xf3u, 0xf8u, 0x8fu, 0x8fu, 0x6fu, 0x6fu, 0x6fu, 0x5fu, 0x3fu, 0x38u, + 0x3fu, 0x38u, 0x8fu, 0xf3u, 0x3fu, 0x38u, 0x6fu, 0xa8u, 0x53u, 0x8fu, 0x86u, 0x6au, 0x8fu, 0x5fu, 0xfau, 0xf8u, + 0x8fu, 0xf3u, 0x3fu, 0x5au, 0x6au, 0xa8u, 0x89u, 0xfau, 0xf6u, 0x3fu, 0xf8u, 0x5fu, 0xf3u, 0xf6u, 0xf6u, 0xf8u, + 0x3fu, 0xf3u, 0x5fu, 0x5fu, 0x5fu, 0x8fu, 0x5fu, 0xafu, 0x5fu, 0xafu, 0x8fu, 0xdfu, 0xf3u, 0xcfu, 0x3fu, 0x38u + }; + + int skip_packed = gather_int(skip_table, part_id); + skips[0] = 0; + skips[1] = skip_packed>>4; + skips[2] = skip_packed&15; +} + +/////////////////////////// +// PCA helpers + +inline void compute_stats_masked(float stats[15], float block[64], int mask, uniform int channels) +{ + for (uniform int i=0; i<15; i++) stats[i] = 0; + + int mask_shifted = mask<<1; + for (uniform int k=0; k<16; k++) + { + mask_shifted >>= 1; + //if ((mask_shifted&1) == 0) continue; + int flag = (mask_shifted&1); + + float rgba[4]; + 
for (uniform int p=0; p>= 1; + if ((mask_shifted&1) == 0) continue; + + float dot = 0; + for (uniform int p=0; p= 4); + int vv = v<<(8-bits); + return vv + shift_right(vv, bits); +} + +void ep_quant0367(int qep[], float ep[], uniform int mode, uniform int channels) +{ + uniform int bits = 7; + if (mode == 0) bits = 4; + if (mode == 7) bits = 5; + + uniform int levels = 1 << bits; + uniform int levels2 = levels*2-1; + + for (uniform int i=0; i<2; i++) + { + int qep_b[8]; + + for (uniform int b=0; b<2; b++) + for (uniform int p=0; p<4; p++) + { + int v = (int)((ep[i*4+p]/255.0f*levels2-b)/2+0.5)*2+b; + qep_b[b*4+p] = clamp(v, b, levels2-1+b); + } + + float ep_b[8]; + for (uniform int j=0; j<8; j++) + ep_b[j] = qep_b[j]; + + if (mode==0) + for (uniform int j=0; j<8; j++) + ep_b[j] = unpack_to_byte(qep_b[j], 5); + + float err0 = 0.0f; + float err1 = 0.0f; + for (uniform int p=0; p>= 2; + + float proj = 0; + float div = 0; + for (uniform int p=0; p=0 && best_q<=levels-1); + + qblock[k/8] += ((uint32)best_q) << 4*(k%8); + total_err += best_err; + } + + return total_err; +} + +/////////////////////////// +// LS endpoint refinement + +void opt_endpoints(float ep[], float block[64], uniform int bits, uint32 qblock[2], int mask, uniform int channels) +{ + uniform int levels = 1 << bits; + + float Atb1[4] = {0,0,0,0}; + float sum_q = 0; + float sum_qq = 0; + float sum[5] = {0,0,0,0,0}; + + int mask_shifted = mask<<1; + for (uniform int k1=0; k1<2; k1++) + { + uint32 qbits_shifted = qblock[k1]; + for (uniform int k2=0; k2<8; k2++) + { + uniform int k = k1*8+k2; + float q = (int)(qbits_shifted&15); + qbits_shifted >>= 4; + + mask_shifted >>= 1; + if ((mask_shifted&1) == 0) continue; + + int x = (levels-1)-q; + int y = q; + + sum_q += q; + sum_qq += q*q; + + sum[4] += 1; + for (uniform int p=0; pblock, part_id, mode); + + if (errrefineIterations[mode]; + for (uniform int _=0; _block, bits, best_qblock, mask, channels); + } + + int qep[24]; + uint32 qblock[2]; + + 
ep_quant_dequant(qep, ep, mode, channels); + + uint32 pattern = get_pattern(best_part_id); + float err = block_quant(qblock, state->block, bits, ep, pattern, channels); + + if (erropaque_err; // take into account alpha channel + + if (best_errbest_err) + { + state->best_err = best_err; + bc7_code_mode01237(state->best_data, best_qep, best_qblock, best_part_id, mode); + } +} + +void partial_sort_list(int list[], uniform int length, uniform int partial_count) +{ + for (uniform int k=0; k list[i]) + { + best_value = list[i]; + best_idx = i; + } + } + + // swap + scatter_int(list, best_idx, list[k]); + list[k] = best_value; + } +} + +void bc7_enc_mode02(bc7_enc_state state[]) +{ + int part_list[64]; + for (uniform int part=0; part<64; part++) + part_list[part] = part; + + bc7_enc_mode01237(state, 0, part_list, 16); + if (!state->skip_mode2) bc7_enc_mode01237(state, 2, part_list, 64); // usually not worth the time +} + +void bc7_enc_mode13(bc7_enc_state state[]) +{ + if (state->fastSkipTreshold_mode1 == 0 && state->fastSkipTreshold_mode3 == 0) return; + + float full_stats[15]; + compute_stats_masked(full_stats, state->block, -1, 3); + + int part_list[64]; + for (uniform int part=0; part<64; part++) + { + int mask = get_pattern_mask(part+0, 0); + float bound12 = block_pca_bound_split(state->block, mask, full_stats, 3); + int bound = (int)(bound12); + part_list[part] = part+bound*64; + } + + partial_sort_list(part_list, 64, max(state->fastSkipTreshold_mode1, state->fastSkipTreshold_mode3)); + bc7_enc_mode01237(state, 1, part_list, state->fastSkipTreshold_mode1); + bc7_enc_mode01237(state, 3, part_list, state->fastSkipTreshold_mode3); +} + +void bc7_enc_mode7(bc7_enc_state state[]) +{ + if (state->fastSkipTreshold_mode7 == 0) return; + + float full_stats[15]; + compute_stats_masked(full_stats, state->block, -1, state->channels); + + int part_list[64]; + for (uniform int part=0; part<64; part++) + { + int mask = get_pattern_mask(part+0, 0); + float bound12 = 
block_pca_bound_split(state->block, mask, full_stats, state->channels); + int bound = (int)(bound12); + part_list[part] = part+bound*64; + } + + partial_sort_list(part_list, 64, state->fastSkipTreshold_mode7); + bc7_enc_mode01237(state, 7, part_list, state->fastSkipTreshold_mode7); +} + +void channel_quant_dequant(int qep[2], float ep[2], uniform int epbits) +{ + int elevels = (1<>= 4; + + int x = (levels-1)-q; + int y = q; + + sum_q += q; + sum_qq += q*q; + + sum += block[k]; + Atb1 += x*block[k]; + } + } + + float Atb2 = (levels-1)*sum-Atb1; + + float Cxx = 16*sq(levels-1)-2*(levels-1)*sum_q+sum_qq; + float Cyy = sum_qq; + float Cxy = (levels-1)*sum_q-sum_qq; + float scale = (levels-1) / (Cxx*Cyy - Cxy*Cxy); + + ep[0] = (Atb1*Cyy - Atb2*Cxy)*scale; + ep[1] = (Atb2*Cxx - Atb1*Cxy)*scale; + + ep[0] = clamp(ep[0], 0, 255); + ep[1] = clamp(ep[1], 0, 255); + + if (abs(Cxx*Cyy - Cxy*Cxy) < 0.001) + { + ep[0] = sum/16; + ep[1] = ep[0]; + } +} + +float channel_opt_quant(uint32 qblock[2], float block[16], uniform int bits, float ep[]) +{ + uniform const int* uniform unquant_table = get_unquant_table(bits); + int levels = (1<refineIterations_channel; + for (uniform int i=0; iblock[k+p*16]; + + if (rotation < 3) + { + // apply channel rotation + if (state->channels == 4) block[k+rotation*16] = state->block[k+3*16]; + if (state->channels == 3) block[k+rotation*16] = 255; + } + } + + float ep[8]; + block_segment(ep, block, -1, 3); + + int qep[8]; + ep_quant_dequant(qep, ep, mode, 3); + + uint32 qblock[2]; + float err = block_quant(qblock, block, bits, ep, 0, 3); + + // refine + uniform int refineIterations = state->refineIterations[mode]; + for (uniform int i=0; iblock[rotation*16], abits, aepbits); + + if (err<*best_err) + { + + swap_ints(best_candidate->qep, qep, 8); + swap_uints(best_candidate->qblock, qblock, 2); + swap_ints(best_candidate->aqep, aqep, 2); + swap_uints(best_candidate->aqblock, aqblock, 2); + best_candidate->rotation = rotation; + best_candidate->swap = 
swap; + *best_err = err; + } +} + +void bc7_enc_mode45(bc7_enc_state state[]) +{ + mode45_parameters best_candidate; + float best_err = state->best_err; + + memset(&best_candidate, 0, sizeof(mode45_parameters)); + + uniform int channel0 = state->mode45_channel0; + for (uniform int p=channel0; pchannels; p++) + { + bc7_enc_mode45_candidate(state, &best_candidate, &best_err, 4, p, 0); + bc7_enc_mode45_candidate(state, &best_candidate, &best_err, 4, p, 1); + } + + // mode 4 + if (best_errbest_err) + { + state->best_err = best_err; + bc7_code_mode45(state->best_data, &best_candidate, 4); + } + + for (uniform int p=channel0; pchannels; p++) + { + bc7_enc_mode45_candidate(state, &best_candidate, &best_err, 5, p, 0); + } + + // mode 5 + if (best_errbest_err) + { + state->best_err = best_err; + bc7_code_mode45(state->best_data, &best_candidate, 5); + } +} + +void bc7_enc_mode6(bc7_enc_state state[]) +{ + uniform int mode = 6; + uniform int bits = 4; + float ep[8]; + block_segment(ep, state->block, -1, state->channels); + + if (state->channels == 3) + { + ep[3] = ep[7] = 255; + } + + int qep[8]; + ep_quant_dequant(qep, ep, mode, state->channels); + + uint32 qblock[2]; + float err = block_quant(qblock, state->block, bits, ep, 0, state->channels); + + // refine + uniform int refineIterations = state->refineIterations[mode]; + for (uniform int i=0; iblock, bits, qblock, -1, state->channels); + ep_quant_dequant(qep, ep, mode, state->channels); + err = block_quant(qblock, state->block, bits, ep, 0, state->channels); + } + + if (errbest_err) + { + state->best_err = err; + bc7_code_mode6(state->best_data, qep, qblock); + } +} + +////////////////////////// +// BC7 bitstream coding + +void bc7_code_apply_swap_mode456(int qep[], uniform int channels, uint32 qblock[2], uniform int bits) +{ + uniform int levels = 1 << bits; + if ((qblock[0]&15)>=levels/2) + { + swap_ints(&qep[0], &qep[channels], channels); + + for (uniform int k=0; k<2; k++) + qblock[k] = 
(uint32)(0x11111111*(levels-1)) - qblock[k]; + } + + assert((qblock[0]&15) < levels/2); +} + +int bc7_code_apply_swap_mode01237(int qep[], uint32 qblock[2], uniform int mode, int part_id) +{ + uniform int bits = 2; if (mode == 0 || mode == 1) bits = 3; + uniform int pairs = 2; if (mode == 0 || mode == 2) pairs = 3; + + int flips = 0; + uniform int levels = 1 << bits; + int skips[3]; + get_skips(skips, part_id); + + for (uniform int j=0; j>((k0%8)*4))&15; + int q = ((gather_uint(qblock, k0>>3)<<(28-(k0&7)*4))>>28); + + if (q>=levels/2) + { + swap_ints(&qep[8*j], &qep[8*j+4], 4); + uint32 pmask = get_pattern_mask(part_id, j); + flips |= pmask; + } + } + + return flips; +} + +void put_bits(uint32 data[5], uniform int* uniform pos, uniform int bits, int v) +{ + assert(v32) + { + data[*pos/32+1] |= shift_right(v, 32-*pos%32); + } + *pos += bits; +} + +inline void data_shl_1bit_from(uint32 data[5], int from) +{ + if (from < 96) + { + assert(from > 64+10); + + uint32 shifted = (data[2]>>1) | (data[3]<<31); + uint32 mask = (pow2(from-64)-1)>>1; + data[2] = (mask&data[2]) | (~mask&shifted); + data[3] = (data[3]>>1) | (data[4]<<31); + data[4] = data[4]>>1; + } + else if (from < 128) + { + uint32 shifted = (data[3]>>1) | (data[4]<<31); + uint32 mask = (pow2(from-96)-1)>>1; + data[3] = (mask&data[3]) | (~mask&shifted); + data[4] = data[4]>>1; + } +} + +void bc7_code_qblock(uint32 data[5], uniform int* uniform pPos, uint32 qblock[2], uniform int bits, int flips) +{ + uniform int levels = 1 << bits; + int flips_shifted = flips; + for (uniform int k1=0; k1<2; k1++) + { + uint32 qbits_shifted = qblock[k1]; + for (uniform int k2=0; k2<8; k2++) + { + int q = qbits_shifted&15; + if ((flips_shifted&1)>0) q = (levels-1)-q; + + if (k1==0 && k2==0) put_bits(data, pPos, bits-1, q); + else put_bits(data, pPos, bits , q); + qbits_shifted >>= 4; + flips_shifted >>= 1; + } + } +} + +void bc7_code_adjust_skip_mode01237(uint32 data[5], uniform int mode, int part_id) +{ + uniform int bits = 2; 
if (mode == 0 || mode == 1) bits = 3; + uniform int pairs = 2; if (mode == 0 || mode == 2) pairs = 3; + + int skips[3]; + get_skips(skips, part_id); + + if (pairs>2 && skips[1] < skips[2]) + { + int t = skips[1]; skips[1] = skips[2]; skips[2] = t; + } + + for (uniform int j=1; j>1); + } + else if (mode == 1) + { + put_bits(data, &pos, 6, qep[j*4+0+p]>>1); + } + else if (mode == 2) + { + put_bits(data, &pos, 5, qep[j*4+0+p]); + } + else if (mode == 3) + { + put_bits(data, &pos, 7, qep[j*4+0+p]>>1); + } + else if (mode == 7) + { + put_bits(data, &pos, 5, qep[j*4+0+p]>>1); + } + else + { + assert(false); + } + } + + // p bits + if (mode == 1) + for (uniform int j=0; j<2; j++) + { + put_bits(data, &pos, 1, qep[j*8]&1); + } + + if (mode == 0 || mode == 3 || mode == 7) + for (uniform int j=0; jqep, qep, 8); + swap_uints(params->qblock, qblock, 2); + swap_ints(params->aqep, aqep, 2); + swap_uints(params->aqblock, aqblock, 2); + int rotation = params->rotation; + int swap = params->swap; + + uniform int bits = 2; + uniform int abits = 2; if (mode==4) abits = 3; + uniform int epbits = 7; if (mode==4) epbits = 5; + uniform int aepbits = 8; if (mode==4) aepbits = 6; + + if (!swap) + { + bc7_code_apply_swap_mode456(qep, 4, qblock, bits); + bc7_code_apply_swap_mode456(aqep, 1, aqblock, abits); + } + else + { + swap_uints(qblock, aqblock, 2); + bc7_code_apply_swap_mode456(aqep, 1, qblock, bits); + bc7_code_apply_swap_mode456(qep, 4, aqblock, abits); + } + + for (uniform int k=0; k<5; k++) data[k] = 0; + uniform int pos = 0; + + // mode 4-5 + put_bits(data, &pos, mode+1, 1<>1); + put_bits(data, &pos, 7, qep[4+p]>>1); + } + + // p bits + put_bits(data, &pos, 1, qep[0]&1); + put_bits(data, &pos, 1, qep[4]&1); + + // quantized values + bc7_code_qblock(data, &pos, qblock, 4, 0); +} + + +////////////////////////// +// BC7 core + +inline void CompressBlockBC7_core(bc7_enc_state state[]) +{ + if (state->mode_selection[0]) bc7_enc_mode02(state); + if (state->mode_selection[1]) 
bc7_enc_mode13(state); + if (state->mode_selection[1]) bc7_enc_mode7(state); + if (state->mode_selection[2]) bc7_enc_mode45(state); + if (state->mode_selection[3]) bc7_enc_mode6(state); +} + +void bc7_enc_copy_settings(bc7_enc_state state[], uniform bc7_enc_settings settings[]) +{ + state->channels = settings->channels; + + // mode02 + state->mode_selection[0] = settings->mode_selection[0]; + state->skip_mode2 = settings->skip_mode2; + + state->refineIterations[0] = settings->refineIterations[0]; + state->refineIterations[2] = settings->refineIterations[2]; + + // mode137 + state->mode_selection[1] = settings->mode_selection[1]; + state->fastSkipTreshold_mode1 = settings->fastSkipTreshold_mode1; + state->fastSkipTreshold_mode3 = settings->fastSkipTreshold_mode3; + state->fastSkipTreshold_mode7 = settings->fastSkipTreshold_mode7; + + state->refineIterations[1] = settings->refineIterations[1]; + state->refineIterations[3] = settings->refineIterations[3]; + state->refineIterations[7] = settings->refineIterations[7]; + + // mode45 + state->mode_selection[2] = settings->mode_selection[2]; + + state->mode45_channel0 = settings->mode45_channel0; + state->refineIterations_channel = settings->refineIterations_channel; + state->refineIterations[4] = settings->refineIterations[4]; + state->refineIterations[5] = settings->refineIterations[5]; + + // mode6 + state->mode_selection[3] = settings->mode_selection[3]; + + state->refineIterations[6] = settings->refineIterations[6]; +} + +inline void CompressBlockBC7(uniform rgba_surface src[], int xx, uniform int yy, uniform uint8 dst[], + uniform bc7_enc_settings settings[]) +{ + bc7_enc_state _state; + varying bc7_enc_state* uniform state = &_state; + + bc7_enc_copy_settings(state, settings); + load_block_interleaved_rgba(state->block, src, xx, yy); + state->best_err = 1e99; + state->opaque_err = compute_opaque_err(state->block, state->channels); + + CompressBlockBC7_core(state); + + store_data(dst, src->width, xx, yy, 
state->best_data, 4); +} + +export void CompressBlocksBC7_ispc(uniform rgba_surface src[], uniform uint8 dst[], uniform bc7_enc_settings settings[]) +{ + for (uniform int yy = 0; yyheight/4; yy++) + foreach (xx = 0 ... src->width/4) + { + CompressBlockBC7(src, xx, yy, dst, settings); + } +} + +/////////////////////////////////////////////////////////// +// BC6H encoding + +struct bc6h_enc_settings +{ + bool slow_mode; + bool fast_mode; + int refineIterations_1p; + int refineIterations_2p; + int fastSkipTreshold; +}; + +struct bc6h_enc_state +{ + float block[64]; + + float best_err; + uint32 best_data[5]; // 4, +1 margin for skips + + float rgb_bounds[6]; + float max_span; + int max_span_idx; + + int mode; + int epb; + int qbounds[8]; + + // settings + uniform bool slow_mode; + uniform bool fast_mode; + uniform int refineIterations_1p; + uniform int refineIterations_2p; + uniform int fastSkipTreshold; +}; + +void bc6h_code_2p(uint32 data[5], int pqep[], uint32 qblock[2], int part_id, int mode); +void bc6h_code_1p(uint32 data[5], int qep[8], uint32 qblock[2], int mode); + +/////////////////////////// +// BC6H format data + +inline uniform int get_mode_prefix(uniform int mode) +{ + static uniform const int mode_prefix_table[] = + { + 0, 1, 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15 + }; + + return mode_prefix_table[mode]; +} + +inline uniform float get_span(uniform int mode) +{ + static uniform const float span_table[] = + { + 0.9 * 0xFFFF / 64, // (0) 4 / 10 + 0.9 * 0xFFFF / 4, // (1) 5 / 7 + 0.8 * 0xFFFF / 256, // (2) 3 / 11 + -1, -1, + 0.9 * 0xFFFF / 32, // (5) 4 / 9 + 0.9 * 0xFFFF / 16, // (6) 4 / 8 + -1, -1, + 0xFFFF, // (9) absolute + 0xFFFF, // (10) absolute + 0.95 * 0xFFFF / 8, // (11) 8 / 11 + 0.95 * 0xFFFF / 32, // (12) 7 / 12 + 6, // (13) 3 / 16 + }; + + uniform int span = span_table[mode]; + assert(span > 0); + return span; +} + +inline uniform int get_mode_bits(uniform int mode) +{ + static uniform const int mode_bits_table[] = + { + 10, 7, 11, -1, -1, + 
9, 8, -1, -1, 6, + 10, 11, 12, 16, + }; + + uniform int mode_bits = mode_bits_table[mode]; + assert(mode_bits > 0); + return mode_bits; +} + +/////////////////////////// +// endpoint quantization + +inline int unpack_to_uf16(uint32 v, int bits) +{ + if (bits >= 15) return v; + if (v == 0) return 0; + if (v == (1<epb; + ep_quant_bc6h(qep, ep, bits, pairs); + + for (uniform int i = 0; i < 2 * pairs; i++) + for (uniform int p = 0; p < 3; p++) + { + qep[i * 4 + p] = clamp(qep[i * 4 + p], state->qbounds[p], state->qbounds[4 + p]); + } + + ep_dequant_bc6h(ep, qep, bits, pairs); + +} + +////////////////////////// +// parameter estimation + +float bc6h_enc_2p_part_fast(bc6h_enc_state state[], int qep[16], uint32 qblock[2], int part_id) +{ + uint32 pattern = get_pattern(part_id); + uniform int bits = 3; + uniform int pairs = 2; + uniform int channels = 3; + + float ep[16]; + for (uniform int j = 0; jblock, mask, channels); + } + + ep_quant_dequant_bc6h(state, qep, ep, 2); + + float total_err = block_quant(qblock, state->block, bits, ep, pattern, channels); + return total_err; + +} + +void bc6h_enc_2p_list(bc6h_enc_state state[], int part_list[], uniform int part_count) +{ + if (part_count == 0) return; + uniform int bits = 3; + uniform int pairs = 2; + uniform int channels = 3; + + int best_qep[24]; + uint32 best_qblock[2]; + int best_part_id = -1; + float best_err = 1e99; + + for (uniform int part = 0; partrefineIterations_2p; + for (uniform int _ = 0; _block, bits, best_qblock, mask, channels); + } + + int qep[24]; + uint32 qblock[2]; + ep_quant_dequant_bc6h(state, qep, ep, 2); + + uint32 pattern = get_pattern(best_part_id); + float err = block_quant(qblock, state->block, bits, ep, pattern, channels); + + if (errbest_err) + { + state->best_err = best_err; + bc6h_code_2p(state->best_data, best_qep, best_qblock, best_part_id, state->mode); + } +} + +void bc6h_enc_2p(bc6h_enc_state state[]) +{ + float full_stats[15]; + compute_stats_masked(full_stats, state->block, -1, 3); + 
+ int part_list[32]; + for (uniform int part = 0; part < 32; part++) + { + int mask = get_pattern_mask(part, 0); + float bound12 = block_pca_bound_split(state->block, mask, full_stats, 3); + int bound = (int)(bound12); + part_list[part] = part + bound * 64; + } + + partial_sort_list(part_list, 32, state->fastSkipTreshold); + bc6h_enc_2p_list(state, part_list, state->fastSkipTreshold); +} + +void bc6h_enc_1p(bc6h_enc_state state[]) +{ + float ep[8]; + block_segment_core(ep, state->block, -1, 3); + + int qep[8]; + ep_quant_dequant_bc6h(state, qep, ep, 1); + + uint32 qblock[2]; + float err = block_quant(qblock, state->block, 4, ep, 0, 3); + + // refine + uniform int refineIterations = state->refineIterations_1p; + for (uniform int i = 0; iblock, 4, qblock, -1, 3); + ep_quant_dequant_bc6h(state, qep, ep, 1); + err = block_quant(qblock, state->block, 4, ep, 0, 3); + } + + if (err < state->best_err) + { + state->best_err = err; + bc6h_code_1p(state->best_data, qep, qblock, state->mode); + } +} + +inline void compute_qbounds(bc6h_enc_state state[], float rgb_span[3]) +{ + float bounds[8]; + for (uniform int p = 0; p < 3; p++) + { + float middle = (state->rgb_bounds[p] + state->rgb_bounds[3 + p]) / 2; + + bounds[ p] = middle - rgb_span[p] / 2; + bounds[4+p] = middle + rgb_span[p] / 2; + } + + ep_quant_bc6h(state->qbounds, bounds, state->epb, 1); +} + +void compute_qbounds(bc6h_enc_state state[], float span) +{ + float rgb_span[3] = { span, span, span }; + compute_qbounds(state, rgb_span); +} + +void compute_qbounds2(bc6h_enc_state state[], float span, int max_span_idx) +{ + float rgb_span[3] = { span, span, span }; + for (uniform int p = 0; p < 3; p++) + { + rgb_span[p] *= (p == max_span_idx) ? 
2 : 1; + } + compute_qbounds(state, rgb_span); +} + +void bc6h_test_mode(bc6h_enc_state state[], uniform int mode, uniform bool enc, uniform float margin) +{ + uniform int mode_bits = get_mode_bits(mode); + uniform float span = get_span(mode); + float max_span = state->max_span; + int max_span_idx = state->max_span_idx; + + if (max_span * margin > span) return; + + if (mode >= 10) + { + state->epb = mode_bits; + state->mode = mode; + + compute_qbounds(state, span); + if (enc) bc6h_enc_1p(state); + } + else if (mode <= 1 || mode == 5 || mode == 9) + { + state->epb = mode_bits; + state->mode = mode; + + compute_qbounds(state, span); + if (enc) bc6h_enc_2p(state); + } + else + { + state->epb = mode_bits; + state->mode = mode + max_span_idx; + + compute_qbounds2(state, span, max_span_idx); + if (enc) bc6h_enc_2p(state); + } +} + +////////////////////////// +// BC6H bitstream coding + +int bit_at(int v, uniform int pos) +{ + return (v >> pos) & 1; +} + +uint32 reverse_bits(uint32 v, uniform int bits) +{ + if (bits == 2) + { + return (v >> 1) + (v & 1) * 2; + } + if (bits == 6) + { + v = (v & 0x5555) * 2 + ((v >> 1) & 0x5555); + return (v >> 4) + ((v >> 2) & 3) * 4 + (v & 3) * 16; + } + else + { + assert(false); + } +} + +void bc6h_pack(uint32 packed[], int qep[], int mode) +{ + if (mode == 0) + { + int pred_qep[16]; + for (uniform int p = 0; p < 3; p++) + { + pred_qep[ p] = qep[p]; + pred_qep[ 4 + p] = (qep[ 4 + p] - qep[p]) & 31; + pred_qep[ 8 + p] = (qep[ 8 + p] - qep[p]) & 31; + pred_qep[12 + p] = (qep[12 + p] - qep[p]) & 31; + } + + for (uniform int i = 1; i < 4; i++) + for (uniform int p = 0; p < 3; p++) + { + assert( qep[i * 4 + p] - qep[p] <= 15); + assert(-16 <= qep[i * 4 + p] - qep[p]); + } + + /* + g2[4], b2[4], b3[4], + r0[9:0], + g0[9:0], + b0[9:0], + r1[4:0], g3[4], g2[3:0], + g1[4:0], b3[0], g3[3:0], + b1[4:0], b3[1], b2[3:0], + r2[4:0], b3[2], + r3[4:0], b3[3] + */ + + uint32 pqep[10]; + + pqep[4] = pred_qep[4] + (pred_qep[ 8 + 1] & 15) * 64; + pqep[5] = 
pred_qep[5] + (pred_qep[12 + 1] & 15) * 64; + pqep[6] = pred_qep[6] + (pred_qep[ 8 + 2] & 15) * 64; + + pqep[4] += bit_at(pred_qep[12 + 1], 4) << 5; + pqep[5] += bit_at(pred_qep[12 + 2], 0) << 5; + pqep[6] += bit_at(pred_qep[12 + 2], 1) << 5; + + pqep[8] = pred_qep[ 8] + bit_at(pred_qep[12 + 2], 2) * 32; + pqep[9] = pred_qep[12] + bit_at(pred_qep[12 + 2], 3) * 32; + + packed[0] = get_mode_prefix(0); + packed[0] += bit_at(pred_qep[ 8 + 1], 4) << 2; + packed[0] += bit_at(pred_qep[ 8 + 2], 4) << 3; + packed[0] += bit_at(pred_qep[12 + 2], 4) << 4; + + packed[1] = (pred_qep[2] << 20) + (pred_qep[1] << 10) + pred_qep[0]; + packed[2] = (pqep[6] << 20) + (pqep[5] << 10) + pqep[4]; + packed[3] = (pqep[9] << 6) + pqep[8]; + } + else if (mode == 1) + { + int pred_qep[16]; + for (uniform int p = 0; p < 3; p++) + { + pred_qep[ p] = qep[p]; + pred_qep[ 4 + p] = (qep[ 4 + p] - qep[p]) & 63; + pred_qep[ 8 + p] = (qep[ 8 + p] - qep[p]) & 63; + pred_qep[12 + p] = (qep[12 + p] - qep[p]) & 63; + } + + for (uniform int i = 1; i < 4; i++) + for (uniform int p = 0; p < 3; p++) + { + assert( qep[i * 4 + p] - qep[p] <= 31); + assert(-32 <= qep[i * 4 + p] - qep[p]); + } + + /* + g2[5], g3[4], g3[5], + r0[6:0], b3[0], b3[1], b2[4], + g0[6:0], b2[5], b3[2], g2[4], + b0[6:0], b3[3], b3[5], b3[4], + r1[5:0], g2[3:0], + g1[5:0], g3[3:0], + b1[5:0], b2[3:0], + r2[5:0], + r3[5:0] + */ + + uint32 pqep[8]; + + pqep[0] = pred_qep[0]; + pqep[0] += bit_at(pred_qep[12 + 2], 0) << 7; + pqep[0] += bit_at(pred_qep[12 + 2], 1) << 8; + pqep[0] += bit_at(pred_qep[ 8 + 2], 4) << 9; + + pqep[1] = pred_qep[1]; + pqep[1] += bit_at(pred_qep[ 8 + 2], 5) << 7; + pqep[1] += bit_at(pred_qep[12 + 2], 2) << 8; + pqep[1] += bit_at(pred_qep[ 8 + 1], 4) << 9; + + pqep[2] = pred_qep[2]; + pqep[2] += bit_at(pred_qep[12 + 2], 3) << 7; + pqep[2] += bit_at(pred_qep[12 + 2], 5) << 8; + pqep[2] += bit_at(pred_qep[12 + 2], 4) << 9; + + pqep[4] = pred_qep[4] + (pred_qep[ 8 + 1] & 15) * 64; + pqep[5] = pred_qep[5] + (pred_qep[12 + 
1] & 15) * 64; + pqep[6] = pred_qep[6] + (pred_qep[ 8 + 2] & 15) * 64; + + packed[0] = get_mode_prefix(1); + packed[0] += bit_at(pred_qep[ 8 + 1], 5) << 2; + packed[0] += bit_at(pred_qep[12 + 1], 4) << 3; + packed[0] += bit_at(pred_qep[12 + 1], 5) << 4; + + packed[1] = (pqep[2] << 20) + (pqep[1] << 10) + pqep[0]; + packed[2] = (pqep[6] << 20) + (pqep[5] << 10) + pqep[4]; + packed[3] = (pred_qep[12] << 6) + pred_qep[8]; + } + else if (mode == 2 || mode == 3 || mode == 4) + { + /* + r0[9:0], g0[9:0], b0[9:0], + r1[3:0], xx[y], xx[y], g2[3:0], + g1[3:0], xx[y], xx[y], g3[3:0], + b1[3:0], xx[y], xx[y], b2[3:0], + r2[3:0], xx[y], xx[y], + r3[3:0], xx[y], xx[y] + */ + + int dqep[16]; + for (uniform int p = 0; p < 3; p++) + { + int mask = 15; + if (p == mode - 2) mask = 31; + dqep[p] = qep[p]; + dqep[ 4 + p] = (qep[ 4 + p] - qep[p]) & mask; + dqep[ 8 + p] = (qep[ 8 + p] - qep[p]) & mask; + dqep[12 + p] = (qep[12 + p] - qep[p]) & mask; + } + + for (uniform int i = 1; i < 4; i++) + for (uniform int p = 0; p < 3; p++) + { + int bits = 4; + if (p == mode - 2) bits = 5; + //assert( qep[i * 4 + p] - qep[p] <= (1<> 10) * 512; + pqep[5] = dqep[5] + (dqep[1] >> 10) * 512; + pqep[6] = dqep[6] + (dqep[2] >> 10) * 512; + + packed[0] = get_mode_prefix(11); + packed[1] = (pqep[2] << 20) + (pqep[1] << 10) + pqep[0]; + packed[2] = (pqep[6] << 20) + (pqep[5] << 10) + pqep[4]; + } + else if (mode == 12) + { + int dqep[8]; + for (uniform int p = 0; p < 3; p++) + { + dqep[p] = qep[p]; + dqep[4 + p] = (qep[4 + p] - qep[p]) & 255; + } + + for (uniform int i = 1; i < 2; i++) + for (uniform int p = 0; p < 3; p++) + { + assert( qep[i * 4 + p] - qep[p] <= 127); + assert(-128 <= qep[i * 4 + p] - qep[p]); + } + + /* + r0[9:0], g0[9:0], b0[9:0], + r1[7:0], r0[10:11], + g1[7:0], g0[10:11], + b1[7:0], b0[10:11] + */ + + uint32 pqep[8]; + + pqep[0] = dqep[0] & 1023; + pqep[1] = dqep[1] & 1023; + pqep[2] = dqep[2] & 1023; + + pqep[4] = dqep[4] + reverse_bits(dqep[0] >> 10, 2) * 256; + pqep[5] = dqep[5] + 
reverse_bits(dqep[1] >> 10, 2) * 256; + pqep[6] = dqep[6] + reverse_bits(dqep[2] >> 10, 2) * 256; + + packed[0] = get_mode_prefix(12); + packed[1] = (pqep[2] << 20) + (pqep[1] << 10) + pqep[0]; + packed[2] = (pqep[6] << 20) + (pqep[5] << 10) + pqep[4]; + } + else if (mode == 13) + { + int dqep[8]; + for (uniform int p = 0; p < 3; p++) + { + dqep[p] = qep[p]; + dqep[4 + p] = (qep[4 + p] - qep[p]) & 15; + } + + for (uniform int i = 1; i < 2; i++) + for (uniform int p = 0; p < 3; p++) + { + assert( qep[i * 4 + p] - qep[p] <= 7); + assert(-8 <= qep[i * 4 + p] - qep[p]); + } + + /* + r0[9:0], g0[9:0], b0[9:0], + r1[3:0], r0[10:15], + g1[3:0], g0[10:15], + b1[3:0], b0[10:15] + */ + + uint32 pqep[8]; + + pqep[0] = dqep[0] & 1023; + pqep[1] = dqep[1] & 1023; + pqep[2] = dqep[2] & 1023; + + pqep[4] = dqep[4] + reverse_bits(dqep[0] >> 10, 6) * 16; + pqep[5] = dqep[5] + reverse_bits(dqep[1] >> 10, 6) * 16; + pqep[6] = dqep[6] + reverse_bits(dqep[2] >> 10, 6) * 16; + + packed[0] = get_mode_prefix(13); + packed[1] = (pqep[2] << 20) + (pqep[1] << 10) + pqep[0]; + packed[2] = (pqep[6] << 20) + (pqep[5] << 10) + pqep[4]; + } + else + { + assert(false); + } +} + +void bc6h_code_2p(uint32 data[5], int qep[], uint32 qblock[2], int part_id, int mode) +{ + uniform int bits = 3; + uniform int pairs = 2; + uniform int channels = 3; + + int flips = bc7_code_apply_swap_mode01237(qep, qblock, 1, part_id); + + for (uniform int k=0; k<5; k++) data[k] = 0; + uniform int pos = 0; + + uint32 packed[4]; + bc6h_pack(packed, qep, mode); + + // mode + put_bits(data, &pos, 5, packed[0]); + + // endpoints + put_bits(data, &pos, 30, packed[1]); + put_bits(data, &pos, 30, packed[2]); + put_bits(data, &pos, 12, packed[3]); + + // partition + put_bits(data, &pos, 5, part_id); + + // quantized values + bc7_code_qblock(data, &pos, qblock, bits, flips); + bc7_code_adjust_skip_mode01237(data, 1, part_id); +} + +void bc6h_code_1p(uint32 data[5], int qep[8], uint32 qblock[2], int mode) +{ + 
bc7_code_apply_swap_mode456(qep, 4, qblock, 4); + + for (uniform int k = 0; k<5; k++) data[k] = 0; + uniform int pos = 0; + + uint32 packed[4]; + bc6h_pack(packed, qep, mode); + + // mode + put_bits(data, &pos, 5, packed[0]); + + // endpoints + put_bits(data, &pos, 30, packed[1]); + put_bits(data, &pos, 30, packed[2]); + + // quantized values + bc7_code_qblock(data, &pos, qblock, 4, 0); +} + +////////////////////////// +// BC6H core + +void bc6h_setup(bc6h_enc_state state[]) +{ + for (uniform int p = 0; p < 3; p++) + { + state->rgb_bounds[p ] = 0xFFFF; + state->rgb_bounds[3+p] = 0; + } + + // uf16 conversion, min/max + for (uniform int p = 0; p < 3; p++) + for (uniform int k = 0; k < 16; k++) + { + state->block[p * 16 + k] = (state->block[p * 16 + k] / 31) * 64; + + state->rgb_bounds[p ] = min(state->rgb_bounds[p ], state->block[p * 16 + k]); + state->rgb_bounds[3+p] = max(state->rgb_bounds[3+p], state->block[p * 16 + k]); + } + + state->max_span = 0; + state->max_span_idx = 0; + + float rgb_span[0] = { 0, 0, 0 }; + for (uniform int p = 0; p < 3; p++) + { + rgb_span[p] = state->rgb_bounds[3+p] - state->rgb_bounds[p]; + if (rgb_span[p] > state->max_span) + { + state->max_span_idx = p; + state->max_span = rgb_span[p]; + } + } +} + +inline void CompressBlockBC6H_core(bc6h_enc_state state[]) +{ + bc6h_setup(state); + + if (state->slow_mode) + { + bc6h_test_mode(state, 0, true, 0); + bc6h_test_mode(state, 1, true, 0); + bc6h_test_mode(state, 2, true, 0); + bc6h_test_mode(state, 5, true, 0); + bc6h_test_mode(state, 6, true, 0); + bc6h_test_mode(state, 9, true, 0); + bc6h_test_mode(state, 10, true, 0); + bc6h_test_mode(state, 11, true, 0); + bc6h_test_mode(state, 12, true, 0); + bc6h_test_mode(state, 13, true, 0); + } + else + { + if (state->fastSkipTreshold > 0) + { + bc6h_test_mode(state, 9, false, 0); + if (state->fast_mode) bc6h_test_mode(state, 1, false, 1); + bc6h_test_mode(state, 6, false, 1 / 1.2); + bc6h_test_mode(state, 5, false, 1 / 1.2); + 
bc6h_test_mode(state, 0, false, 1 / 1.2); + bc6h_test_mode(state, 2, false, 1); + + bc6h_enc_2p(state); + if (!state->fast_mode) bc6h_test_mode(state, 1, true, 0); + } + + bc6h_test_mode(state, 10, false, 0); + bc6h_test_mode(state, 11, false, 1); + bc6h_test_mode(state, 12, false, 1); + bc6h_test_mode(state, 13, false, 1); + bc6h_enc_1p(state); + } +} + +void bc6h_enc_copy_settings(bc6h_enc_state state[], uniform bc6h_enc_settings settings[]) +{ + state->slow_mode = settings->slow_mode; + state->fast_mode = settings->fast_mode; + state->fastSkipTreshold = settings->fastSkipTreshold; + state->refineIterations_1p = settings->refineIterations_1p; + state->refineIterations_2p = settings->refineIterations_2p; +} + +inline void CompressBlockBC6H(uniform rgba_surface src[], int xx, uniform int yy, uniform uint8 dst[], uniform bc6h_enc_settings settings[]) +{ + bc6h_enc_state _state; + varying bc6h_enc_state* uniform state = &_state; + + bc6h_enc_copy_settings(state, settings); + load_block_interleaved_16bit(state->block, src, xx, yy); + state->best_err = 1e99; + + CompressBlockBC6H_core(state); + + store_data(dst, src->width, xx, yy, state->best_data, 4); +} + +export void CompressBlocksBC6H_ispc(uniform rgba_surface src[], uniform uint8 dst[], uniform bc6h_enc_settings settings[]) +{ + for (uniform int yy = 0; yyheight / 4; yy++) + foreach(xx = 0 ... 
src->width / 4) + { + CompressBlockBC6H(src, xx, yy, dst, settings); + } +} + +/////////////////////////////////////////////////////////// +// ETC encoding + +struct etc_enc_settings +{ + int fastSkipTreshold; +}; + +struct etc_enc_state +{ + float block[64]; + int prev_qcenter[3]; + + float best_err; + uint32 best_data[2]; + + uniform bool diff; + + // settings + uniform int fastSkipTreshold; +}; + +inline uniform int get_etc1_dY(uniform int table, uniform int q) +{ + static uniform const int etc_codeword_table[8][4] = + { + { -8, -2, 2, 8 }, + { -17, -5, 5, 17 }, + { -29, -9, 9, 29 }, + { -42, -13, 13, 42 }, + { -60, -18, 18, 60 }, + { -80, -24, 24, 80 }, + { -106, -33, 33, 106 }, + { -183, -47, 47, 183 }, + }; + + return etc_codeword_table[table][q]; +} + +uniform int remap_q[] = { 2, 3, 1, 0 }; + +int get_remap2_q(int x) +{ + x -= 2; + if (x < 0) x = 1 - x; + return x; +} + +int extend_4to8bits(int value) +{ + return (value << 4) | value; +} + +int extend_5to8bits(int value) +{ + return (value << 3) | (value >> 2); +} + +int quantize_4bits(float value) +{ + return clamp((value / 255.0f) * 15 + 0.5, 0, 15); +} + +int quantize_5bits(float value) +{ + return clamp((value / 255.0f) * 31 + 0.5, 0, 31); +} + +void center_quant_dequant(int qcenter[3], float center[3], uniform bool diff, int prev_qcenter[3]) +{ + if (diff) + { + for (uniform int p = 0; p < 3; p++) + { + qcenter[p] = quantize_5bits(center[p]); + + if (prev_qcenter[0] >= 0) + { + if (qcenter[p] - prev_qcenter[p] > 3) qcenter[p] = prev_qcenter[p] + 3; + if (qcenter[p] - prev_qcenter[p] < -4) qcenter[p] = prev_qcenter[p] - 4; + } + + center[p] = extend_5to8bits(qcenter[p]); + } + } + else + { + for (uniform int p = 0; p < 3; p++) + { + qcenter[p] = quantize_4bits(center[p]); + center[p] = extend_4to8bits(qcenter[p]); + } + } +} + +float quantize_pixels_etc1_half(uint32 qblock[1], float block[48], float center[3], uniform int table) +{ + float total_err = 0; + uint32 bits = 0; + + for (uniform int y = 0; y 
< 2; y++) + for (uniform int x = 0; x < 4; x++) + { + float best_err = sq(255) * 3; + int best_q = -1; + + for (uniform int q = 0; q < 4; q++) + { + int dY = get_etc1_dY(table, remap_q[q]); + + float err = 0; + for (int p = 0; p < 3; p++) + err += sq(block[16 * p + y*4+x] - clamp(center[p] + dY, 0, 255)); + + if (err < best_err) + { + best_err = err; + best_q = q; + } + } + + assert(best_q >= 0); + + bits |= (best_q & 1) << (x * 4 + y); + bits |= (best_q >> 1) << (x * 4 + y + 16); + total_err += best_err; + } + + qblock[0] = bits; + return total_err; +} + +float compress_etc1_half_1(uint32 out_qbits[1], int out_table[1], int out_qcenter[3], + float half_pixels[], uniform bool diff, int prev_qcenter[3]) +{ + float dc[3]; + + for (uniform int p = 0; p<3; p++) dc[p] = 0; + + for (uniform int k = 0; k<8; k++) + { + for (uniform int p = 0; p<3; p++) + dc[p] += half_pixels[k + p * 16]; + } + + float best_error = sq(255) * 3 * 8.0f; + int best_table = -1; + int best_qcenter[3]; + uint32 best_qbits; + + for (uniform int table_level = 0; table_level < 8; table_level++) + { + float center[3]; + int qcenter[3]; + uint32 qbits; + + for (uniform int p = 0; p < 3; p++) center[p] = dc[p] / 8 - get_etc1_dY(table_level, 2); + center_quant_dequant(qcenter, center, diff, prev_qcenter); + + float err = quantize_pixels_etc1_half(&qbits, half_pixels, center, table_level); + + if (err < best_error) + { + best_error = err; + best_table = table_level; + best_qbits = qbits; + for (uniform int p = 0; p < 3; p++) best_qcenter[p] = qcenter[p]; + } + } + + out_table[0] = best_table; + out_qbits[0] = best_qbits; + for (uniform int p = 0; p < 3; p++) out_qcenter[p] = best_qcenter[p]; + return best_error; +} + +float optimize_center(float colors[4][10], uniform int p, uniform int table_level) +{ + float best_center = 0; + for (uniform int q = 0; q < 4; q++) + { + best_center += (colors[q][7 + p] - get_etc1_dY(table_level, q)) * colors[q][3]; + } + best_center /= 8; + + float best_err = 0; + for 
(uniform int q = 0; q < 4; q++) + { + float dY = get_etc1_dY(table_level, q); + best_err += sq(clamp(best_center + dY, 0, 255) - colors[q][7 + p]) * colors[q][3]; + } + + for (uniform int branch = 0; branch < 4; branch++) + { + float new_center = 0; + float sum = 0; + for (uniform int q = 0; q < 4; q++) + { + if (branch <= 1 && q <= branch) continue; + if (branch >= 2 && q >= branch) continue; + new_center += (colors[q][7 + p] - get_etc1_dY(table_level, q)) * colors[q][3]; + sum += colors[q][3]; + } + + new_center /= sum; + + float err = 0; + for (uniform int q = 0; q < 4; q++) + { + float dY = get_etc1_dY(table_level, q); + err += sq(clamp(new_center + dY, 0, 255) - colors[q][7 + p]) * colors[q][3]; + } + + if (err < best_err) + { + best_err = err; + best_center = new_center; + } + } + + return best_center; +} + +float compress_etc1_half_7(uint32 out_qbits[1], int out_table[1], int out_qcenter[3], + float half_pixels[], etc_enc_state state[]) +{ + int err_list[165]; + int y_sorted_inv[8]; + float y_sorted[8]; + + { + int y_sorted_idx[8]; + for (uniform int k = 0; k < 8; k++) + { + float value = 0; + for (uniform int p = 0; p < 3; p++) + value += half_pixels[k + p * 16]; + + y_sorted_idx[k] = (((int)value) << 4) + k; + } + + partial_sort_list(y_sorted_idx, 8, 8); + + for (uniform int k = 0; k < 8; k++) + y_sorted_inv[k] = ((y_sorted_idx[k] & 0xF) << 4) + k; + + for (uniform int k = 0; k < 8; k++) + y_sorted[k] = (y_sorted_idx[k] >> 4) / 3.0f; + + partial_sort_list(y_sorted_inv, 8, 8); + } + + uniform int idx = -1; + for (uniform int level1 = 0; level1 <= 8; level1++) + for (uniform int level2 = level1; level2 <= 8; level2++) + for (uniform int level3 = level2; level3 <= 8; level3++) + { + idx++; + assert(idx < 165); + + float sum[4]; + float sum_sq[4]; + float count[4]; + float inv_count[4]; + + for (uniform int q = 0; q < 4; q++) + { + sum[q] = 0; + sum_sq[q] = 0; + count[q] = 0; + inv_count[q] = 0; + } + + for (uniform int k = 0; k < 8; k++) + { + uniform int q = 
0; + if (k >= level1) q = 1; + if (k >= level2) q = 2; + if (k >= level3) q = 3; + + sum[q] += y_sorted[k]; + sum_sq[q] += sq(y_sorted[k]); + count[q] += 1; + } + + for (uniform int q = 0; q < 4; q++) + { + if (count[q] > 0) inv_count[q] = 1 / count[q]; + } + + float base_err = 0; + for (uniform int q = 0; q < 4; q++) base_err += sum_sq[q] - sq(sum[q]) * inv_count[q]; + + float t_err = sq(256) * 8; + for (uniform int table_level = 0; table_level < 8; table_level++) + { + float center = 0; + for (uniform int q = 0; q < 4; q++) center += sum[q] - get_etc1_dY(table_level, q) * count[q]; + center /= 8; + + float err = base_err; + for (uniform int q = 0; q < 4; q++) + { + err += sq(center + get_etc1_dY(table_level, q) - sum[q] * inv_count[q])*count[q]; + } + + t_err = min(t_err, err); + } + + int packed = (level1 * 16 + level2) * 16 + level3; + + err_list[idx] = (((int)t_err) << 12) + packed; + } + + partial_sort_list(err_list, 165, state->fastSkipTreshold); + + float best_error = sq(255) * 3 * 8.0f; + int best_table = -1; + int best_qcenter[3]; + uint32 best_qbits; + + for (uniform int i = 0; i < state->fastSkipTreshold; i++) + { + int packed = err_list[i] & 0xFFF; + int level1 = (packed >> 8) & 0xF; + int level2 = (packed >> 4) & 0xF; + int level3 = (packed >> 0) & 0xF; + + float colors[4][10]; + + for (uniform int p = 0; p < 7; p++) + for (uniform int q = 0; q < 4; q++) colors[q][p] = 0; + + uint32 qbits = 0; + for (uniform int kk = 0; kk < 8; kk++) + { + int k = y_sorted_inv[kk] & 0xF; + + int qq = 0; + if (k >= level1) qq = 1; + if (k >= level2) qq = 2; + if (k >= level3) qq = 3; + + uniform int xx = kk & 3; + uniform int yy = kk >> 2; + + int qqq = get_remap2_q(qq); + qbits |= (qqq & 1) << (yy + xx * 4); + qbits |= (qqq >> 1) << (16 + yy + xx * 4); + + float qvec[4]; + for (uniform int q = 0; q < 4; q++) + { + qvec[q] = q == qq ? 
1.0 : 0.0; + colors[q][3] += qvec[q]; + } + + for (uniform int p = 0; p < 3; p++) + { + float value = half_pixels[16 * p + kk]; + for (uniform int q = 0; q < 4; q++) + { + colors[q][p] += value * qvec[q]; + colors[q][4 + p] += sq(value) * qvec[q]; + } + } + } + + float base_err = 0; + for (uniform int q = 0; q < 4; q++) + { + if (colors[q][3] > 0) + for (uniform int p = 0; p < 3; p++) + { + colors[q][7 + p] = colors[q][p] / colors[q][3]; + base_err += colors[q][4 + p] - sq(colors[q][7 + p])*colors[q][3]; + } + } + + for (uniform int table_level = 0; table_level < 8; table_level++) + { + float center[3]; + int qcenter[3]; + + for (uniform int p = 0; p < 3; p++) + { + center[p] = optimize_center(colors, p, table_level); + } + + center_quant_dequant(qcenter, center, state->diff, state->prev_qcenter); + + float err = base_err; + for (uniform int q = 0; q < 4; q++) + { + int dY = get_etc1_dY(table_level, q); + for (uniform int p = 0; p < 3; p++) + err += sq(clamp(center[p] + dY, 0, 255) - colors[q][7 + p])*colors[q][3]; + } + + if (err < best_error) + { + best_error = err; + best_table = table_level; + best_qbits = qbits; + for (uniform int p = 0; p < 3; p++) best_qcenter[p] = qcenter[p]; + } + } + } + + out_table[0] = best_table; + out_qbits[0] = best_qbits; + for (uniform int p = 0; p < 3; p++) out_qcenter[p] = best_qcenter[p]; + return best_error; +} + +float compress_etc1_half(uint32 qbits[1], int table[1], int qcenter[3], float half_pixels[], etc_enc_state state[]) +{ + float err = compress_etc1_half_7(qbits, table, qcenter, half_pixels, state); + + for (uniform int p = 0; p < 3; p++) + state->prev_qcenter[p] = qcenter[p]; + + return err; +} + +////////////////////////// +// ETC1 core + +inline uint32 bswap32(uint32 v) +{ + uint32 r = 0; + r += ((v >> 24) & 255) << 0; + r += ((v >> 16) & 255) << 8; + r += ((v >> 8) & 255) << 16; + r += ((v >> 0) & 255) << 24; + return r; +} + +void etc_pack(uint32 data[], uint32 qbits[2], int tables[2], int qcenters[2][3], uniform 
int diff, uniform int flip) +{ + for (uniform int k = 0; k < 2; k++) data[k] = 0; + uniform int pos = 0; + + if (diff == 0) + { + put_bits(data, &pos, 4, qcenters[1][0]); + put_bits(data, &pos, 4, qcenters[0][0]); + + put_bits(data, &pos, 4, qcenters[1][1]); + put_bits(data, &pos, 4, qcenters[0][1]); + + put_bits(data, &pos, 4, qcenters[1][2]); + put_bits(data, &pos, 4, qcenters[0][2]); + } + else + { + put_bits(data, &pos, 3, (qcenters[1][0] - qcenters[0][0]) & 7); + put_bits(data, &pos, 5, qcenters[0][0]); + + put_bits(data, &pos, 3, (qcenters[1][1] - qcenters[0][1]) & 7); + put_bits(data, &pos, 5, qcenters[0][1]); + + put_bits(data, &pos, 3, (qcenters[1][2] - qcenters[0][2]) & 7); + put_bits(data, &pos, 5, qcenters[0][2]); + } + + put_bits(data, &pos, 1, flip); + put_bits(data, &pos, 1, diff); + put_bits(data, &pos, 3, tables[1]); + put_bits(data, &pos, 3, tables[0]); + + uint32 all_qbits_flipped = (qbits[1] << 2) | qbits[0]; + uint32 all_qbits = 0; + + if (flip != 0) all_qbits = all_qbits_flipped; + + if (flip == 0) + for (uniform int k = 0; k < 2; k++) + for (uniform int y = 0; y < 4; y++) + for (uniform int x = 0; x < 4; x++) + { + int bit = (all_qbits_flipped >> (k * 16 + x * 4 + y)) & 1; + all_qbits += bit << (k * 16 + y * 4 + x); + } + + data[1] = bswap32(all_qbits); +} + +inline void CompressBlockETC1_core(etc_enc_state state[]) +{ + float flipped_block[48]; + + for (uniform int y = 0; y < 4; y++) + for (uniform int x = 0; x < 4; x++) + for (uniform int p = 0; p < 3; p++) + { + flipped_block[16 * p + x * 4 + y] = state->block[16 * p + y * 4 + x]; + } + + for (uniform int flip = 0; flip < 2; flip++) + for (uniform int diff = 1; diff >= 0; diff--) + { + state->diff = diff == 1; + state->prev_qcenter[0] = -1; + + varying float * uniform pixels = state->block; + if (flip == 0) pixels = flipped_block; + + uint32 qbits[2]; + int tables[2]; + int qcenters[2][3]; + + float err = 0; + err += compress_etc1_half(&qbits[0], &tables[0], qcenters[0], &pixels[0], 
state); + err += compress_etc1_half(&qbits[1], &tables[1], qcenters[1], &pixels[8], state); + + if (err < state->best_err) + { + state->best_err = err; + etc_pack(state->best_data, qbits, tables, qcenters, diff, flip); + } + } +} + +void etc_enc_copy_settings(etc_enc_state state[], uniform etc_enc_settings settings[]) +{ + state->fastSkipTreshold = settings->fastSkipTreshold; +} + +inline void CompressBlockETC1(uniform rgba_surface src[], int xx, uniform int yy, uniform uint8 dst[], uniform etc_enc_settings settings[]) +{ + etc_enc_state _state; + varying etc_enc_state* uniform state = &_state; + + etc_enc_copy_settings(state, settings); + load_block_interleaved(state->block, src, xx, yy); + state->best_err = 1e99; + + CompressBlockETC1_core(state); + + store_data(dst, src->width, xx, yy, state->best_data, 2); +} + +export void CompressBlocksETC1_ispc(uniform rgba_surface src[], uniform uint8 dst[], uniform etc_enc_settings settings[]) +{ + for (uniform int yy = 0; yyheight / 4; yy++) + foreach(xx = 0 ... 
src->width / 4) + { + CompressBlockETC1(src, xx, yy, dst, settings); + } +} + +export uniform int ISPCIsa_ispc() +{ +#if defined(ISPC_TARGET_SSE2) + return 0; +#elif defined(ISPC_TARGET_SSE4) + return 1; +#elif defined(ISPC_TARGET_AVX2) + return 2; +#else + return -1; +#endif +} \ No newline at end of file diff --git a/Source/ispc_texcomp/kernel_astc.ispc b/Source/ispc_texcomp/kernel_astc.ispc new file mode 100644 index 0000000..98f3e5f --- /dev/null +++ b/Source/ispc_texcomp/kernel_astc.ispc @@ -0,0 +1,2272 @@ +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016, Intel Corporation +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of +// the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +typedef int8 int8_t; +typedef int32 int32_t; +typedef int64 int64_t; + +typedef unsigned int8 uint8_t; +typedef unsigned int32 uint32_t; +typedef unsigned int64 uint64_t; + +/////////////////////////// +// generic helpers + +inline float RCP(float x) +{ + return 1.0f/x; // uses rcp when compiled with --opt=fast-math + //return rcp(x); + //return rcp_fast(x); +} + +inline float RSQRT(float x) +{ + return 1.0f/sqrt(x); // uses rsqrt when compiled with --opt=fast-math + //return rsqrt(x); + //return rsqrt_fast(x); +} + +void swap(float& a, float& b) +{ + int t = a; + a = b; b = t; +} + +void swap(int& a, int& b) +{ + int t = a; + a = b; b = t; +} + +void swap(uint32_t& a, uint32_t& b) +{ + uint32_t t = a; + a = b; b = t; +} + +void swap(uint8_t& a, uint8_t& b) +{ + uint8_t t = a; + a = b; b = t; +} + +inline float sq(float v) +{ + return v*v; +} + +inline float clamp(float v, int a, int b) +{ + return clamp(v, (float)a, (float)b); +} + +inline float dot3(float a[3], float b[3]) +{ + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; +} + +inline float dot4(float a[4], float b[4]) +{ + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]; +} + +// the following helpers isolate performance warnings + +inline uint32_t gather_uint(const uniform uint32_t* const uniform ptr, int idx) +{ + return ptr[idx]; // (perf warning expected) +} + +inline float gather_float(uniform float* uniform ptr, int idx) +{ + return ptr[idx]; // (perf warning expected) +} + +inline float gather_float(varying float* uniform ptr, int idx) +{ + return ptr[idx]; // (perf warning expected) +} + +inline void scatter_uint(uniform uint32_t* ptr, int idx, uint32_t value) +{ + ptr[idx] = value; // (perf warning expected) +} + +inline void scatter_float(uniform float* uniform ptr, int idx, float value) +{ + ptr[idx] = value; // (perf warning expected) +} + +inline void 
scatter_float(varying float* uniform ptr, int idx, float value) +{ + ptr[idx] = value; // (perf warning expected) +} + +/////////////////////////////////////////////////////////// +// ASTC shared functions + +struct rgba_surface +{ + uint8_t* ptr; + int width, height, stride; +}; + +inline void set_pixel(float pixels[], uniform int p, uniform int x, uniform int y, float value); + +inline void load_block_interleaved(float pixels[], uniform rgba_surface src[], int xx, int yy, uniform int width, uniform int height) +{ + uniform int pitch = width * height; + for (uniform int y = 0; y < height; y++) + for (uniform int x = 0; x < width; x++) + { + uint32_t rgba = gather_uint((uint32_t*)src->ptr, ((yy * height + y)*src->stride + (xx * width + x) * 4)/4); + + set_pixel(pixels, 0, x, y, (int)((rgba >> 0) & 255)); + set_pixel(pixels, 1, x, y, (int)((rgba >> 8) & 255)); + set_pixel(pixels, 2, x, y, (int)((rgba >> 16) & 255)); + set_pixel(pixels, 3, x, y, (int)((rgba >> 24) & 255)); + } +} + +struct astc_enc_settings +{ + int block_width; + int block_height; + int channels; + + int fastSkipTreshold; + int refineIterations; +}; + +export uniform int get_programCount() +{ + return programCount; +} + +inline float get_pixel(float pixels[], uniform int p, uniform int x, uniform int y) +{ + uniform static const int ystride = 8; + uniform static const int pstride = 64; + + return pixels[pstride * p + ystride * y + x]; +} + +inline void set_pixel(float pixels[], uniform int p, uniform int x, uniform int y, float value) +{ + uniform static const int ystride = 8; + uniform static const int pstride = 64; + + pixels[pstride * p + ystride * y + x] = value; +} + +struct pixel_set +{ + varying float* uniform pixels; + + uniform int width; + uniform int height; +}; + +inline void clear_alpha(float pixels[], uniform int width, uniform int height) +{ + for (uniform int y = 0; y < height; y++) + for (uniform int x = 0; x < width; x++) + { + set_pixel(pixels, 3, x, y, 255); + } +} + +void 
rotate_plane(pixel_set block[], int p) +{ + uniform int pitch = block->height * block->width; + + for (uniform int y = 0; y < block->height; y++) + for (uniform int x = 0; x < block->width; x++) + { + float r = get_pixel(block->pixels, 0, x, y); + float g = get_pixel(block->pixels, 1, x, y); + float b = get_pixel(block->pixels, 2, x, y); + float a = get_pixel(block->pixels, 3, x, y); + + if (p == 0) swap(a, r); + if (p == 1) swap(a, g); + if (p == 2) swap(a, b); + + set_pixel(block->pixels, 0, x, y, r); + set_pixel(block->pixels, 1, x, y, g); + set_pixel(block->pixels, 2, x, y, b); + set_pixel(block->pixels, 3, x, y, a); + } +} + +inline void compute_moments(float stats[15], pixel_set block[], uniform int channels) +{ + for (uniform int y = 0; y < block->height; y++) + for (uniform int x = 0; x < block->width; x++) + { + float rgba[4]; + for (uniform int p = 0; p < channels; p++) rgba[p] = get_pixel(block->pixels, p, x, y); + + stats[10] += rgba[0]; + stats[11] += rgba[1]; + stats[12] += rgba[2]; + + stats[0] += rgba[0] * rgba[0]; + stats[1] += rgba[0] * rgba[1]; + stats[2] += rgba[0] * rgba[2]; + + stats[4] += rgba[1] * rgba[1]; + stats[5] += rgba[1] * rgba[2]; + + stats[7] += rgba[2] * rgba[2]; + + if (channels == 4) + { + stats[13] += rgba[3]; + + stats[3] += rgba[0] * rgba[3]; + stats[6] += rgba[1] * rgba[3]; + stats[8] += rgba[2] * rgba[3]; + stats[9] += rgba[3] * rgba[3]; + } + } + + stats[14] += block->height * block->width; +} + +inline void covar_from_stats(float covar[10], float stats[15], uniform int channels) +{ + covar[0] = stats[0] - stats[10 + 0] * stats[10 + 0] / stats[14]; + covar[1] = stats[1] - stats[10 + 0] * stats[10 + 1] / stats[14]; + covar[2] = stats[2] - stats[10 + 0] * stats[10 + 2] / stats[14]; + + covar[4] = stats[4] - stats[10 + 1] * stats[10 + 1] / stats[14]; + covar[5] = stats[5] - stats[10 + 1] * stats[10 + 2] / stats[14]; + + covar[7] = stats[7] - stats[10 + 2] * stats[10 + 2] / stats[14]; + + if (channels == 4) + { + covar[3] = 
stats[3] - stats[10 + 0] * stats[10 + 3] / stats[14];
+ covar[6] = stats[6] - stats[10 + 1] * stats[10 + 3] / stats[14];
+ covar[8] = stats[8] - stats[10 + 2] * stats[10 + 3] / stats[14];
+ covar[9] = stats[9] - stats[10 + 3] * stats[10 + 3] / stats[14];
+ }
+}
+
+// Computes the block's covariance matrix and per-channel mean (dc).
+// zero_based treats the origin (not the mean) as the center: the mean sums are
+// zeroed before covar_from_stats, but dc would then also read 0 — callers rely
+// on the covariance, not dc, in that mode.
+inline void compute_covar_dc(float covar[], float dc[], pixel_set block[], bool zero_based, uniform int channels)
+{
+ float stats[15] = { 0 };
+ compute_moments(stats, block, channels);
+
+ if (zero_based)
+ for (uniform int p = 0; p < 4; p++) stats[10 + p] = 0;
+
+ covar_from_stats(covar, stats, channels);
+ for (uniform int p = 0; p < channels; p++) dc[p] = stats[10 + p] / stats[14];
+}
+
+// a = covar * b for the 3x3 symmetric matrix stored in packed upper-triangular form.
+inline void ssymv3(float a[4], float covar[10], float b[4])
+{
+ a[0] = covar[0] * b[0] + covar[1] * b[1] + covar[2] * b[2];
+ a[1] = covar[1] * b[0] + covar[4] * b[1] + covar[5] * b[2];
+ a[2] = covar[2] * b[0] + covar[5] * b[1] + covar[7] * b[2];
+}
+
+// a = covar * b for the 4x4 symmetric matrix in packed upper-triangular form.
+inline void ssymv4(float a[4], float covar[10], float b[4])
+{
+ a[0] = covar[0] * b[0] + covar[1] * b[1] + covar[2] * b[2] + covar[3] * b[3];
+ a[1] = covar[1] * b[0] + covar[4] * b[1] + covar[5] * b[2] + covar[6] * b[3];
+ a[2] = covar[2] * b[0] + covar[5] * b[1] + covar[7] * b[2] + covar[8] * b[3];
+ a[3] = covar[3] * b[0] + covar[6] * b[1] + covar[8] * b[2] + covar[9] * b[3];
+}
+
+// Power iteration: estimates the dominant eigenvector of covar (the block's
+// principal axis). The result is not normalized to unit length.
+inline void compute_axis(float axis[4], float covar[10], uniform const int powerIterations, uniform int channels)
+{
+ float vec[4] = { 1, 1, 1, 1 };
+
+ for (uniform int i = 0; i < powerIterations; i++)
+ {
+ if (channels == 3) ssymv3(axis, covar, vec);
+ if (channels == 4) ssymv4(axis, covar, vec);
+ for (uniform int p = 0; p < channels; p++) vec[p] = axis[p];
+
+ if (i % 2 == 1) // renormalize every other iteration
+ {
+ float norm_sq = 0;
+ for (uniform int p = 0; p < channels; p++)
+ norm_sq += axis[p] * axis[p];
+
+ float rnorm = RSQRT(norm_sq);
+ for (uniform int p = 0; p < channels; p++) vec[p] *= rnorm;
+ }
+ }
+
+ for (uniform int p = 0; p < channels; p++) axis[p] = vec[p];
+}
+
+// Fits a line through the block's colors via PCA and projects all pixels onto
+// it; ep holds the resulting min/max endpoints interleaved as
+// ep[p*2 + 0] = low, ep[p*2 + 1] = high for each channel p.
+void compute_pca_endpoints(float ep[8], pixel_set block[], bool zero_based, uniform int channels)
+{
+ float dc[4];
+ float cov[10];
+ compute_covar_dc(cov, dc, block, zero_based, channels);
+
+ uniform int powerIterations = 10;
+
+ // Regularize the diagonal so power iteration converges on flat blocks.
+ float eps = sq(0.001) * 1000;
+ cov[0] += eps;
+ cov[4] += eps;
+ cov[7] += eps;
+ cov[9] += eps;
+
+ float dir[4];
+ compute_axis(dir, cov, powerIterations, channels);
+
+ // Projection extents along the axis, relative to the mean.
+ float ext[2] = { 1000, -1000 };
+
+ for (uniform int y = 0; y < block->height; y++)
+ for (uniform int x = 0; x < block->width; x++)
+ {
+ float proj = 0;
+ for (uniform int p = 0; p < channels; p++) proj += (get_pixel(block->pixels, p, x, y) - dc[p]) * dir[p];
+
+ ext[0] = min(ext[0], proj);
+ ext[1] = max(ext[1], proj);
+ }
+
+ // Keep the endpoints at least 1 unit apart to avoid a degenerate line.
+ if (ext[1] - 1.0f < ext[0])
+ {
+ ext[1] += 0.5f;
+ ext[0] -= 0.5f;
+ }
+
+ for (uniform int i = 0; i < 2; i++)
+ for (uniform int p = 0; p < channels; p++)
+ {
+ ep[p * 2 + i] = dc[p] + dir[p] * ext[i];
+ }
+}
+
+// ASTC quantization ranges decomposed into bit/trit/quint factors:
+// levels = (1 + 2*trits + 4*quints) << bits (see get_levels below).
+uniform static const int range_table[][3] =
+{
+ //2^ 3^ 5^
+ { 1, 0, 0 }, // 0..1
+ { 0, 1, 0 }, // 0..2
+ { 2, 0, 0 }, // 0..3
+
+ { 0, 0, 1 }, // 0..4
+ { 1, 1, 0 }, // 0..5
+ { 3, 0, 0 }, // 0..7
+
+ { 1, 0, 1 }, // 0..9
+ { 2, 1, 0 }, // 0..11
+ { 4, 0, 0 }, // 0..15
+
+ { 2, 0, 1 }, // 0..19
+ { 3, 1, 0 }, // 0..23
+ { 5, 0, 0 }, // 0..31
+
+ { 3, 0, 1 }, // 0..39
+ { 4, 1, 0 }, // 0..47
+ { 6, 0, 0 }, // 0..63
+
+ { 4, 0, 1 }, // 0..79
+ { 5, 1, 0 }, // 0..95
+ { 7, 0, 0 }, // 0..127
+
+ { 5, 0, 1 }, // 0..159
+ { 6, 1, 0 }, // 0..191
+ { 8, 0, 0 }, // 0..255
+};
+
+// Number of quantization levels for a range index (uniform table lookup variant).
+uniform int get_levels(uniform int range)
+{
+ return (1 + 2 * range_table[range][1] + 4 * range_table[range][2]) << range_table[range][0];
+}
+
+// Decomposition of a quantization range: mantissa levels (2/3/5), a fixed-point
+// reciprocal of it, the exponent (bit count), and the total level count.
+struct range_values
+{
+ int levels_m;
+ int levels_m_rcp;
+ int levels_e;
+ int levels;
+};
+
+// Fills range_values for a varying range index without a table lookup.
+// range/3 is computed as (range * 21846) >> 16 — exact for the small range ids used here.
+void fill_range_values(range_values values[], int _range[])
+{
+ int range = *_range;
+ int range_div3 = (range * 21846) >> 16;
+ int range_mod3 = range - range_div3 * 3;
+
+ int levels_m = max(2, 5 - range_mod3 * 2);
+ 
int levels_e = max(0, range_mod3 + range_div3 - 1);
+ if (range == 0) levels_m = 2;
+
+ // 16.16 fixed-point reciprocal of the mantissa level count (2, 3 or 5).
+ int levels_m_rcp = 0x10000 / 2 + 1;
+ if (levels_m == 3) levels_m_rcp = 0x10000 / 3 + 1;
+ if (levels_m == 5) levels_m_rcp = 0x10000 / 5 + 1;
+
+ values->levels_e = levels_e;
+ values->levels_m = levels_m;
+ values->levels_m_rcp = levels_m_rcp;
+
+ values->levels = levels_m << levels_e;
+}
+
+// Convenience wrapper returning range_values by value.
+range_values get_range_values(int range)
+{
+ range_values values;
+ fill_range_values(&values, &range);
+ return values;
+}
+
+// Varying overload of get_levels: computes the level count arithmetically
+// instead of via range_table (no gather needed).
+int get_levels(int range)
+{
+ int range_div3 = (range * 21846) >> 16;
+ int range_mod3 = range - range_div3 * 3;
+
+ int levels_m = max(2, 5 - range_mod3 * 2);
+ int levels_e = range_mod3 + range_div3 - 1;
+
+ // Shift-then-halve so levels_e = -1 (range 0) still yields levels_m / nonneg shift.
+ return (levels_m << (levels_e + 1)) >> 1;
+}
+
+// Precomputed 1/levels^2 per range index — matches get_levels(range) for each entry.
+uniform float get_sq_rcp_levels(uniform int range)
+{
+ uniform static const float table[] =
+ {
+ 1.000000, 0.250000, 0.111111,
+ 0.062500, 0.040000, 0.020408,
+ 0.012346, 0.008264, 0.004444,
+ 0.002770, 0.001890, 0.001041,
+ 0.000657, 0.000453, 0.000252,
+ 0.000160, 0.000111, 0.000062,
+ 0.000040, 0.000027, 0.000015,
+ };
+
+ return table[range];
+}
+
+///////////////////////////////////////////////////////////
+// ASTC candidate ranking
+
+// Per-block scratch state for mode ranking: cached pixels, precomputed error
+// metrics (indexed [zero_based][plane]), and the current best candidate list.
+struct astc_rank_state
+{
+ float pixels[256];
+
+ float pca_error[2][5];
+ float alpha_error[2][5];
+ float sq_norm[2][5];
+ float scale_error[7][7]; // 2x2 to 8x8
+
+ float best_scores[64];
+ uint32_t best_modes[64];
+
+ // settings
+ uniform int block_width;
+ uniform int block_height;
+ uniform int pitch;
+
+ uniform int fastSkipTreshold;
+};
+
+// Decoded description of one ASTC block mode candidate.
+struct astc_mode
+{
+ int width;
+ int height;
+ bool dual_plane;
+ int weight_range;
+ int color_component_selector;
+ int partitions;
+ int partition_id;
+ int color_endpoint_pairs;
+ int color_endpoint_modes[2];
+ int endpoint_range;
+};
+
+// 4-point DCT over values[0], values[stride], ... (in place), using a
+// precomputed cosine table and a butterfly pre-pass.
+void dct_4(float values[], uniform int stride)
+{
+ uniform static const float scale[] = { 0.5, 0.707106769 };
+ uniform static const float c[5] = { 1, 0.923879533, 0.707106769, 
0.382683432, 0 };
+
+ // Butterfly: sums in data[0..1], differences in data[2..3].
+ float data[4];
+ for (uniform int i = 0; i < 2; i++)
+ {
+ float a = values[stride * i];
+ float b = values[stride * (3 - i)];
+ data[0 + i] = a + b;
+ data[2 + i] = a - b;
+ }
+
+ for (uniform int i = 0; i < 4; i++)
+ {
+ float acc = 0;
+ // Even outputs read the sums, odd outputs the differences.
+ varying float* uniform input = &data[(i % 2) * 2];
+ for (uniform int j = 0; j < 2; j++)
+ {
+ // Fold the cosine argument into the first quadrant via table index e and sign w.
+ uniform int e = (2 * j + 1)*i;
+ e = e % (4 * 4);
+ uniform float w = 1;
+ if (e>8) { e = 16 - e; }
+ if (e>4) { w = -1; e = 8 - e; }
+ w *= c[e];
+ acc += w * input[j];
+ }
+
+ values[stride * i] = acc * scale[i > 0];
+ }
+}
+
+// 6-point DCT, same structure as dct_4 (butterfly + quadrant-folded cosine table).
+void dct_6(float values[], uniform int stride)
+{
+ uniform static const float scale[] = { 0.408248290, 0.577350269 };
+ uniform static const float c[7] =
+ { 1, 0.965925813, 0.866025388, 0.707106769, 0.500000000, 0.258819044, 0 };
+
+ float data[6];
+ for (uniform int i = 0; i < 3; i++)
+ {
+ float a = values[stride * i];
+ float b = values[stride * (5 - i)];
+ data[0 + i] = a + b;
+ data[3 + i] = a - b;
+ }
+
+ for (uniform int i = 0; i < 6; i++)
+ {
+ float acc = 0;
+ varying float* uniform input = &data[(i % 2) * 3];
+ for (uniform int j = 0; j < 3; j++)
+ {
+ uniform int e = (2 * j + 1)*i;
+ e = e % (4 * 6);
+ uniform float w = 1;
+ if (e>12) { e = 24 - e; }
+ if (e>6) { w = -1; e = 12 - e; }
+ w *= c[e];
+ acc += w * input[j];
+ }
+
+ values[stride * i] = acc * scale[i > 0];
+ }
+}
+
+// General n-point DCT (n <= 16), computing the cosine table on the fly.
+// No butterfly pre-pass — the direct O(n^2) form.
+void dct_n(float values[], uniform int stride, uniform int n)
+{
+ uniform static const float pi = 3.14159265358979323846;
+
+ assert(n <= 16);
+ uniform float c[16 + 1];
+ for (uniform int i = 0; i <= n; i++)
+ c[i] = cos((i / (4.0 * n) * 2 * pi));
+
+ uniform float scale[] = { 1 / sqrt(1.0*n), 1 / sqrt(n / 2.0), };
+
+ float data[16];
+ for (uniform int i = 0; i < n; i++)
+ data[i] = values[stride * i];
+
+ for (uniform int i = 0; i < n; i++)
+ {
+ float acc = 0;
+ for (uniform int j = 0; j < n; j++)
+ {
+ uniform int e = (2 * j + 1)*i;
+ e = e % (4 * n);
+ float w = 1;
+ if (e > 2 * n) { e = 4 * n - 
e; }
+ if (e > n) { w = -1; e = 2 * n - e; }
+ assert(e <= n);
+ w *= c[e];
+ acc += w * data[j];
+ }
+
+ values[stride * i] = acc * scale[i > 0];
+ }
+}
+
+// Dispatches to the specialized 4/6-point kernels, falling back to dct_n for 5
+// and 8; other sizes are not expected here (assert).
+void dct(float values[], uniform int stride, uniform int n)
+{
+ if (false) {}
+ else if (n == 8) dct_n(values, stride, 8);
+ else if (n == 6) dct_6(values, stride);
+ else if (n == 5) dct_n(values, stride, 5);
+ else if (n == 4) dct_4(values, stride);
+ else
+ {
+ assert(false);
+ }
+}
+
+// 2D separable DCT of each channel plane in place: rows first, then columns.
+void compute_dct_inplace(pixel_set block[], uniform int channels)
+{
+ uniform static const int stride = 8;
+ uniform static const int pitch = 64;
+
+ for (uniform int p = 0; p < channels; p++)
+ {
+ for (uniform int y = 0; y < block->height; y++)
+ dct(&block->pixels[pitch * p + y * stride], 1, block->width);
+
+ for (uniform int x = 0; x < block->width; x++)
+ dct(&block->pixels[pitch * p + x], stride, block->height);
+ }
+}
+
+// Precomputes all ranking metrics for the current block into state:
+// PCA line-fit errors (4-channel and per-plane-rotated 3-channel, mean- and
+// zero-based), alpha errors, axis lengths, and DCT-based downscale errors.
+void compute_metrics(astc_rank_state state[])
+{
+ // Work on a scratch copy so state->pixels stays untouched (the copy is
+ // rotated and finally DCT-transformed in place).
+ float temp_pixels[256];
+ pixel_set _pset; varying pixel_set* uniform pset = &_pset;
+ pset->pixels = temp_pixels;
+ pset->width = state->block_width;
+ pset->height = state->block_height;
+
+ for (uniform int p = 0; p < 4; p++)
+ for (uniform int y = 0; y < state->block_height; y++)
+ for (uniform int x = 0; x < state->block_width; x++)
+ {
+ float value = get_pixel(state->pixels, p, x, y);
+ set_pixel(pset->pixels, p, x, y, value);
+ }
+
+ // Single-plane (RGBA) PCA errors; i selects mean-based vs zero-based fit.
+ for (uniform int i = 0; i < 2; i++)
+ {
+ bool zero_based = (i == 1);
+ float endpoints[8];
+ compute_pca_endpoints(endpoints, pset, zero_based, 4);
+
+ float base[4], dir[4];
+ for (int p = 0; p < 4; p++) dir[p] = endpoints[p * 2 + 1] - endpoints[p * 2];
+ for (int p = 0; p < 4; p++) base[p] = endpoints[p * 2];
+ float sq_norm = dot4(dir, dir) + 0.00001;
+
+ float pca_error = 0;
+ float alpha_error = 0;
+ float pca_alpha_error = 0;
+ for (uniform int y = 0; y < state->block_height; y++)
+ for (uniform int x = 0; x < state->block_width; x++)
+ {
+ float pixel[4];
+ for (uniform int p = 0; p < 4; p++) pixel[p] = 
get_pixel(pset->pixels, p, x, y) - base[p];
+ float proj = dot4(pixel, dir) / sq_norm;
+ for (uniform int p = 0; p < 3; p++) pca_error += sq(get_pixel(pset->pixels, p, x, y) - (proj * dir[p] + base[p]));
+ pca_alpha_error += sq(get_pixel(pset->pixels, 3, x, y) - (proj * dir[3] + base[3]));
+ alpha_error += sq(get_pixel(pset->pixels, 3, x, y) - 255);
+ }
+
+ // Slot 0 = no plane rotation: fold alpha's fit error into pca_error and
+ // track how much choosing constant-opaque alpha would cost instead.
+ state->pca_error[i][0] = pca_error + pca_alpha_error;
+ state->alpha_error[i][0] = alpha_error - pca_alpha_error;
+ state->sq_norm[i][0] = sq_norm;
+ }
+
+ // Dual-plane variants: slot c (1..4) measures errors with channel c-1 swapped
+ // into the alpha plane (second-plane candidate) and a 3-channel PCA on the rest.
+ for (uniform int i = 0; i < 2; i++)
+ for (uniform int c = 1; c < 5; c++)
+ {
+ rotate_plane(pset, c - 1);
+
+ bool zero_based = (i == 1);
+ float endpoints[8];
+ compute_pca_endpoints(endpoints, pset, zero_based, 3);
+
+ float base[3], dir[3];
+ for (int p = 0; p < 3; p++) dir[p] = endpoints[p * 2 + 1] - endpoints[p * 2];
+ for (int p = 0; p < 3; p++) base[p] = endpoints[p * 2];
+ float sq_norm = dot3(dir, dir) + 0.00001;
+
+ float pca_error = 0;
+ float alpha_error = 0;
+ float pca_alpha_error = 0;
+ float ext[2] = { 1000, -1000 };
+ for (uniform int y = 0; y < state->block_height; y++)
+ for (uniform int x = 0; x < state->block_width; x++)
+ {
+ float pixel[3];
+ for (uniform int p = 0; p < 3; p++) pixel[p] = get_pixel(pset->pixels, p, x, y) - base[p];
+ float proj = dot3(pixel, dir) / sq_norm;
+ for (uniform int p = 0; p < 3; p++)
+ {
+ if (p == c - 1)
+ {
+ // After rotation, plane c-1 holds the original alpha values.
+ pca_alpha_error += sq(get_pixel(pset->pixels, p, x, y) - (proj * dir[p] + base[p]));
+ alpha_error += sq(get_pixel(pset->pixels, p, x, y) - 255);
+ }
+ else
+ {
+ pca_error += sq(get_pixel(pset->pixels, p, x, y) - (proj * dir[p] + base[p]));
+ }
+ }
+
+ // Extent of the channel moved out to the second plane.
+ float value = get_pixel(pset->pixels, 3, x, y);
+ ext[0] = min(ext[0], value);
+ ext[1] = max(ext[1], value);
+ }
+
+ state->pca_error[i][c] = pca_error + pca_alpha_error;
+ state->alpha_error[i][c] = alpha_error - pca_alpha_error;
+ state->sq_norm[i][c] = sq_norm + sq(ext[1] - ext[0]);
+
+ // rotate back
+ rotate_plane(pset, c - 1);
+ }
+
+ 
compute_dct_inplace(pset, 4);
+
+ // Downscale error per candidate weight-grid size (w x h): energy of the DCT
+ // coefficients that would be discarded by keeping only the top-left w x h.
+ for (uniform int h = 2; h <= state->block_height; h++)
+ for (uniform int w = 2; w <= state->block_width; w++)
+ {
+ uniform int stride = 8;
+ uniform int pitch = 64;
+
+ float sq_sum = 0;
+
+ for (uniform int y = 0; y < state->block_height; y++)
+ for (uniform int x = 0; x < state->block_width; x++)
+ {
+ if (y < h && x < w) continue;
+
+ for (uniform int p = 0; p < 4; p++)
+ sq_sum += sq(pset->pixels[pitch * p + stride * y + x]);
+ }
+
+ state->scale_error[h - 2][w - 2] = sq_sum;
+ }
+}
+
+// Heuristic error score for one candidate mode, built from the precomputed
+// metrics: discarded-DCT energy + PCA fit error + a quantization-noise term
+// derived from the weight and endpoint level counts.
+float estimate_error(astc_rank_state state[], uniform astc_mode mode[])
+{
+ // Metric slot: 0 = single plane, 1 + ccs = dual plane with that channel rotated out.
+ uniform int c = 0;
+ if (mode->dual_plane) c = 1 + mode->color_component_selector;
+
+ float scale_error = state->scale_error[mode->height - 2][mode->width - 2];
+
+ uniform bool zero_based = (mode->color_endpoint_modes[0] % 4) == 2;
+ float pca_error = state->pca_error[zero_based][c];
+ float sq_norm = state->sq_norm[zero_based][c];
+
+ // Endpoint modes <= 8 cannot encode alpha, so add the cost of flattening it.
+ if (mode->color_endpoint_modes[0] <= 8) pca_error += state->alpha_error[zero_based][c];
+
+ uniform float sq_rcp_w_levels = get_sq_rcp_levels(mode->weight_range);
+ uniform float sq_rcp_ep_levels = get_sq_rcp_levels(mode->endpoint_range);
+ float quant_error = 0;
+
+ // Empirically weighted quantization-noise estimate (2 and 9000 are tuning constants).
+ quant_error += 2 * sq_norm * sq_rcp_w_levels;
+ quant_error += 9000 * (state->block_height * state->block_width) * sq_rcp_ep_levels;
+
+ float error = 0;
+ error += scale_error;
+ error += pca_error;
+ error += quant_error;
+
+ return error;
+}
+
+// Inserts a candidate into the best-first list (insertion-sort step over the
+// first fastSkipTreshold slots) and reports the list's current worst score,
+// which callers can use as an early-reject threshold.
+void insert_element(astc_rank_state state[], float error, uint32_t packed_mode, float threshold_error[])
+{
+ float max_error = 0;
+
+ for (uniform int k = 0; k < state->fastSkipTreshold; k++)
+ {
+ if (state->best_scores[k] > error)
+ {
+ // Bubble the better candidate in; the displaced one continues down the list.
+ swap(state->best_scores[k], error);
+ swap(state->best_modes[k], packed_mode);
+ }
+
+ max_error = max(max_error, state->best_scores[k]);
+ }
+
+ *threshold_error = max_error;
+}
+
+// Precomputed table of all rankable ASTC mode candidates, bit-packed into
+// uint32 descriptors (decoded elsewhere; table continues past this chunk).
+uniform static const int packed_modes_count = 3334;
+uniform static const uint32_t packed_modes[3334] =
+{
+ 
0x0006D400, 0x0016D340, 0x0026D380, 0x0036CDC0, 0x00469400, 0x00569401, 0x00668702, 0x00769440, + 0x00868D41, 0x00969480, 0x00A68D81, 0x00B693C0, 0x00C688C1, 0x00D4D400, 0x00E4D401, 0x00F4C702, + 0x0104D440, 0x0114CD41, 0x0124D480, 0x0134CD81, 0x0144D3C0, 0x0154C8C1, 0x01667400, 0x01767401, + 0x01867302, 0x01966803, 0x01A67440, 0x01B67341, 0x01C66B42, 0x01D66443, 0x01E67480, 0x01F67381, + 0x02066B82, 0x02166483, 0x022674C0, 0x02366DC1, 0x024667C2, 0x0253D400, 0x0263D401, 0x0273D302, + 0x0283C803, 0x0293D440, 0x02A3D341, 0x02B3CB42, 0x02C3C443, 0x02D3D480, 0x02E3D381, 0x02F3CB82, + 0x0303C483, 0x0313D4C0, 0x0323CDC1, 0x0333C7C2, 0x03465400, 0x03565401, 0x03665402, 0x03765403, + 0x03865004, 0x03964705, 0x03A65440, 0x03B65441, 0x03C65342, 0x03D64E43, 0x03E64944, 0x03F65480, + 0x04065481, 0x04165382, 0x04264E83, 0x04364984, 0x044654C0, 0x045652C1, 0x04664DC2, 0x047649C3, + 0x048646C4, 0x049E5400, 0x04AE5240, 0x04BE5280, 0x04CE4DC0, 0x04DE5410, 0x04EE5250, 0x04FE5290, + 0x050E4DD0, 0x051E5420, 0x052E5260, 0x053E52A0, 0x054E4DE0, 0x055E52B0, 0x056E4DF0, 0x0572D400, + 0x0582D401, 0x0592D402, 0x05A2D403, 0x05B2D004, 0x05C2C705, 0x05D2D440, 0x05E2D441, 0x05F2D342, + 0x0602CE43, 0x0612C944, 0x0622D480, 0x0632D481, 0x0642D382, 0x0652CE83, 0x0662C984, 0x0672D4C0, + 0x0682D2C1, 0x0692CDC2, 0x06A2C9C3, 0x06B2C6C4, 0x06CAD400, 0x06DAD240, 0x06EAD280, 0x06FACDC0, + 0x070AD410, 0x071AD250, 0x072AD290, 0x073ACDD0, 0x074AD420, 0x075AD260, 0x076AD2A0, 0x077ACDE0, + 0x078AD2B0, 0x079ACDF0, 0x07A63400, 0x07B63401, 0x07C63402, 0x07D63403, 0x07E63404, 0x07F63405, + 0x08063306, 0x08162E07, 0x08262708, 0x08363440, 0x08463441, 0x08563442, 0x08663443, 0x08763444, + 0x08862F45, 0x08962B46, 0x08A62847, 0x08B63480, 0x08C63481, 0x08D63482, 0x08E63483, 0x08F63484, + 0x09062F85, 0x09162B86, 0x09262887, 0x093634C0, 0x094634C1, 0x095633C2, 0x096630C3, 0x09762EC4, + 0x09862AC5, 0x099627C6, 0x09A625C7, 0x09BE3400, 0x09CE3401, 0x09DE2502, 0x09EE3440, 0x09FE2C41, + 0x0A0E3480, 0x0A1E2C81, 0x0A2E33C0, 
0x0A3E28C1, 0x0A4E3410, 0x0A5E3411, 0x0A6E2512, 0x0A7E3450, + 0x0A8E2C51, 0x0A9E3490, 0x0AAE2C91, 0x0ABE33D0, 0x0ACE28D1, 0x0ADE3420, 0x0AEE3421, 0x0AFE2522, + 0x0B0E3460, 0x0B1E2C61, 0x0B2E34A0, 0x0B3E2CA1, 0x0B4E33E0, 0x0B5E28E1, 0x0B6E34B0, 0x0B7E2CB1, + 0x0B8E33F0, 0x0B9E28F1, 0x0BA1D400, 0x0BB1D401, 0x0BC1D402, 0x0BD1D403, 0x0BE1D404, 0x0BF1D405, + 0x0C01D306, 0x0C11CE07, 0x0C21C708, 0x0C31D440, 0x0C41D441, 0x0C51D442, 0x0C61D443, 0x0C71D444, + 0x0C81CF45, 0x0C91CB46, 0x0CA1C847, 0x0CB1D480, 0x0CC1D481, 0x0CD1D482, 0x0CE1D483, 0x0CF1D484, + 0x0D01CF85, 0x0D11CB86, 0x0D21C887, 0x0D31D4C0, 0x0D41D4C1, 0x0D51D3C2, 0x0D61D0C3, 0x0D71CEC4, + 0x0D81CAC5, 0x0D91C7C6, 0x0DA1C5C7, 0x0DB9D400, 0x0DC9D401, 0x0DD9C502, 0x0DE9D440, 0x0DF9CC41, + 0x0E09D480, 0x0E19CC81, 0x0E29D3C0, 0x0E39C8C1, 0x0E49D410, 0x0E59D411, 0x0E69C512, 0x0E79D450, + 0x0E89CC51, 0x0E99D490, 0x0EA9CC91, 0x0EB9D3D0, 0x0EC9C8D1, 0x0ED9D420, 0x0EE9D421, 0x0EF9C522, + 0x0F09D460, 0x0F19CC61, 0x0F29D4A0, 0x0F39CCA1, 0x0F49D3E0, 0x0F59C8E1, 0x0F69D4B0, 0x0F79CCB1, + 0x0F89D3F0, 0x0F99C8F1, 0x0FA61401, 0x0FB61402, 0x0FC61403, 0x0FD61404, 0x0FE61405, 0x0FF61406, + 0x10061407, 0x10161408, 0x10261409, 0x1036140A, 0x1046130B, 0x10561441, 0x10661442, 0x10761443, + 0x10861444, 0x10961445, 0x10A61446, 0x10B61447, 0x10C61348, 0x10D61049, 0x10E60E4A, 0x10F60B4B, + 0x11061481, 0x11161482, 0x11261483, 0x11361484, 0x11461485, 0x11561486, 0x11661487, 0x11761388, + 0x11861089, 0x11960E8A, 0x11A60B8B, 0x11B614C1, 0x11C614C2, 0x11D614C3, 0x11E614C4, 0x11F613C5, + 0x120611C6, 0x121610C7, 0x12260DC8, 0x12360BC9, 0x12460ACA, 0x125607CB, 0x126E1400, 0x127E1401, + 0x128E1402, 0x129E1403, 0x12AE0E04, 0x12BE0505, 0x12CE1440, 0x12DE1441, 0x12EE1242, 0x12FE0D43, + 0x130E0844, 0x131E1480, 0x132E1481, 0x133E1282, 0x134E0D83, 0x135E0884, 0x136E14C0, 0x137E11C1, + 0x138E0DC2, 0x139E08C3, 0x13AE05C4, 0x13BE1410, 0x13CE1411, 0x13DE1412, 0x13EE1413, 0x13FE0E14, + 0x140E0515, 0x141E1450, 0x142E1451, 0x143E1252, 0x144E0D53, 0x145E0854, 
0x146E1490, 0x147E1491, + 0x148E1292, 0x149E0D93, 0x14AE0894, 0x14BE14D0, 0x14CE11D1, 0x14DE0DD2, 0x14EE08D3, 0x14FE05D4, + 0x150E1420, 0x151E1421, 0x152E1422, 0x153E1423, 0x154E0E24, 0x155E0525, 0x156E1460, 0x157E1461, + 0x158E1262, 0x159E0D63, 0x15AE0864, 0x15BE14A0, 0x15CE14A1, 0x15DE12A2, 0x15EE0DA3, 0x15FE08A4, + 0x160E14E0, 0x161E11E1, 0x162E0DE2, 0x163E08E3, 0x164E05E4, 0x165E14B0, 0x166E14B1, 0x167E12B2, + 0x168E0DB3, 0x169E08B4, 0x16AE14F0, 0x16BE11F1, 0x16CE0DF2, 0x16DE08F3, 0x16EE05F4, 0x16F0D401, + 0x1700D402, 0x1710D403, 0x1720D404, 0x1730D405, 0x1740D406, 0x1750D407, 0x1760D408, 0x1770D409, + 0x1780D40A, 0x1790D30B, 0x17A0D441, 0x17B0D442, 0x17C0D443, 0x17D0D444, 0x17E0D445, 0x17F0D446, + 0x1800D447, 0x1810D348, 0x1820D049, 0x1830CE4A, 0x1840CB4B, 0x1850D481, 0x1860D482, 0x1870D483, + 0x1880D484, 0x1890D485, 0x18A0D486, 0x18B0D487, 0x18C0D388, 0x18D0D089, 0x18E0CE8A, 0x18F0CB8B, + 0x1900D4C1, 0x1910D4C2, 0x1920D4C3, 0x1930D4C4, 0x1940D3C5, 0x1950D1C6, 0x1960D0C7, 0x1970CDC8, + 0x1980CBC9, 0x1990CACA, 0x19A0C7CB, 0x19B8D400, 0x19C8D401, 0x19D8D402, 0x19E8D403, 0x19F8CE04, + 0x1A08C505, 0x1A18D440, 0x1A28D441, 0x1A38D242, 0x1A48CD43, 0x1A58C844, 0x1A68D480, 0x1A78D481, + 0x1A88D282, 0x1A98CD83, 0x1AA8C884, 0x1AB8D4C0, 0x1AC8D1C1, 0x1AD8CDC2, 0x1AE8C8C3, 0x1AF8C5C4, + 0x1B08D410, 0x1B18D411, 0x1B28D412, 0x1B38D413, 0x1B48CE14, 0x1B58C515, 0x1B68D450, 0x1B78D451, + 0x1B88D252, 0x1B98CD53, 0x1BA8C854, 0x1BB8D490, 0x1BC8D491, 0x1BD8D292, 0x1BE8CD93, 0x1BF8C894, + 0x1C08D4D0, 0x1C18D1D1, 0x1C28CDD2, 0x1C38C8D3, 0x1C48C5D4, 0x1C58D420, 0x1C68D421, 0x1C78D422, + 0x1C88D423, 0x1C98CE24, 0x1CA8C525, 0x1CB8D460, 0x1CC8D461, 0x1CD8D262, 0x1CE8CD63, 0x1CF8C864, + 0x1D08D4A0, 0x1D18D4A1, 0x1D28D2A2, 0x1D38CDA3, 0x1D48C8A4, 0x1D58D4E0, 0x1D68D1E1, 0x1D78CDE2, + 0x1D88C8E3, 0x1D98C5E4, 0x1DA8D4B0, 0x1DB8D4B1, 0x1DC8D2B2, 0x1DD8CDB3, 0x1DE8C8B4, 0x1DF8D4F0, + 0x1E08D1F1, 0x1E18CDF2, 0x1E28C8F3, 0x1E38C5F4, 0x1E449400, 0x1E549401, 0x1E649402, 0x1E749003, + 0x1E848804, 
0x1E949440, 0x1EA49441, 0x1EB48F42, 0x1EC48943, 0x1ED48444, 0x1EE49480, 0x1EF49481, + 0x1F048F82, 0x1F148983, 0x1F248484, 0x1F3494C0, 0x1F4490C1, 0x1F548AC2, 0x1F6486C3, 0x1F747400, + 0x1F847401, 0x1F947402, 0x1FA47403, 0x1FB47404, 0x1FC46B05, 0x1FD47440, 0x1FE47441, 0x1FF47442, + 0x20047043, 0x20146C44, 0x20246645, 0x20347480, 0x20447481, 0x20547482, 0x20647083, 0x20746C84, + 0x20846685, 0x209474C0, 0x20A473C1, 0x20B46FC2, 0x20C46BC3, 0x20D468C4, 0x20E464C5, 0x20FC7400, + 0x210C6501, 0x211C7440, 0x212C7480, 0x213C6EC0, 0x214C7410, 0x215C6511, 0x216C7450, 0x217C7490, + 0x218C6ED0, 0x219C7420, 0x21AC6521, 0x21BC7460, 0x21CC74A0, 0x21DC6EE0, 0x21EC74B0, 0x21FC6EF0, + 0x22039400, 0x22139401, 0x22239402, 0x22339403, 0x22439404, 0x22538B05, 0x22639440, 0x22739441, + 0x22839442, 0x22939043, 0x22A38C44, 0x22B38645, 0x22C39480, 0x22D39481, 0x22E39482, 0x22F39083, + 0x23038C84, 0x23138685, 0x232394C0, 0x233393C1, 0x23438FC2, 0x23538BC3, 0x236388C4, 0x237384C5, + 0x238B9400, 0x239B8501, 0x23AB9440, 0x23BB9480, 0x23CB8EC0, 0x23DB9410, 0x23EB8511, 0x23FB9450, + 0x240B9490, 0x241B8ED0, 0x242B9420, 0x243B8521, 0x244B9460, 0x245B94A0, 0x246B8EE0, 0x247B94B0, + 0x248B8EF0, 0x24937400, 0x24A37401, 0x24B37402, 0x24C37403, 0x24D37404, 0x24E37405, 0x24F37006, + 0x25036B07, 0x25137440, 0x25237441, 0x25337442, 0x25437443, 0x25537344, 0x25636E45, 0x25736946, + 0x25836647, 0x25937480, 0x25A37481, 0x25B37482, 0x25C37483, 0x25D37384, 0x25E36E85, 0x25F36986, + 0x26036687, 0x261374C0, 0x262374C1, 0x263373C2, 0x26436FC3, 0x26536DC4, 0x266369C5, 0x267366C6, + 0x268364C7, 0x269B7400, 0x26AB7101, 0x26BB7440, 0x26CB6A41, 0x26DB7480, 0x26EB6A81, 0x26FB72C0, + 0x270B67C1, 0x271B7410, 0x272B7111, 0x273B7450, 0x274B6A51, 0x275B7490, 0x276B6A91, 0x277B72D0, + 0x278B67D1, 0x279B7420, 0x27AB7121, 0x27BB7460, 0x27CB6A61, 0x27DB74A0, 0x27EB6AA1, 0x27FB72E0, + 0x280B67E1, 0x281B74B0, 0x282B6AB1, 0x283B72F0, 0x284B67F1, 0x28545400, 0x28645401, 0x28745402, + 0x28845403, 0x28945404, 0x28A45405, 0x28B45306, 
0x28C44E07, 0x28D44708, 0x28E45440, 0x28F45441, + 0x29045442, 0x29145443, 0x29245444, 0x29344F45, 0x29444B46, 0x29544847, 0x29645480, 0x29745481, + 0x29845482, 0x29945483, 0x29A45484, 0x29B44F85, 0x29C44B86, 0x29D44887, 0x29E454C0, 0x29F454C1, + 0x2A0453C2, 0x2A1450C3, 0x2A244EC4, 0x2A344AC5, 0x2A4447C6, 0x2A5445C7, 0x2A6C5400, 0x2A7C5401, + 0x2A8C4502, 0x2A9C5440, 0x2AAC4C41, 0x2ABC5480, 0x2ACC4C81, 0x2ADC53C0, 0x2AEC48C1, 0x2AFC5410, + 0x2B0C5411, 0x2B1C4512, 0x2B2C5450, 0x2B3C4C51, 0x2B4C5490, 0x2B5C4C91, 0x2B6C53D0, 0x2B7C48D1, + 0x2B8C5420, 0x2B9C5421, 0x2BAC4522, 0x2BBC5460, 0x2BCC4C61, 0x2BDC54A0, 0x2BEC4CA1, 0x2BFC53E0, + 0x2C0C48E1, 0x2C1C54B0, 0x2C2C4CB1, 0x2C3C53F0, 0x2C4C48F1, 0x2C529400, 0x2C629401, 0x2C729402, + 0x2C829403, 0x2C929404, 0x2CA29405, 0x2CB29306, 0x2CC28E07, 0x2CD28708, 0x2CE29440, 0x2CF29441, + 0x2D029442, 0x2D129443, 0x2D229444, 0x2D328F45, 0x2D428B46, 0x2D528847, 0x2D629480, 0x2D729481, + 0x2D829482, 0x2D929483, 0x2DA29484, 0x2DB28F85, 0x2DC28B86, 0x2DD28887, 0x2DE294C0, 0x2DF294C1, + 0x2E0293C2, 0x2E1290C3, 0x2E228EC4, 0x2E328AC5, 0x2E4287C6, 0x2E5285C7, 0x2E6A9400, 0x2E7A9401, + 0x2E8A8502, 0x2E9A9440, 0x2EAA8C41, 0x2EBA9480, 0x2ECA8C81, 0x2EDA93C0, 0x2EEA88C1, 0x2EFA9410, + 0x2F0A9411, 0x2F1A8512, 0x2F2A9450, 0x2F3A8C51, 0x2F4A9490, 0x2F5A8C91, 0x2F6A93D0, 0x2F7A88D1, + 0x2F8A9420, 0x2F9A9421, 0x2FAA8522, 0x2FBA9460, 0x2FCA8C61, 0x2FDA94A0, 0x2FEA8CA1, 0x2FFA93E0, + 0x300A88E1, 0x301A94B0, 0x302A8CB1, 0x303A93F0, 0x304A88F1, 0x30535401, 0x30635402, 0x30735403, + 0x30835404, 0x30935405, 0x30A35406, 0x30B35407, 0x30C35308, 0x30D34E09, 0x30E34A0A, 0x30F35441, + 0x31035442, 0x31135443, 0x31235444, 0x31335445, 0x31435246, 0x31534F47, 0x31634B48, 0x31734849, + 0x3183454A, 0x31935481, 0x31A35482, 0x31B35483, 0x31C35484, 0x31D35485, 0x31E35286, 0x31F34F87, + 0x32034B88, 0x32134889, 0x3223458A, 0x323354C1, 0x324354C2, 0x325354C3, 0x326352C4, 0x32734FC5, + 0x32834CC6, 0x32934AC7, 0x32A347C8, 0x32B345C9, 0x32CB5400, 0x32DB5401, 0x32EB5102, 
0x32FB4703, + 0x330B5440, 0x331B5241, 0x332B4A42, 0x333B5480, 0x334B5281, 0x335B4A82, 0x336B54C0, 0x337B4DC1, + 0x338B47C2, 0x339B5410, 0x33AB5411, 0x33BB5112, 0x33CB4713, 0x33DB5450, 0x33EB5251, 0x33FB4A52, + 0x340B5490, 0x341B5291, 0x342B4A92, 0x343B54D0, 0x344B4DD1, 0x345B47D2, 0x346B5420, 0x347B5421, + 0x348B5122, 0x349B4723, 0x34AB5460, 0x34BB5261, 0x34CB4A62, 0x34DB54A0, 0x34EB52A1, 0x34FB4AA2, + 0x350B54E0, 0x351B4DE1, 0x352B47E2, 0x353B54B0, 0x354B52B1, 0x355B4AB2, 0x356B54F0, 0x357B4DF1, + 0x358B47F2, 0x35927401, 0x35A27402, 0x35B27403, 0x35C27404, 0x35D27405, 0x35E27406, 0x35F27407, + 0x36027308, 0x36126E09, 0x36226A0A, 0x36327441, 0x36427442, 0x36527443, 0x36627444, 0x36727445, + 0x36827246, 0x36926F47, 0x36A26B48, 0x36B26849, 0x36C2654A, 0x36D27481, 0x36E27482, 0x36F27483, + 0x37027484, 0x37127485, 0x37227286, 0x37326F87, 0x37426B88, 0x37526889, 0x3762658A, 0x377274C1, + 0x378274C2, 0x379274C3, 0x37A272C4, 0x37B26FC5, 0x37C26CC6, 0x37D26AC7, 0x37E267C8, 0x37F265C9, + 0x380A7400, 0x381A7401, 0x382A7102, 0x383A6703, 0x384A7440, 0x385A7241, 0x386A6A42, 0x387A7480, + 0x388A7281, 0x389A6A82, 0x38AA74C0, 0x38BA6DC1, 0x38CA67C2, 0x38DA7410, 0x38EA7411, 0x38FA7112, + 0x390A6713, 0x391A7450, 0x392A7251, 0x393A6A52, 0x394A7490, 0x395A7291, 0x396A6A92, 0x397A74D0, + 0x398A6DD1, 0x399A67D2, 0x39AA7420, 0x39BA7421, 0x39CA7122, 0x39DA6723, 0x39EA7460, 0x39FA7261, + 0x3A0A6A62, 0x3A1A74A0, 0x3A2A72A1, 0x3A3A6AA2, 0x3A4A74E0, 0x3A5A6DE1, 0x3A6A67E2, 0x3A7A74B0, + 0x3A8A72B1, 0x3A9A6AB2, 0x3AAA74F0, 0x3ABA6DF1, 0x3ACA67F2, 0x3AD25401, 0x3AE25402, 0x3AF25403, + 0x3B025404, 0x3B125405, 0x3B225406, 0x3B325407, 0x3B425408, 0x3B525409, 0x3B62540A, 0x3B72530B, + 0x3B825441, 0x3B925442, 0x3BA25443, 0x3BB25444, 0x3BC25445, 0x3BD25446, 0x3BE25447, 0x3BF25348, + 0x3C025049, 0x3C124E4A, 0x3C224B4B, 0x3C325481, 0x3C425482, 0x3C525483, 0x3C625484, 0x3C725485, + 0x3C825486, 0x3C925487, 0x3CA25388, 0x3CB25089, 0x3CC24E8A, 0x3CD24B8B, 0x3CE254C1, 0x3CF254C2, + 0x3D0254C3, 0x3D1254C4, 
0x3D2253C5, 0x3D3251C6, 0x3D4250C7, 0x3D524DC8, 0x3D624BC9, 0x3D724ACA, + 0x3D8247CB, 0x3D9A5400, 0x3DAA5401, 0x3DBA5402, 0x3DCA5403, 0x3DDA4E04, 0x3DEA4505, 0x3DFA5440, + 0x3E0A5441, 0x3E1A5242, 0x3E2A4D43, 0x3E3A4844, 0x3E4A5480, 0x3E5A5481, 0x3E6A5282, 0x3E7A4D83, + 0x3E8A4884, 0x3E9A54C0, 0x3EAA51C1, 0x3EBA4DC2, 0x3ECA48C3, 0x3EDA45C4, 0x3EEA5410, 0x3EFA5411, + 0x3F0A5412, 0x3F1A5413, 0x3F2A4E14, 0x3F3A4515, 0x3F4A5450, 0x3F5A5451, 0x3F6A5252, 0x3F7A4D53, + 0x3F8A4854, 0x3F9A5490, 0x3FAA5491, 0x3FBA5292, 0x3FCA4D93, 0x3FDA4894, 0x3FEA54D0, 0x3FFA51D1, + 0x400A4DD2, 0x401A48D3, 0x402A45D4, 0x403A5420, 0x404A5421, 0x405A5422, 0x406A5423, 0x407A4E24, + 0x408A4525, 0x409A5460, 0x40AA5461, 0x40BA5262, 0x40CA4D63, 0x40DA4864, 0x40EA54A0, 0x40FA54A1, + 0x410A52A2, 0x411A4DA3, 0x412A48A4, 0x413A54E0, 0x414A51E1, 0x415A4DE2, 0x416A48E3, 0x417A45E4, + 0x418A54B0, 0x419A54B1, 0x41AA52B2, 0x41BA4DB3, 0x41CA48B4, 0x41DA54F0, 0x41EA51F1, 0x41FA4DF2, + 0x420A48F3, 0x421A45F4, 0x42243401, 0x42343402, 0x42443403, 0x42543404, 0x42643405, 0x42743406, + 0x42843407, 0x42943408, 0x42A43409, 0x42B4310A, 0x42C42B0B, 0x42D43441, 0x42E43442, 0x42F43443, + 0x43043444, 0x43143445, 0x43243446, 0x43343347, 0x43442F48, 0x43542C49, 0x43642A4A, 0x4374264B, + 0x43843481, 0x43943482, 0x43A43483, 0x43B43484, 0x43C43485, 0x43D43486, 0x43E43387, 0x43F42F88, + 0x44042C89, 0x44142A8A, 0x4424268B, 0x443434C1, 0x444434C2, 0x445434C3, 0x446434C4, 0x447431C5, + 0x44842FC6, 0x44942DC7, 0x44A42AC8, 0x44B428C9, 0x44C426CA, 0x44D424CB, 0x44EC3400, 0x44FC3401, + 0x450C3402, 0x451C2E03, 0x452C2704, 0x453C3440, 0x454C3441, 0x455C2E42, 0x456C2843, 0x457C3480, + 0x458C3481, 0x459C2E82, 0x45AC2883, 0x45BC34C0, 0x45CC2FC1, 0x45DC2AC2, 0x45EC25C3, 0x45FC3410, + 0x460C3411, 0x461C3412, 0x462C2E13, 0x463C2714, 0x464C3450, 0x465C3451, 0x466C2E52, 0x467C2853, + 0x468C3490, 0x469C3491, 0x46AC2E92, 0x46BC2893, 0x46CC34D0, 0x46DC2FD1, 0x46EC2AD2, 0x46FC25D3, + 0x470C3420, 0x471C3421, 0x472C3422, 0x473C2E23, 0x474C2724, 
0x475C3460, 0x476C3461, 0x477C2E62, + 0x478C2863, 0x479C34A0, 0x47AC34A1, 0x47BC2EA2, 0x47CC28A3, 0x47DC34E0, 0x47EC2FE1, 0x47FC2AE2, + 0x480C25E3, 0x481C34B0, 0x482C34B1, 0x483C2EB2, 0x484C28B3, 0x485C34F0, 0x486C2FF1, 0x487C2AF2, + 0x488C25F3, 0x48919401, 0x48A19402, 0x48B19403, 0x48C19404, 0x48D19405, 0x48E19406, 0x48F19407, + 0x49019408, 0x49119409, 0x4921910A, 0x49318B0B, 0x49419441, 0x49519442, 0x49619443, 0x49719444, + 0x49819445, 0x49919446, 0x49A19347, 0x49B18F48, 0x49C18C49, 0x49D18A4A, 0x49E1864B, 0x49F19481, + 0x4A019482, 0x4A119483, 0x4A219484, 0x4A319485, 0x4A419486, 0x4A519387, 0x4A618F88, 0x4A718C89, + 0x4A818A8A, 0x4A91868B, 0x4AA194C1, 0x4AB194C2, 0x4AC194C3, 0x4AD194C4, 0x4AE191C5, 0x4AF18FC6, + 0x4B018DC7, 0x4B118AC8, 0x4B2188C9, 0x4B3186CA, 0x4B4184CB, 0x4B599400, 0x4B699401, 0x4B799402, + 0x4B898E03, 0x4B998704, 0x4BA99440, 0x4BB99441, 0x4BC98E42, 0x4BD98843, 0x4BE99480, 0x4BF99481, + 0x4C098E82, 0x4C198883, 0x4C2994C0, 0x4C398FC1, 0x4C498AC2, 0x4C5985C3, 0x4C699410, 0x4C799411, + 0x4C899412, 0x4C998E13, 0x4CA98714, 0x4CB99450, 0x4CC99451, 0x4CD98E52, 0x4CE98853, 0x4CF99490, + 0x4D099491, 0x4D198E92, 0x4D298893, 0x4D3994D0, 0x4D498FD1, 0x4D598AD2, 0x4D6985D3, 0x4D799420, + 0x4D899421, 0x4D999422, 0x4DA98E23, 0x4DB98724, 0x4DC99460, 0x4DD99461, 0x4DE98E62, 0x4DF98863, + 0x4E0994A0, 0x4E1994A1, 0x4E298EA2, 0x4E3988A3, 0x4E4994E0, 0x4E598FE1, 0x4E698AE2, 0x4E7985E3, + 0x4E8994B0, 0x4E9994B1, 0x4EA98EB2, 0x4EB988B3, 0x4EC994F0, 0x4ED98FF1, 0x4EE98AF2, 0x4EF985F3, + 0x4F033401, 0x4F133402, 0x4F233403, 0x4F333404, 0x4F433405, 0x4F533406, 0x4F633407, 0x4F733408, + 0x4F833409, 0x4F93340A, 0x4FA3340B, 0x4FB33441, 0x4FC33442, 0x4FD33443, 0x4FE33444, 0x4FF33445, + 0x50033446, 0x50133447, 0x50233448, 0x50333349, 0x5043314A, 0x50532E4B, 0x50633481, 0x50733482, + 0x50833483, 0x50933484, 0x50A33485, 0x50B33486, 0x50C33487, 0x50D33488, 0x50E33389, 0x50F3318A, + 0x51032E8B, 0x511334C1, 0x512334C2, 0x513334C3, 0x514334C4, 0x515334C5, 0x516333C6, 0x517331C7, + 
0x51832FC8, 0x51932DC9, 0x51A32BCA, 0x51B329CB, 0x51CB3400, 0x51DB3401, 0x51EB3402, 0x51FB3403, + 0x520B3304, 0x521B2A05, 0x522B3440, 0x523B3441, 0x524B3442, 0x525B2F43, 0x526B2B44, 0x527B2545, + 0x528B3480, 0x529B3481, 0x52AB3482, 0x52BB2F83, 0x52CB2B84, 0x52DB2585, 0x52EB34C0, 0x52FB33C1, + 0x530B2EC2, 0x531B2AC3, 0x532B27C4, 0x533B3410, 0x534B3411, 0x535B3412, 0x536B3413, 0x537B3314, + 0x538B2A15, 0x539B3450, 0x53AB3451, 0x53BB3452, 0x53CB2F53, 0x53DB2B54, 0x53EB2555, 0x53FB3490, + 0x540B3491, 0x541B3492, 0x542B2F93, 0x543B2B94, 0x544B2595, 0x545B34D0, 0x546B33D1, 0x547B2ED2, + 0x548B2AD3, 0x549B27D4, 0x54AB3420, 0x54BB3421, 0x54CB3422, 0x54DB3423, 0x54EB3324, 0x54FB2A25, + 0x550B3460, 0x551B3461, 0x552B3462, 0x553B2F63, 0x554B2B64, 0x555B2565, 0x556B34A0, 0x557B34A1, + 0x558B34A2, 0x559B2FA3, 0x55AB2BA4, 0x55BB25A5, 0x55CB34E0, 0x55DB33E1, 0x55EB2EE2, 0x55FB2AE3, + 0x560B27E4, 0x561B34B0, 0x562B34B1, 0x563B34B2, 0x564B2FB3, 0x565B2BB4, 0x566B25B5, 0x567B34F0, + 0x568B33F1, 0x569B2EF2, 0x56AB2AF3, 0x56BB27F4, 0x56C17401, 0x56D17402, 0x56E17403, 0x56F17404, + 0x57017405, 0x57117406, 0x57217407, 0x57317408, 0x57417409, 0x5751740A, 0x5761740B, 0x57717441, + 0x57817442, 0x57917443, 0x57A17444, 0x57B17445, 0x57C17446, 0x57D17447, 0x57E17448, 0x57F17349, + 0x5801714A, 0x58116E4B, 0x58217481, 0x58317482, 0x58417483, 0x58517484, 0x58617485, 0x58717486, + 0x58817487, 0x58917488, 0x58A17389, 0x58B1718A, 0x58C16E8B, 0x58D174C1, 0x58E174C2, 0x58F174C3, + 0x590174C4, 0x591174C5, 0x592173C6, 0x593171C7, 0x59416FC8, 0x59516DC9, 0x59616BCA, 0x597169CB, + 0x59897400, 0x59997401, 0x59A97402, 0x59B97403, 0x59C97304, 0x59D96A05, 0x59E97440, 0x59F97441, + 0x5A097442, 0x5A196F43, 0x5A296B44, 0x5A396545, 0x5A497480, 0x5A597481, 0x5A697482, 0x5A796F83, + 0x5A896B84, 0x5A996585, 0x5AA974C0, 0x5AB973C1, 0x5AC96EC2, 0x5AD96AC3, 0x5AE967C4, 0x5AF97410, + 0x5B097411, 0x5B197412, 0x5B297413, 0x5B397314, 0x5B496A15, 0x5B597450, 0x5B697451, 0x5B797452, + 0x5B896F53, 0x5B996B54, 0x5BA96555, 
0x5BB97490, 0x5BC97491, 0x5BD97492, 0x5BE96F93, 0x5BF96B94, + 0x5C096595, 0x5C1974D0, 0x5C2973D1, 0x5C396ED2, 0x5C496AD3, 0x5C5967D4, 0x5C697420, 0x5C797421, + 0x5C897422, 0x5C997423, 0x5CA97324, 0x5CB96A25, 0x5CC97460, 0x5CD97461, 0x5CE97462, 0x5CF96F63, + 0x5D096B64, 0x5D196565, 0x5D2974A0, 0x5D3974A1, 0x5D4974A2, 0x5D596FA3, 0x5D696BA4, 0x5D7965A5, + 0x5D8974E0, 0x5D9973E1, 0x5DA96EE2, 0x5DB96AE3, 0x5DC967E4, 0x5DD974B0, 0x5DE974B1, 0x5DF974B2, + 0x5E096FB3, 0x5E196BB4, 0x5E2965B5, 0x5E3974F0, 0x5E4973F1, 0x5E596EF2, 0x5E696AF3, 0x5E7967F4, + 0x5E823402, 0x5E923403, 0x5EA23404, 0x5EB23405, 0x5EC23406, 0x5ED23407, 0x5EE23408, 0x5EF23409, + 0x5F02340A, 0x5F12340B, 0x5F223442, 0x5F323443, 0x5F423444, 0x5F523445, 0x5F623446, 0x5F723447, + 0x5F823448, 0x5F923449, 0x5FA2344A, 0x5FB2344B, 0x5FC23482, 0x5FD23483, 0x5FE23484, 0x5FF23485, + 0x60023486, 0x60123487, 0x60223488, 0x60323489, 0x6042348A, 0x6052348B, 0x606234C2, 0x607234C3, + 0x608234C4, 0x609234C5, 0x60A234C6, 0x60B234C7, 0x60C233C8, 0x60D232C9, 0x60E230CA, 0x60F22FCB, + 0x610A3400, 0x611A3401, 0x612A3402, 0x613A3403, 0x614A3404, 0x615A3405, 0x616A3106, 0x617A2C07, + 0x618A2508, 0x619A3440, 0x61AA3441, 0x61BA3442, 0x61CA3443, 0x61DA3344, 0x61EA2E45, 0x61FA2A46, + 0x620A2747, 0x621A3480, 0x622A3481, 0x623A3482, 0x624A3483, 0x625A3384, 0x626A2E85, 0x627A2A86, + 0x628A2787, 0x629A34C0, 0x62AA34C1, 0x62BA33C2, 0x62CA30C3, 0x62DA2DC4, 0x62EA2AC5, 0x62FA27C6, + 0x630A24C7, 0x631A3410, 0x632A3411, 0x633A3412, 0x634A3413, 0x635A3414, 0x636A3415, 0x637A3116, + 0x638A2C17, 0x639A2518, 0x63AA3450, 0x63BA3451, 0x63CA3452, 0x63DA3453, 0x63EA3354, 0x63FA2E55, + 0x640A2A56, 0x641A2757, 0x642A3490, 0x643A3491, 0x644A3492, 0x645A3493, 0x646A3394, 0x647A2E95, + 0x648A2A96, 0x649A2797, 0x64AA34D0, 0x64BA34D1, 0x64CA33D2, 0x64DA30D3, 0x64EA2DD4, 0x64FA2AD5, + 0x650A27D6, 0x651A24D7, 0x652A3420, 0x653A3421, 0x654A3422, 0x655A3423, 0x656A3424, 0x657A3425, + 0x658A3126, 0x659A2C27, 0x65AA2528, 0x65BA3460, 0x65CA3461, 0x65DA3462, 
0x65EA3463, 0x65FA3364, + 0x660A2E65, 0x661A2A66, 0x662A2767, 0x663A34A0, 0x664A34A1, 0x665A34A2, 0x666A34A3, 0x667A33A4, + 0x668A2EA5, 0x669A2AA6, 0x66AA27A7, 0x66BA34E0, 0x66CA34E1, 0x66DA33E2, 0x66EA30E3, 0x66FA2DE4, + 0x670A2AE5, 0x671A27E6, 0x672A24E7, 0x673A34B0, 0x674A34B1, 0x675A34B2, 0x676A34B3, 0x677A33B4, + 0x678A2EB5, 0x679A2AB6, 0x67AA27B7, 0x67BA34F0, 0x67CA34F1, 0x67DA33F2, 0x67EA30F3, 0x67FA2DF4, + 0x680A2AF5, 0x681A27F6, 0x682A24F7, 0x68315402, 0x68415403, 0x68515404, 0x68615405, 0x68715406, + 0x68815407, 0x68915408, 0x68A15409, 0x68B1540A, 0x68C1540B, 0x68D15442, 0x68E15443, 0x68F15444, + 0x69015445, 0x69115446, 0x69215447, 0x69315448, 0x69415449, 0x6951544A, 0x6961544B, 0x69715482, + 0x69815483, 0x69915484, 0x69A15485, 0x69B15486, 0x69C15487, 0x69D15488, 0x69E15489, 0x69F1548A, + 0x6A01548B, 0x6A1154C2, 0x6A2154C3, 0x6A3154C4, 0x6A4154C5, 0x6A5154C6, 0x6A6154C7, 0x6A7153C8, + 0x6A8152C9, 0x6A9150CA, 0x6AA14FCB, 0x6AB95400, 0x6AC95401, 0x6AD95402, 0x6AE95403, 0x6AF95404, + 0x6B095405, 0x6B195106, 0x6B294C07, 0x6B394508, 0x6B495440, 0x6B595441, 0x6B695442, 0x6B795443, + 0x6B895344, 0x6B994E45, 0x6BA94A46, 0x6BB94747, 0x6BC95480, 0x6BD95481, 0x6BE95482, 0x6BF95483, + 0x6C095384, 0x6C194E85, 0x6C294A86, 0x6C394787, 0x6C4954C0, 0x6C5954C1, 0x6C6953C2, 0x6C7950C3, + 0x6C894DC4, 0x6C994AC5, 0x6CA947C6, 0x6CB944C7, 0x6CC95410, 0x6CD95411, 0x6CE95412, 0x6CF95413, + 0x6D095414, 0x6D195415, 0x6D295116, 0x6D394C17, 0x6D494518, 0x6D595450, 0x6D695451, 0x6D795452, + 0x6D895453, 0x6D995354, 0x6DA94E55, 0x6DB94A56, 0x6DC94757, 0x6DD95490, 0x6DE95491, 0x6DF95492, + 0x6E095493, 0x6E195394, 0x6E294E95, 0x6E394A96, 0x6E494797, 0x6E5954D0, 0x6E6954D1, 0x6E7953D2, + 0x6E8950D3, 0x6E994DD4, 0x6EA94AD5, 0x6EB947D6, 0x6EC944D7, 0x6ED95420, 0x6EE95421, 0x6EF95422, + 0x6F095423, 0x6F195424, 0x6F295425, 0x6F395126, 0x6F494C27, 0x6F594528, 0x6F695460, 0x6F795461, + 0x6F895462, 0x6F995463, 0x6FA95364, 0x6FB94E65, 0x6FC94A66, 0x6FD94767, 0x6FE954A0, 0x6FF954A1, + 0x700954A2, 
0x701954A3, 0x702953A4, 0x70394EA5, 0x70494AA6, 0x705947A7, 0x706954E0, 0x707954E1, + 0x708953E2, 0x709950E3, 0x70A94DE4, 0x70B94AE5, 0x70C947E6, 0x70D944E7, 0x70E954B0, 0x70F954B1, + 0x710954B2, 0x711954B3, 0x712953B4, 0x71394EB5, 0x71494AB6, 0x715947B7, 0x716954F0, 0x717954F1, + 0x718953F2, 0x719950F3, 0x71A94DF4, 0x71B94AF5, 0x71C947F6, 0x71D944F7, 0x71E13404, 0x71F13405, + 0x72013406, 0x72113407, 0x72213408, 0x72313409, 0x7241340A, 0x7251340B, 0x72613444, 0x72713445, + 0x72813446, 0x72913447, 0x72A13448, 0x72B13449, 0x72C1344A, 0x72D1344B, 0x72E13484, 0x72F13485, + 0x73013486, 0x73113487, 0x73213488, 0x73313489, 0x7341348A, 0x7351348B, 0x736134C4, 0x737134C5, + 0x738134C6, 0x739134C7, 0x73A134C8, 0x73B134C9, 0x73C134CA, 0x73D134CB, 0x73E93401, 0x73F93402, + 0x74093403, 0x74193404, 0x74293405, 0x74393406, 0x74493407, 0x74593408, 0x74693309, 0x74792F0A, + 0x74892A0B, 0x74993441, 0x74A93442, 0x74B93443, 0x74C93444, 0x74D93445, 0x74E93446, 0x74F93247, + 0x75092E48, 0x75192B49, 0x7529294A, 0x7539254B, 0x75493481, 0x75593482, 0x75693483, 0x75793484, + 0x75893485, 0x75993486, 0x75A93287, 0x75B92E88, 0x75C92B89, 0x75D9298A, 0x75E9258B, 0x75F934C1, + 0x760934C2, 0x761934C3, 0x762933C4, 0x763930C5, 0x76492EC6, 0x76592CC7, 0x76692AC8, 0x767927C9, + 0x768925CA, 0x76993411, 0x76A93412, 0x76B93413, 0x76C93414, 0x76D93415, 0x76E93416, 0x76F93417, + 0x77093418, 0x77193319, 0x77292F1A, 0x77392A1B, 0x77493451, 0x77593452, 0x77693453, 0x77793454, + 0x77893455, 0x77993456, 0x77A93257, 0x77B92E58, 0x77C92B59, 0x77D9295A, 0x77E9255B, 0x77F93491, + 0x78093492, 0x78193493, 0x78293494, 0x78393495, 0x78493496, 0x78593297, 0x78692E98, 0x78792B99, + 0x7889299A, 0x7899259B, 0x78A934D1, 0x78B934D2, 0x78C934D3, 0x78D933D4, 0x78E930D5, 0x78F92ED6, + 0x79092CD7, 0x79192AD8, 0x792927D9, 0x793925DA, 0x79493421, 0x79593422, 0x79693423, 0x79793424, + 0x79893425, 0x79993426, 0x79A93427, 0x79B93428, 0x79C93329, 0x79D92F2A, 0x79E92A2B, 0x79F93461, + 0x7A093462, 0x7A193463, 0x7A293464, 0x7A393465, 
0x7A493466, 0x7A593267, 0x7A692E68, 0x7A792B69, + 0x7A89296A, 0x7A99256B, 0x7AA934A1, 0x7AB934A2, 0x7AC934A3, 0x7AD934A4, 0x7AE934A5, 0x7AF934A6, + 0x7B0932A7, 0x7B192EA8, 0x7B292BA9, 0x7B3929AA, 0x7B4925AB, 0x7B5934E1, 0x7B6934E2, 0x7B7934E3, + 0x7B8933E4, 0x7B9930E5, 0x7BA92EE6, 0x7BB92CE7, 0x7BC92AE8, 0x7BD927E9, 0x7BE925EA, 0x7BF934B1, + 0x7C0934B2, 0x7C1934B3, 0x7C2934B4, 0x7C3934B5, 0x7C4934B6, 0x7C5932B7, 0x7C692EB8, 0x7C792BB9, + 0x7C8929BA, 0x7C9925BB, 0x7CA934F1, 0x7CB934F2, 0x7CC934F3, 0x7CD933F4, 0x7CE930F5, 0x7CF92EF6, + 0x7D092CF7, 0x7D192AF8, 0x7D2927F9, 0x7D3925FA, 0x7D441402, 0x7D541403, 0x7D641404, 0x7D741405, + 0x7D841406, 0x7D941407, 0x7DA41408, 0x7DB41409, 0x7DC4140A, 0x7DD4140B, 0x7DE41442, 0x7DF41443, + 0x7E041444, 0x7E141445, 0x7E241446, 0x7E341447, 0x7E441448, 0x7E541449, 0x7E64144A, 0x7E74144B, + 0x7E841482, 0x7E941483, 0x7EA41484, 0x7EB41485, 0x7EC41486, 0x7ED41487, 0x7EE41488, 0x7EF41489, + 0x7F04148A, 0x7F14148B, 0x7F2414C2, 0x7F3414C3, 0x7F4414C4, 0x7F5414C5, 0x7F6414C6, 0x7F7414C7, + 0x7F8413C8, 0x7F9412C9, 0x7FA410CA, 0x7FB40FCB, 0x7FCC1400, 0x7FDC1401, 0x7FEC1402, 0x7FFC1403, + 0x800C1404, 0x801C1405, 0x802C1106, 0x803C0C07, 0x804C0508, 0x805C1440, 0x806C1441, 0x807C1442, + 0x808C1443, 0x809C1344, 0x80AC0E45, 0x80BC0A46, 0x80CC0747, 0x80DC1480, 0x80EC1481, 0x80FC1482, + 0x810C1483, 0x811C1384, 0x812C0E85, 0x813C0A86, 0x814C0787, 0x815C14C0, 0x816C14C1, 0x817C13C2, + 0x818C10C3, 0x819C0DC4, 0x81AC0AC5, 0x81BC07C6, 0x81CC04C7, 0x81DC1410, 0x81EC1411, 0x81FC1412, + 0x820C1413, 0x821C1414, 0x822C1415, 0x823C1116, 0x824C0C17, 0x825C0518, 0x826C1450, 0x827C1451, + 0x828C1452, 0x829C1453, 0x82AC1354, 0x82BC0E55, 0x82CC0A56, 0x82DC0757, 0x82EC1490, 0x82FC1491, + 0x830C1492, 0x831C1493, 0x832C1394, 0x833C0E95, 0x834C0A96, 0x835C0797, 0x836C14D0, 0x837C14D1, + 0x838C13D2, 0x839C10D3, 0x83AC0DD4, 0x83BC0AD5, 0x83CC07D6, 0x83DC04D7, 0x83EC1420, 0x83FC1421, + 0x840C1422, 0x841C1423, 0x842C1424, 0x843C1425, 0x844C1126, 0x845C0C27, 0x846C0528, 
0x847C1460, + 0x848C1461, 0x849C1462, 0x84AC1463, 0x84BC1364, 0x84CC0E65, 0x84DC0A66, 0x84EC0767, 0x84FC14A0, + 0x850C14A1, 0x851C14A2, 0x852C14A3, 0x853C13A4, 0x854C0EA5, 0x855C0AA6, 0x856C07A7, 0x857C14E0, + 0x858C14E1, 0x859C13E2, 0x85AC10E3, 0x85BC0DE4, 0x85CC0AE5, 0x85DC07E6, 0x85EC04E7, 0x85FC14B0, + 0x860C14B1, 0x861C14B2, 0x862C14B3, 0x863C13B4, 0x864C0EB5, 0x865C0AB6, 0x866C07B7, 0x867C14F0, + 0x868C14F1, 0x869C13F2, 0x86AC10F3, 0x86BC0DF4, 0x86CC0AF5, 0x86DC07F6, 0x86EC04F7, 0x86F09402, + 0x87009403, 0x87109404, 0x87209405, 0x87309406, 0x87409407, 0x87509408, 0x87609409, 0x8770940A, + 0x8780940B, 0x87909442, 0x87A09443, 0x87B09444, 0x87C09445, 0x87D09446, 0x87E09447, 0x87F09448, + 0x88009449, 0x8810944A, 0x8820944B, 0x88309482, 0x88409483, 0x88509484, 0x88609485, 0x88709486, + 0x88809487, 0x88909488, 0x88A09489, 0x88B0948A, 0x88C0948B, 0x88D094C2, 0x88E094C3, 0x88F094C4, + 0x890094C5, 0x891094C6, 0x892094C7, 0x893093C8, 0x894092C9, 0x895090CA, 0x89608FCB, 0x89789400, + 0x89889401, 0x89989402, 0x89A89403, 0x89B89404, 0x89C89405, 0x89D89106, 0x89E88C07, 0x89F88508, + 0x8A089440, 0x8A189441, 0x8A289442, 0x8A389443, 0x8A489344, 0x8A588E45, 0x8A688A46, 0x8A788747, + 0x8A889480, 0x8A989481, 0x8AA89482, 0x8AB89483, 0x8AC89384, 0x8AD88E85, 0x8AE88A86, 0x8AF88787, + 0x8B0894C0, 0x8B1894C1, 0x8B2893C2, 0x8B3890C3, 0x8B488DC4, 0x8B588AC5, 0x8B6887C6, 0x8B7884C7, + 0x8B889410, 0x8B989411, 0x8BA89412, 0x8BB89413, 0x8BC89414, 0x8BD89415, 0x8BE89116, 0x8BF88C17, + 0x8C088518, 0x8C189450, 0x8C289451, 0x8C389452, 0x8C489453, 0x8C589354, 0x8C688E55, 0x8C788A56, + 0x8C888757, 0x8C989490, 0x8CA89491, 0x8CB89492, 0x8CC89493, 0x8CD89394, 0x8CE88E95, 0x8CF88A96, + 0x8D088797, 0x8D1894D0, 0x8D2894D1, 0x8D3893D2, 0x8D4890D3, 0x8D588DD4, 0x8D688AD5, 0x8D7887D6, + 0x8D8884D7, 0x8D989420, 0x8DA89421, 0x8DB89422, 0x8DC89423, 0x8DD89424, 0x8DE89425, 0x8DF89126, + 0x8E088C27, 0x8E188528, 0x8E289460, 0x8E389461, 0x8E489462, 0x8E589463, 0x8E689364, 0x8E788E65, + 0x8E888A66, 0x8E988767, 
0x8EA894A0, 0x8EB894A1, 0x8EC894A2, 0x8ED894A3, 0x8EE893A4, 0x8EF88EA5, + 0x8F088AA6, 0x8F1887A7, 0x8F2894E0, 0x8F3894E1, 0x8F4893E2, 0x8F5890E3, 0x8F688DE4, 0x8F788AE5, + 0x8F8887E6, 0x8F9884E7, 0x8FA894B0, 0x8FB894B1, 0x8FC894B2, 0x8FD894B3, 0x8FE893B4, 0x8FF88EB5, + 0x90088AB6, 0x901887B7, 0x902894F0, 0x903894F1, 0x904893F2, 0x905890F3, 0x90688DF4, 0x90788AF5, + 0x908887F6, 0x909884F7, 0x90A31403, 0x90B31404, 0x90C31405, 0x90D31406, 0x90E31407, 0x90F31408, + 0x91031409, 0x9113140A, 0x9123140B, 0x91331443, 0x91431444, 0x91531445, 0x91631446, 0x91731447, + 0x91831448, 0x91931449, 0x91A3144A, 0x91B3144B, 0x91C31483, 0x91D31484, 0x91E31485, 0x91F31486, + 0x92031487, 0x92131488, 0x92231489, 0x9233148A, 0x9243148B, 0x925314C3, 0x926314C4, 0x927314C5, + 0x928314C6, 0x929314C7, 0x92A314C8, 0x92B314C9, 0x92C314CA, 0x92D313CB, 0x92EB1401, 0x92FB1402, + 0x930B1403, 0x931B1404, 0x932B1405, 0x933B1406, 0x934B1407, 0x935B1108, 0x936B0C09, 0x937B080A, + 0x938B1441, 0x939B1442, 0x93AB1443, 0x93BB1444, 0x93CB1445, 0x93DB1146, 0x93EB0E47, 0x93FB0A48, + 0x940B0749, 0x941B044A, 0x942B1481, 0x943B1482, 0x944B1483, 0x945B1484, 0x946B1485, 0x947B1186, + 0x948B0E87, 0x949B0A88, 0x94AB0789, 0x94BB048A, 0x94CB14C1, 0x94DB14C2, 0x94EB13C3, 0x94FB11C4, + 0x950B0EC5, 0x951B0BC6, 0x952B0AC7, 0x953B07C8, 0x954B04C9, 0x955B1411, 0x956B1412, 0x957B1413, + 0x958B1414, 0x959B1415, 0x95AB1416, 0x95BB1417, 0x95CB1118, 0x95DB0C19, 0x95EB081A, 0x95FB1451, + 0x960B1452, 0x961B1453, 0x962B1454, 0x963B1455, 0x964B1156, 0x965B0E57, 0x966B0A58, 0x967B0759, + 0x968B045A, 0x969B1491, 0x96AB1492, 0x96BB1493, 0x96CB1494, 0x96DB1495, 0x96EB1196, 0x96FB0E97, + 0x970B0A98, 0x971B0799, 0x972B049A, 0x973B14D1, 0x974B14D2, 0x975B13D3, 0x976B11D4, 0x977B0ED5, + 0x978B0BD6, 0x979B0AD7, 0x97AB07D8, 0x97BB04D9, 0x97CB1421, 0x97DB1422, 0x97EB1423, 0x97FB1424, + 0x980B1425, 0x981B1426, 0x982B1427, 0x983B1128, 0x984B0C29, 0x985B082A, 0x986B1461, 0x987B1462, + 0x988B1463, 0x989B1464, 0x98AB1465, 0x98BB1166, 0x98CB0E67, 
0x98DB0A68, 0x98EB0769, 0x98FB046A, + 0x990B14A1, 0x991B14A2, 0x992B14A3, 0x993B14A4, 0x994B14A5, 0x995B11A6, 0x996B0EA7, 0x997B0AA8, + 0x998B07A9, 0x999B04AA, 0x99AB14E1, 0x99BB14E2, 0x99CB13E3, 0x99DB11E4, 0x99EB0EE5, 0x99FB0BE6, + 0x9A0B0AE7, 0x9A1B07E8, 0x9A2B04E9, 0x9A3B14B1, 0x9A4B14B2, 0x9A5B14B3, 0x9A6B14B4, 0x9A7B14B5, + 0x9A8B11B6, 0x9A9B0EB7, 0x9AAB0AB8, 0x9ABB07B9, 0x9ACB04BA, 0x9ADB14F1, 0x9AEB14F2, 0x9AFB13F3, + 0x9B0B11F4, 0x9B1B0EF5, 0x9B2B0BF6, 0x9B3B0AF7, 0x9B4B07F8, 0x9B5B04F9, 0x9B607403, 0x9B707404, + 0x9B807405, 0x9B907406, 0x9BA07407, 0x9BB07408, 0x9BC07409, 0x9BD0740A, 0x9BE0740B, 0x9BF07443, + 0x9C007444, 0x9C107445, 0x9C207446, 0x9C307447, 0x9C407448, 0x9C507449, 0x9C60744A, 0x9C70744B, + 0x9C807483, 0x9C907484, 0x9CA07485, 0x9CB07486, 0x9CC07487, 0x9CD07488, 0x9CE07489, 0x9CF0748A, + 0x9D00748B, 0x9D1074C3, 0x9D2074C4, 0x9D3074C5, 0x9D4074C6, 0x9D5074C7, 0x9D6074C8, 0x9D7074C9, + 0x9D8074CA, 0x9D9073CB, 0x9DA87401, 0x9DB87402, 0x9DC87403, 0x9DD87404, 0x9DE87405, 0x9DF87406, + 0x9E087407, 0x9E187108, 0x9E286C09, 0x9E38680A, 0x9E487441, 0x9E587442, 0x9E687443, 0x9E787444, + 0x9E887445, 0x9E987146, 0x9EA86E47, 0x9EB86A48, 0x9EC86749, 0x9ED8644A, 0x9EE87481, 0x9EF87482, + 0x9F087483, 0x9F187484, 0x9F287485, 0x9F387186, 0x9F486E87, 0x9F586A88, 0x9F686789, 0x9F78648A, + 0x9F8874C1, 0x9F9874C2, 0x9FA873C3, 0x9FB871C4, 0x9FC86EC5, 0x9FD86BC6, 0x9FE86AC7, 0x9FF867C8, + 0xA00864C9, 0xA0187411, 0xA0287412, 0xA0387413, 0xA0487414, 0xA0587415, 0xA0687416, 0xA0787417, + 0xA0887118, 0xA0986C19, 0xA0A8681A, 0xA0B87451, 0xA0C87452, 0xA0D87453, 0xA0E87454, 0xA0F87455, + 0xA1087156, 0xA1186E57, 0xA1286A58, 0xA1386759, 0xA148645A, 0xA1587491, 0xA1687492, 0xA1787493, + 0xA1887494, 0xA1987495, 0xA1A87196, 0xA1B86E97, 0xA1C86A98, 0xA1D86799, 0xA1E8649A, 0xA1F874D1, + 0xA20874D2, 0xA21873D3, 0xA22871D4, 0xA2386ED5, 0xA2486BD6, 0xA2586AD7, 0xA26867D8, 0xA27864D9, + 0xA2887421, 0xA2987422, 0xA2A87423, 0xA2B87424, 0xA2C87425, 0xA2D87426, 0xA2E87427, 0xA2F87128, + 
0xA3086C29, 0xA318682A, 0xA3287461, 0xA3387462, 0xA3487463, 0xA3587464, 0xA3687465, 0xA3787166, + 0xA3886E67, 0xA3986A68, 0xA3A86769, 0xA3B8646A, 0xA3C874A1, 0xA3D874A2, 0xA3E874A3, 0xA3F874A4, + 0xA40874A5, 0xA41871A6, 0xA4286EA7, 0xA4386AA8, 0xA44867A9, 0xA45864AA, 0xA46874E1, 0xA47874E2, + 0xA48873E3, 0xA49871E4, 0xA4A86EE5, 0xA4B86BE6, 0xA4C86AE7, 0xA4D867E8, 0xA4E864E9, 0xA4F874B1, + 0xA50874B2, 0xA51874B3, 0xA52874B4, 0xA53874B5, 0xA54871B6, 0xA5586EB7, 0xA5686AB8, 0xA57867B9, + 0xA58864BA, 0xA59874F1, 0xA5A874F2, 0xA5B873F3, 0xA5C871F4, 0xA5D86EF5, 0xA5E86BF6, 0xA5F86AF7, + 0xA60867F8, 0xA61864F9, 0xA6221405, 0xA6321406, 0xA6421407, 0xA6521408, 0xA6621409, 0xA672140A, + 0xA682140B, 0xA6921445, 0xA6A21446, 0xA6B21447, 0xA6C21448, 0xA6D21449, 0xA6E2144A, 0xA6F2144B, + 0xA7021485, 0xA7121486, 0xA7221487, 0xA7321488, 0xA7421489, 0xA752148A, 0xA762148B, 0xA77214C5, + 0xA78214C6, 0xA79214C7, 0xA7A214C8, 0xA7B214C9, 0xA7C214CA, 0xA7D214CB, 0xA7EA1401, 0xA7FA1402, + 0xA80A1403, 0xA81A1404, 0xA82A1405, 0xA83A1406, 0xA84A1407, 0xA85A1408, 0xA86A1409, 0xA87A140A, + 0xA88A110B, 0xA89A1441, 0xA8AA1442, 0xA8BA1443, 0xA8CA1444, 0xA8DA1445, 0xA8EA1446, 0xA8FA1447, + 0xA90A1248, 0xA91A0F49, 0xA92A0D4A, 0xA93A0A4B, 0xA94A1481, 0xA95A1482, 0xA96A1483, 0xA97A1484, + 0xA98A1485, 0xA99A1486, 0xA9AA1487, 0xA9BA1288, 0xA9CA0F89, 0xA9DA0D8A, 0xA9EA0A8B, 0xA9FA14C1, + 0xAA0A14C2, 0xAA1A14C3, 0xAA2A14C4, 0xAA3A13C5, 0xAA4A10C6, 0xAA5A0FC7, 0xAA6A0DC8, 0xAA7A0AC9, + 0xAA8A09CA, 0xAA9A07CB, 0xAAAA1411, 0xAABA1412, 0xAACA1413, 0xAADA1414, 0xAAEA1415, 0xAAFA1416, + 0xAB0A1417, 0xAB1A1418, 0xAB2A1419, 0xAB3A141A, 0xAB4A111B, 0xAB5A1451, 0xAB6A1452, 0xAB7A1453, + 0xAB8A1454, 0xAB9A1455, 0xABAA1456, 0xABBA1457, 0xABCA1258, 0xABDA0F59, 0xABEA0D5A, 0xABFA0A5B, + 0xAC0A1491, 0xAC1A1492, 0xAC2A1493, 0xAC3A1494, 0xAC4A1495, 0xAC5A1496, 0xAC6A1497, 0xAC7A1298, + 0xAC8A0F99, 0xAC9A0D9A, 0xACAA0A9B, 0xACBA14D1, 0xACCA14D2, 0xACDA14D3, 0xACEA14D4, 0xACFA13D5, + 0xAD0A10D6, 0xAD1A0FD7, 0xAD2A0DD8, 
0xAD3A0AD9, 0xAD4A09DA, 0xAD5A07DB, 0xAD6A1421, 0xAD7A1422, + 0xAD8A1423, 0xAD9A1424, 0xADAA1425, 0xADBA1426, 0xADCA1427, 0xADDA1428, 0xADEA1429, 0xADFA142A, + 0xAE0A112B, 0xAE1A1461, 0xAE2A1462, 0xAE3A1463, 0xAE4A1464, 0xAE5A1465, 0xAE6A1466, 0xAE7A1467, + 0xAE8A1268, 0xAE9A0F69, 0xAEAA0D6A, 0xAEBA0A6B, 0xAECA14A1, 0xAEDA14A2, 0xAEEA14A3, 0xAEFA14A4, + 0xAF0A14A5, 0xAF1A14A6, 0xAF2A14A7, 0xAF3A12A8, 0xAF4A0FA9, 0xAF5A0DAA, 0xAF6A0AAB, 0xAF7A14E1, + 0xAF8A14E2, 0xAF9A14E3, 0xAFAA14E4, 0xAFBA13E5, 0xAFCA10E6, 0xAFDA0FE7, 0xAFEA0DE8, 0xAFFA0AE9, + 0xB00A09EA, 0xB01A07EB, 0xB02A14B1, 0xB03A14B2, 0xB04A14B3, 0xB05A14B4, 0xB06A14B5, 0xB07A14B6, + 0xB08A14B7, 0xB09A12B8, 0xB0AA0FB9, 0xB0BA0DBA, 0xB0CA0ABB, 0xB0DA14F1, 0xB0EA14F2, 0xB0FA14F3, + 0xB10A14F4, 0xB11A13F5, 0xB12A10F6, 0xB13A0FF7, 0xB14A0DF8, 0xB15A0AF9, 0xB16A09FA, 0xB17A07FB, + 0xB1805405, 0xB1905406, 0xB1A05407, 0xB1B05408, 0xB1C05409, 0xB1D0540A, 0xB1E0540B, 0xB1F05445, + 0xB2005446, 0xB2105447, 0xB2205448, 0xB2305449, 0xB240544A, 0xB250544B, 0xB2605485, 0xB2705486, + 0xB2805487, 0xB2905488, 0xB2A05489, 0xB2B0548A, 0xB2C0548B, 0xB2D054C5, 0xB2E054C6, 0xB2F054C7, + 0xB30054C8, 0xB31054C9, 0xB32054CA, 0xB33054CB, 0xB3485401, 0xB3585402, 0xB3685403, 0xB3785404, + 0xB3885405, 0xB3985406, 0xB3A85407, 0xB3B85408, 0xB3C85409, 0xB3D8540A, 0xB3E8510B, 0xB3F85441, + 0xB4085442, 0xB4185443, 0xB4285444, 0xB4385445, 0xB4485446, 0xB4585447, 0xB4685248, 0xB4784F49, + 0xB4884D4A, 0xB4984A4B, 0xB4A85481, 0xB4B85482, 0xB4C85483, 0xB4D85484, 0xB4E85485, 0xB4F85486, + 0xB5085487, 0xB5185288, 0xB5284F89, 0xB5384D8A, 0xB5484A8B, 0xB55854C1, 0xB56854C2, 0xB57854C3, + 0xB58854C4, 0xB59853C5, 0xB5A850C6, 0xB5B84FC7, 0xB5C84DC8, 0xB5D84AC9, 0xB5E849CA, 0xB5F847CB, + 0xB6085411, 0xB6185412, 0xB6285413, 0xB6385414, 0xB6485415, 0xB6585416, 0xB6685417, 0xB6785418, + 0xB6885419, 0xB698541A, 0xB6A8511B, 0xB6B85451, 0xB6C85452, 0xB6D85453, 0xB6E85454, 0xB6F85455, + 0xB7085456, 0xB7185457, 0xB7285258, 0xB7384F59, 0xB7484D5A, 0xB7584A5B, 
0xB7685491, 0xB7785492, + 0xB7885493, 0xB7985494, 0xB7A85495, 0xB7B85496, 0xB7C85497, 0xB7D85298, 0xB7E84F99, 0xB7F84D9A, + 0xB8084A9B, 0xB81854D1, 0xB82854D2, 0xB83854D3, 0xB84854D4, 0xB85853D5, 0xB86850D6, 0xB8784FD7, + 0xB8884DD8, 0xB8984AD9, 0xB8A849DA, 0xB8B847DB, 0xB8C85421, 0xB8D85422, 0xB8E85423, 0xB8F85424, + 0xB9085425, 0xB9185426, 0xB9285427, 0xB9385428, 0xB9485429, 0xB958542A, 0xB968512B, 0xB9785461, + 0xB9885462, 0xB9985463, 0xB9A85464, 0xB9B85465, 0xB9C85466, 0xB9D85467, 0xB9E85268, 0xB9F84F69, + 0xBA084D6A, 0xBA184A6B, 0xBA2854A1, 0xBA3854A2, 0xBA4854A3, 0xBA5854A4, 0xBA6854A5, 0xBA7854A6, + 0xBA8854A7, 0xBA9852A8, 0xBAA84FA9, 0xBAB84DAA, 0xBAC84AAB, 0xBAD854E1, 0xBAE854E2, 0xBAF854E3, + 0xBB0854E4, 0xBB1853E5, 0xBB2850E6, 0xBB384FE7, 0xBB484DE8, 0xBB584AE9, 0xBB6849EA, 0xBB7847EB, + 0xBB8854B1, 0xBB9854B2, 0xBBA854B3, 0xBBB854B4, 0xBBC854B5, 0xBBD854B6, 0xBBE854B7, 0xBBF852B8, + 0xBC084FB9, 0xBC184DBA, 0xBC284ABB, 0xBC3854F1, 0xBC4854F2, 0xBC5854F3, 0xBC6854F4, 0xBC7853F5, + 0xBC8850F6, 0xBC984FF7, 0xBCA84DF8, 0xBCB84AF9, 0xBCC849FA, 0xBCD847FB, 0xBCE11408, 0xBCF11409, + 0xBD01140A, 0xBD11140B, 0xBD211448, 0xBD311449, 0xBD41144A, 0xBD51144B, 0xBD611488, 0xBD711489, + 0xBD81148A, 0xBD91148B, 0xBDA114C8, 0xBDB114C9, 0xBDC114CA, 0xBDD114CB, 0xBDE91402, 0xBDF91403, + 0xBE091404, 0xBE191405, 0xBE291406, 0xBE391407, 0xBE491408, 0xBE591409, 0xBE69140A, 0xBE79140B, + 0xBE891442, 0xBE991443, 0xBEA91444, 0xBEB91445, 0xBEC91446, 0xBED91447, 0xBEE91448, 0xBEF91449, + 0xBF09144A, 0xBF19144B, 0xBF291482, 0xBF391483, 0xBF491484, 0xBF591485, 0xBF691486, 0xBF791487, + 0xBF891488, 0xBF991489, 0xBFA9148A, 0xBFB9148B, 0xBFC914C2, 0xBFD914C3, 0xBFE914C4, 0xBFF914C5, + 0xC00914C6, 0xC01914C7, 0xC02913C8, 0xC03911C9, 0xC04910CA, 0xC0590ECB, 0xC0691412, 0xC0791413, + 0xC0891414, 0xC0991415, 0xC0A91416, 0xC0B91417, 0xC0C91418, 0xC0D91419, 0xC0E9141A, 0xC0F9141B, + 0xC1091452, 0xC1191453, 0xC1291454, 0xC1391455, 0xC1491456, 0xC1591457, 0xC1691458, 0xC1791459, + 0xC189145A, 
0xC199145B, 0xC1A91492, 0xC1B91493, 0xC1C91494, 0xC1D91495, 0xC1E91496, 0xC1F91497, + 0xC2091498, 0xC2191499, 0xC229149A, 0xC239149B, 0xC24914D2, 0xC25914D3, 0xC26914D4, 0xC27914D5, + 0xC28914D6, 0xC29914D7, 0xC2A913D8, 0xC2B911D9, 0xC2C910DA, 0xC2D90EDB, 0xC2E91422, 0xC2F91423, + 0xC3091424, 0xC3191425, 0xC3291426, 0xC3391427, 0xC3491428, 0xC3591429, 0xC369142A, 0xC379142B, + 0xC3891462, 0xC3991463, 0xC3A91464, 0xC3B91465, 0xC3C91466, 0xC3D91467, 0xC3E91468, 0xC3F91469, + 0xC409146A, 0xC419146B, 0xC42914A2, 0xC43914A3, 0xC44914A4, 0xC45914A5, 0xC46914A6, 0xC47914A7, + 0xC48914A8, 0xC49914A9, 0xC4A914AA, 0xC4B914AB, 0xC4C914E2, 0xC4D914E3, 0xC4E914E4, 0xC4F914E5, + 0xC50914E6, 0xC51914E7, 0xC52913E8, 0xC53911E9, 0xC54910EA, 0xC5590EEB, 0xC56914B2, 0xC57914B3, + 0xC58914B4, 0xC59914B5, 0xC5A914B6, 0xC5B914B7, 0xC5C914B8, 0xC5D914B9, 0xC5E914BA, 0xC5F914BB, + 0xC60914F2, 0xC61914F3, 0xC62914F4, 0xC63914F5, 0xC64914F6, 0xC65914F7, 0xC66913F8, 0xC67911F9, + 0xC68910FA, 0xC6990EFB, 0xC6A03408, 0xC6B03409, 0xC6C0340A, 0xC6D0340B, 0xC6E03448, 0xC6F03449, + 0xC700344A, 0xC710344B, 0xC7203488, 0xC7303489, 0xC740348A, 0xC750348B, 0xC76034C8, 0xC77034C9, + 0xC78034CA, 0xC79034CB, 0xC7A83402, 0xC7B83403, 0xC7C83404, 0xC7D83405, 0xC7E83406, 0xC7F83407, + 0xC8083408, 0xC8183409, 0xC828340A, 0xC838340B, 0xC8483442, 0xC8583443, 0xC8683444, 0xC8783445, + 0xC8883446, 0xC8983447, 0xC8A83448, 0xC8B83449, 0xC8C8344A, 0xC8D8344B, 0xC8E83482, 0xC8F83483, + 0xC9083484, 0xC9183485, 0xC9283486, 0xC9383487, 0xC9483488, 0xC9583489, 0xC968348A, 0xC978348B, + 0xC98834C2, 0xC99834C3, 0xC9A834C4, 0xC9B834C5, 0xC9C834C6, 0xC9D834C7, 0xC9E833C8, 0xC9F831C9, + 0xCA0830CA, 0xCA182ECB, 0xCA283412, 0xCA383413, 0xCA483414, 0xCA583415, 0xCA683416, 0xCA783417, + 0xCA883418, 0xCA983419, 0xCAA8341A, 0xCAB8341B, 0xCAC83452, 0xCAD83453, 0xCAE83454, 0xCAF83455, + 0xCB083456, 0xCB183457, 0xCB283458, 0xCB383459, 0xCB48345A, 0xCB58345B, 0xCB683492, 0xCB783493, + 0xCB883494, 0xCB983495, 0xCBA83496, 0xCBB83497, 
0xCBC83498, 0xCBD83499, 0xCBE8349A, 0xCBF8349B, + 0xCC0834D2, 0xCC1834D3, 0xCC2834D4, 0xCC3834D5, 0xCC4834D6, 0xCC5834D7, 0xCC6833D8, 0xCC7831D9, + 0xCC8830DA, 0xCC982EDB, 0xCCA83422, 0xCCB83423, 0xCCC83424, 0xCCD83425, 0xCCE83426, 0xCCF83427, + 0xCD083428, 0xCD183429, 0xCD28342A, 0xCD38342B, 0xCD483462, 0xCD583463, 0xCD683464, 0xCD783465, + 0xCD883466, 0xCD983467, 0xCDA83468, 0xCDB83469, 0xCDC8346A, 0xCDD8346B, 0xCDE834A2, 0xCDF834A3, + 0xCE0834A4, 0xCE1834A5, 0xCE2834A6, 0xCE3834A7, 0xCE4834A8, 0xCE5834A9, 0xCE6834AA, 0xCE7834AB, + 0xCE8834E2, 0xCE9834E3, 0xCEA834E4, 0xCEB834E5, 0xCEC834E6, 0xCED834E7, 0xCEE833E8, 0xCEF831E9, + 0xCF0830EA, 0xCF182EEB, 0xCF2834B2, 0xCF3834B3, 0xCF4834B4, 0xCF5834B5, 0xCF6834B6, 0xCF7834B7, + 0xCF8834B8, 0xCF9834B9, 0xCFA834BA, 0xCFB834BB, 0xCFC834F2, 0xCFD834F3, 0xCFE834F4, 0xCFF834F5, + 0xD00834F6, 0xD01834F7, 0xD02833F8, 0xD03831F9, 0xD04830FA, 0xD0582EFB, +}; + +uniform int get_bits(uniform uint32_t value, uniform int from, uniform int to) +{ + return (value >> from) & ((1 << (to + 1 - from)) - 1); +} + +void load_mode_parameters(uniform astc_mode* uniform mode, uniform uint32_t packed_mode) +{ + mode->width = 2 + get_bits(packed_mode, 13, 15); // 2..8 <= 2^3 + mode->height = 2 + get_bits(packed_mode, 16, 18); // 2..8 <= 2^3 + mode->dual_plane = get_bits(packed_mode, 19, 19); // 0 or 1 + mode->partitions = 1; + + mode->weight_range = get_bits(packed_mode, 0, 3); // 0..11 <= 2^4 + mode->color_component_selector = get_bits(packed_mode, 4, 5); // 0..2 <= 2^2 + mode->partition_id = 0; + mode->color_endpoint_modes[0] = get_bits(packed_mode, 6, 7) * 2 + 6; // 6 or 8 + mode->color_endpoint_pairs = 1 + (mode->color_endpoint_modes[0] / 4); + mode->endpoint_range = get_bits(packed_mode, 8, 12); // 0..20 <= 2^5 +} + +export void astc_rank_ispc(uniform rgba_surface src[], uniform int xx, uniform int yy, uniform uint32_t mode_buffer[], uniform astc_enc_settings settings[]) +{ + int tex_width = src->width / settings->block_width; + if (xx + 
programIndex >= tex_width) return; + + astc_rank_state _state; + varying astc_rank_state* uniform state = &_state; + + state->block_width = settings->block_width; + state->block_height = settings->block_height; + state->fastSkipTreshold = settings->fastSkipTreshold; + + assert(state->fastSkipTreshold <= 64); + + load_block_interleaved(state->pixels, src, xx + programIndex, yy, state->block_width, state->block_height); + if (settings->channels == 3) clear_alpha(state->pixels, state->block_width, state->block_height); + + compute_metrics(state); + + float threshold_error = 0; + int count = -1; + + for (uniform int id = 0; id < packed_modes_count; id++) + { + uniform uint32_t packed_mode = packed_modes[id]; + + uniform astc_mode _mode; + uniform astc_mode* uniform mode = &_mode; + load_mode_parameters(mode, packed_mode); + + if (mode->height > state->block_height) continue; + if (mode->width > state->block_width) continue; + + if (settings->channels == 3 && mode->color_endpoint_modes[0] > 8) continue; + + float error = estimate_error(state, mode); + count += 1; + + if (count < state->fastSkipTreshold) + { + state->best_modes[count] = packed_mode; + state->best_scores[count] = error; + + threshold_error = max(threshold_error, error); + } + else if (error < threshold_error) + { + insert_element(state, error, packed_mode, &threshold_error); + } + } + + assert(count >= 0); + + for (uniform int i = 0; i < state->fastSkipTreshold; i++) + { + mode_buffer[programCount * i + programIndex] = state->best_modes[i]; + } +} + +/////////////////////////////////////////////////////////// +// ASTC candidate encoding + +struct astc_block +{ + uniform int width; + uniform int height; + uniform uint8_t dual_plane; + int weight_range; + uint8_t weights[64]; + int color_component_selector; + + uniform int partitions; + int partition_id; + uniform int color_endpoint_pairs; + uniform int channels; + int color_endpoint_modes[4]; + int endpoint_range; + uint8_t endpoints[18]; +}; + +struct 
astc_enc_state +{ + float pixels[256]; + float scaled_pixels[256]; + uint32_t data[4]; + + // settings + uniform int block_width; + uniform int block_height; + uniform int pitch; + + uniform int refineIterations; +}; + +struct astc_enc_context +{ + // uniform parameters + int width; + int height; + int channels; + bool dual_plane; + int partitions; + int color_endpoint_pairs; +}; + +uniform static const float filter_data[309] = +{ + 0.688356,-0.188356, 0.414384, 0.085616, 0.085616, 0.414384,-0.188356, 0.688356, + 0.955516,-0.227273, 0.044484, 0.142349, 0.727273,-0.142349,-0.142349, 0.727273, + 0.142349, 0.044484,-0.227273, 0.955516, 0.600000,-0.200000, 0.400000, 0.000000, + 0.200000, 0.200000, 0.000000, 0.400000,-0.200000, 0.600000, 0.828571,-0.142857, + 0.028571, 0.342857, 0.285714,-0.057143,-0.142857, 0.714286,-0.142857,-0.057143, + 0.285714, 0.342857, 0.028571,-0.142857, 0.828571, 0.985714,-0.252381, 0.080952, + -0.014286, 0.057143, 1.009524,-0.323810, 0.057143,-0.085714, 0.485714, 0.485714, + -0.085714, 0.057143,-0.323810, 1.009524, 0.057143,-0.014286, 0.080952,-0.252381, + 0.985714, 0.510753,-0.177419, 0.381720,-0.048387, 0.252688, 0.080645, 0.080645, + 0.252688,-0.048387, 0.381720,-0.177419, 0.510753, 0.754228,-0.194882, 0.052858, + 0.398312, 0.147638,-0.040044,-0.016924, 0.547244,-0.148431,-0.148431, 0.547244, + -0.016924,-0.040044, 0.147638, 0.398312, 0.052858,-0.194882, 0.754228, 0.921235, + -0.216677, 0.063615,-0.013072, 0.210040, 0.577804,-0.169641, 0.034858,-0.164122, + 0.798726,-0.053828, 0.011061, 0.011061,-0.053828, 0.798726,-0.164122, 0.034858, + -0.169641, 0.577804, 0.210040,-0.013072, 0.063615,-0.216677, 0.921235, 0.996932, + -0.209923, 0.069231,-0.020846, 0.003068, 0.016362, 1.119589,-0.369231, 0.111180, + -0.016362,-0.035452, 0.240891, 0.800000,-0.240891, 0.035452, 0.035452,-0.240891, + 0.800000, 0.240891,-0.035452,-0.016362, 0.111180,-0.369231, 1.119589, 0.016362, + 0.003068,-0.020846, 0.069231,-0.209923, 0.996932, 0.415909,-0.165909, 0.343182, 
+ -0.093182, 0.234091, 0.015909, 0.161364, 0.088636, 0.088636, 0.161364, 0.015909, + 0.234091,-0.093182, 0.343182,-0.165909, 0.415909, 0.653807,-0.172170, 0.058458, + 0.395689, 0.040094,-0.013613, 0.189195, 0.209906,-0.071270,-0.068923, 0.422170, + -0.143341,-0.143341, 0.422170,-0.068923,-0.071270, 0.209906, 0.189195,-0.013613, + 0.040094, 0.395689, 0.058458,-0.172170, 0.653807, 0.805363,-0.204713, 0.061406, + -0.016387, 0.363455, 0.220460,-0.066129, 0.017647,-0.078453, 0.645632,-0.193664, + 0.051682,-0.121551, 0.455481, 0.081527,-0.021756,-0.021756, 0.081527, 0.455481, + -0.121551, 0.051682,-0.193664, 0.645632,-0.078453, 0.017647,-0.066129, 0.220460, + 0.363455,-0.016387, 0.061406,-0.204713, 0.805363, 0.881593,-0.204539, 0.075065, + -0.021559, 0.004453, 0.270644, 0.467517,-0.171576, 0.049278,-0.010179,-0.169588, + 0.821023,-0.159819, 0.045902,-0.009481,-0.012311, 0.059603, 0.756331,-0.217226, + 0.044870, 0.044870,-0.217226, 0.756331, 0.059603,-0.012311,-0.009481, 0.045902, + -0.159819, 0.821023,-0.169588,-0.010179, 0.049278,-0.171576, 0.467517, 0.270644, + 0.004453,-0.021559, 0.075065,-0.204539, 0.881593, 0.967275,-0.287351, 0.076902, + -0.018670, 0.005432,-0.000959, 0.104719, 0.919524,-0.246087, 0.059743,-0.017382, + 0.003067,-0.127990, 0.653915, 0.300773,-0.073019, 0.021245,-0.003749, 0.064956, + -0.331864, 1.007366,-0.105833, 0.030792,-0.005434,-0.006723, 0.034349,-0.104266, + 0.996397,-0.289905, 0.051160,-0.005112, 0.026120,-0.079287, 0.323158, 0.571013, + -0.100767, 0.003834,-0.019590, 0.059465,-0.242368, 0.905074, 0.075575,-0.000959, + 0.004898,-0.014866, 0.060592,-0.226268, 0.981106, +}; + +uniform static const int filterbank[5][5] = +{ + { 0, 8, -1, -1, -1 }, + { 20, 30, 45, -1, -1 }, + { 65, 77, 95, 119, -1 }, + { -1, -1, -1, -1, -1 }, + { 149, 165, 189, 221, 261 }, +}; + +void scale_pixels(astc_enc_state state[], uniform astc_enc_context ctx[]) +{ + uniform int channels = ctx->channels; + uniform const float* uniform yfilter = 
&filter_data[filterbank[state->block_height - 4][ctx->height - 2]]; + uniform const float* uniform xfilter = &filter_data[filterbank[state->block_width - 4][ctx->width - 2]]; + + for (uniform int y = 0; y < ctx->height; y++) + { + float line[8][4]; + + if (state->block_height == ctx->height) + { + for (uniform int x = 0; x < state->block_width; x++) + for (uniform int p = 0; p < channels; p++) + line[x][p] = get_pixel(state->pixels, p, x, y); + } + else + for (uniform int x = 0; x < state->block_width; x++) + { + uniform int n = ctx->height; + + for (uniform int p = 0; p < channels; p++) line[x][p] = 0; + + for (uniform int k = 0; k < state->block_height; k++) + for (uniform int p = 0; p < channels; p++) + line[x][p] += yfilter[k * n + y] * get_pixel(state->pixels, p, x, k); + } + + if (state->block_width == ctx->width) + { + for (uniform int x = 0; x < ctx->width; x++) + for (uniform int p = 0; p < channels; p++) + set_pixel(state->scaled_pixels, p, x, y, clamp(line[x][p], 0, 255)); + } + else + for (uniform int x = 0; x < ctx->width; x++) + { + uniform int n = ctx->width; + + float value[4] = { 0, 0, 0, 0 }; + + for (uniform int k = 0; k < state->block_width; k++) + for (uniform int p = 0; p < channels; p++) + value[p] += xfilter[k * n + x] * line[k][p]; + + for (uniform int p = 0; p < channels; p++) + set_pixel(state->scaled_pixels, p, x, y, clamp(value[p], 0, 255)); + } + } +} + +inline int clamp_unorm8(int value) +{ + if (value < 0) return 0; + if (value > 255) return 255; + return value; +} + +inline void apply_blue_contract(int& r, int& g, int& b) +{ + r = (r + b) >> 1; + g = (g + b) >> 1; +} + +void decode_endpoints(float endpoints[8], uint8_t coded_endpoints[], int mode) +{ + if ((mode % 4) == 2) + { + int v0 = coded_endpoints[0]; + int v1 = coded_endpoints[1]; + int v2 = coded_endpoints[2]; + int v3 = coded_endpoints[3]; + int v4 = coded_endpoints[4]; + int v5 = coded_endpoints[5]; + + endpoints[0] = (v0 * v3) >> 8; + endpoints[1] = (v1 * v3) >> 8; + 
endpoints[2] = (v2 * v3) >> 8; + endpoints[3] = 0xFF; + + endpoints[4] = v0; + endpoints[5] = v1; + endpoints[6] = v2; + endpoints[7] = 0xFF; + + if (mode > 8) + { + endpoints[3] = clamp_unorm8(v4); + endpoints[7] = clamp_unorm8(v5); + } + } + + if ((mode % 4) == 0) + { + int v0 = coded_endpoints[0]; + int v1 = coded_endpoints[1]; + int v2 = coded_endpoints[2]; + int v3 = coded_endpoints[3]; + int v4 = coded_endpoints[4]; + int v5 = coded_endpoints[5]; + int v6 = coded_endpoints[6]; + int v7 = coded_endpoints[7]; + + bool swap_endpoints = v1 + v3 + v5 < v0 + v2 + v4; + + if (swap_endpoints) + { + swap(v0, v1); + swap(v2, v3); + swap(v4, v5); + swap(v6, v7); + + apply_blue_contract(v0, v2, v4); + apply_blue_contract(v1, v3, v5); + } + + endpoints[0] = clamp_unorm8(v0); + endpoints[1] = clamp_unorm8(v2); + endpoints[2] = clamp_unorm8(v4); + endpoints[3] = 0xFF; + + endpoints[4] = clamp_unorm8(v1); + endpoints[5] = clamp_unorm8(v3); + endpoints[6] = clamp_unorm8(v5); + endpoints[7] = 0xFF; + + if (mode > 8) + { + endpoints[3] = clamp_unorm8(v6); + endpoints[7] = clamp_unorm8(v7); + } + } +} + +void dequant_decode_endpoints(float endpoints[8], uint8_t block_endpoints[], int mode, int range) +{ + int levels = get_levels(range); + int num_cem_pairs = 1 + mode / 4; + + uint8_t dequant_endpoints[8]; + for (uniform int k = 0; k < 2 * num_cem_pairs; k++) + { + dequant_endpoints[k] = (int)(((int)block_endpoints[k]) * 255.0f / (levels - 1) + 0.5); + } + + decode_endpoints(endpoints, dequant_endpoints, mode); +} + +bool compare_endpoints(uint8_t endpoints[8], astc_block block[]) +{ + int sum = 0; + for (uniform int p = 0; p < 3; p++) + { + sum += endpoints[p * 2 + 0]; + sum -= endpoints[p * 2 + 1]; + } + + if (-2 <= sum && sum <= 2) + { + // avoid being too close so we don't need proper rounding + for (uniform int p = 0; p < 3; p++) + { + if (sum<=0) + endpoints[p * 2 + 0] = clamp(endpoints[p * 2 + 0] - 1, 0, get_levels(block->endpoint_range) - 1); + if (sum>0) + endpoints[p * 
2 + 1] = clamp(endpoints[p * 2 + 1] - 1, 0, get_levels(block->endpoint_range) - 1); + } + + sum = 0; + for (uniform int p = 0; p < 3; p++) + { + sum += endpoints[p * 2 + 0]; + sum -= endpoints[p * 2 + 1]; + } + } + + return sum > 0; +} + +void reorder_endpoints(uint8_t endpoints[8], astc_block block[], bool blue_contract) +{ + if (compare_endpoints(endpoints, block) == !blue_contract) + for (uniform int p = 0; p < 4; p++) swap(endpoints[p * 2], endpoints[p * 2 + 1]); +} + +inline int quant_endpoint(float value, int levels) +{ + return clamp(value / 255.0f * (levels - 1) + 0.5, 0, levels - 1); +} + +void quantize_endpoints_scale(astc_block block[], float endpoints[4]) +{ + int ep_levels = get_levels(block->endpoint_range); + + float near[3]; + float far[3]; + for (uniform int p = 0; p < 3; p++) + { + near[p] = endpoints[p * 2 + 0]; + far[p] = endpoints[p * 2 + 1]; + } + + for (uniform int p = 0; p < 3; p++) + block->endpoints[p] = quant_endpoint(far[p], ep_levels); + + float sq_norm = dot3(far, far) + 0.00001; + float scale = dot3(far, near) / sq_norm; + + block->endpoints[3] = quant_endpoint(scale * 256, ep_levels); + + if (block->color_endpoint_modes[0] > 8) + { + block->endpoints[4] = quant_endpoint(endpoints[3 * 2 + 0], ep_levels); + block->endpoints[5] = quant_endpoint(endpoints[3 * 2 + 1], ep_levels); + } +} + +void quantize_endpoints_pair(astc_block block[], float endpoints[6]) +{ + int ep_levels = get_levels(block->endpoint_range); + + bool blue_contract = true; + + float blue_compressed[6]; + for (uniform int i = 0; i < 2; i++) + { + blue_compressed[i + 0] = endpoints[i + 0] * 2 - endpoints[i + 4]; + blue_compressed[i + 2] = endpoints[i + 2] * 2 - endpoints[i + 4]; + blue_compressed[i + 4] = endpoints[i + 4]; + + if (blue_compressed[i + 0] < 0) blue_contract = false; + if (blue_compressed[i + 0] > 255) blue_contract = false; + if (blue_compressed[i + 2] < 0) blue_contract = false; + if (blue_compressed[i + 2] > 255) blue_contract = false; + } + + if 
(blue_contract) + { + for (uniform int p = 0; p < 3; p++) + { + block->endpoints[p * 2 + 0] = quant_endpoint(blue_compressed[p * 2 + 0], ep_levels); + block->endpoints[p * 2 + 1] = quant_endpoint(blue_compressed[p * 2 + 1], ep_levels); + } + } + else + { + for (uniform int p = 0; p < 3; p++) + { + block->endpoints[p * 2 + 0] = quant_endpoint(endpoints[p * 2 + 0], ep_levels); + block->endpoints[p * 2 + 1] = quant_endpoint(endpoints[p * 2 + 1], ep_levels); + } + } + + if (block->color_endpoint_modes[0] > 8) + { + block->endpoints[6] = quant_endpoint(endpoints[3 * 2 + 0], ep_levels); + block->endpoints[7] = quant_endpoint(endpoints[3 * 2 + 1], ep_levels); + } + + reorder_endpoints(block->endpoints, block, blue_contract); +} + +void quantize_endpoints(astc_block block[], float endpoints[]) +{ + bool zero_based = (block->color_endpoint_modes[0] % 4) == 2; + + if (zero_based) + { + quantize_endpoints_scale(block, endpoints); + } + else + { + quantize_endpoints_pair(block, endpoints); + } +} + +void opt_weights(float scaled_pixels[], astc_block block[]) +{ + uniform int channels = 4; + if (block->dual_plane) channels = 3; + + float rec_endpoints[8]; + dequant_decode_endpoints(rec_endpoints, block->endpoints, block->color_endpoint_modes[0], block->endpoint_range); + + int w_levels = get_levels(block->weight_range); + + float dir[4]; dir[3] = 0; + for (uniform int p = 0; p < channels; p++) dir[p] = rec_endpoints[4 + p] - rec_endpoints[0 + p]; + float sq_norm = dot4(dir, dir) + 0.00001; + for (uniform int p = 0; p < channels; p++) dir[p] *= (w_levels - 1) / sq_norm; + + for (uniform int y = 0; y < block->height; y++) + for (uniform int x = 0; x < block->width; x++) + { + float pixel[4]; pixel[3] = 0; + for (uniform int p = 0; p < channels; p++) pixel[p] = get_pixel(scaled_pixels, p, x, y) - rec_endpoints[0 + p]; + + int q = clamp(dot4(pixel, dir) + 0.5, 0, w_levels - 1); + + block->weights[y * block->width + x] = q; + } +} + +bool sgesv2(float A[4], float x[2], float b[2]) 
+{ + float det = (A[0] * A[3] - A[1] * A[2]); + if(det == 0) + return false; + float inv_det = 1.0f / det; + x[0] = (b[0] * +A[3] + b[1] * -A[2])*inv_det; + x[1] = (b[0] * -A[1] + b[1] * +A[0])*inv_det; + + return true; +} + +void ls_refine_scale(float endpoints[8], float scaled_pixels[], astc_block block[]) +{ + int levels = get_levels(block->weight_range); + float levels_rcp = 1.0f / (levels - 1); + + // In this mode, the endpoints are on a line through 0, and the first endpoint + // is a scaled version of the second endpoint with a scale factor 0 <= s < 1. + // + // Determining optimal s and endpoints is a non-linear problem; approximate it + // by first solving for the scale factor and then separately solving for the + // endpoint value. + // + // The scale factor solve starts from (where e_0, e_1 are RGB endpoints) + // e_0 = s * e_1 + // + // thus for an interpolation weight w + // lerp(e_0, e_1, w) + // = lerp(s * e_1, e_1, w) + // = lerp(s, 1, w) * e_1 + // = (s + (1-s)*w) * e_1 + // + // and if we take look at the 2-norm (Euclidean length) of that vector + // + // len(lerp(e_0, e_1, w)) = (s + (1-s)*w) * len(e_1) + // + // if we consider s and (1-s) as separate unknowns xx_0 and xx_1, we get + // an overdetermined linear system for the 2D vector lengths of the pixels + // (d_i in the following) that we can solve in a Least-Squares sense via + // the Normal Equations and Cramer's rule: + // + // [1 w_1] [d_1] + // [1 w_2] [xx_0] [d_2] + // [1 w_3] [xx_1] = [d_3] + // [ ... ] [...] + // [1 w_N] [d_N] + // + // we then recover s (the ratio between the lengths of e_0 and e_1) as + // xx_0 / (xx_0 + xx_1). 
+ float sum_w = 0; + float sum_ww = 0; + float sum_d = 0; + float sum_wd = 0; + + for (uniform int y = 0; y < block->height; y++) + for (uniform int x = 0; x < block->width; x++) + { + float w = (int)block->weights[y * block->width + x] * levels_rcp; + float d = 0; + + for (uniform int p = 0; p < 3; p++) d += sq(get_pixel(scaled_pixels, p, x, y)); + d = sqrt(d+0.01f); + + sum_w += w; + sum_ww += w*w; + sum_d += d; + sum_wd += w*d; + } + + float sum_1 = 1.0f * block->height * block->width; + + float C[4] = { sum_1, sum_w, sum_w, sum_ww }; + float b[2] = { sum_d, sum_wd }; + float xx[2]; + + // Singular configurations are precisely those where all weights are equal, i.e. constant color. + // Used to set scale=0 in this case but that's really bad when that weight is 0 (we're now making + // the whole block black). scale=1 isn't ideal (and in fact not something we can hit exactly) + // but at least leaves the endpoints in roughly the right place. + float scale = 1; + if(sgesv2(C, xx, b)) + { + scale = xx[0] / (xx[1] + xx[0]); + if (xx[1] + xx[0] < 1) scale = 1; + scale = clamp(scale, 0.0f, 0.9999f); // note: clamp also takes care of possible NaNs + } + + // Now, solve another Least Squares system for the actual endpoint values given the previously + // determined scale. This time we're trying to solve, for every pixel p_i + // + // p_i = lerp(e_0, e_1, w) = (s + (1-s)*w_i) * e_1 + // + // let z_i := s + (1 - s)*w_i, then we get the overdetermined linear system + // + // [z_1] [p_1^T] + // [z_2] [p_2^T] + // [z_3] [e_1]^T = [p_3^T] + // [...] [ ... ] + // [z_N] [p_N^T] + // + // which we solve in a least-squares sense for e_1. 
+ float sum_zz = 0; + float sum_zp[3] = { 0, 0, 0 }; + + for (uniform int y = 0; y < block->height; y++) + for (uniform int x = 0; x < block->width; x++) + { + float w = (int)block->weights[y * block->width + x] * levels_rcp; + float z = scale + (1 - scale)*w; + + sum_zz += z * z; + for (uniform int p = 0; p < 3; p++) sum_zp[p] += z * get_pixel(scaled_pixels, p, x, y); + } + + for (uniform int p = 0; p < 3; p++) endpoints[2 * p + 0] = scale * sum_zp[p] / sum_zz; + for (uniform int p = 0; p < 3; p++) endpoints[2 * p + 1] = sum_zp[p] / sum_zz; + + if (block->channels == 4) + { + float Atb1 = 0; + float sum_q = 0; + float sum_qq = 0; + float sum[2] = { 0, 0 }; + + for (uniform int y = 0; y < block->height; y++) + for (uniform int x = 0; x < block->width; x++) + { + int q = block->weights[y * block->width + x]; + int z = (levels - 1) - q; + + sum_q += q; + sum_qq += q*q; + + sum[1] += 1; + sum[0] += get_pixel(scaled_pixels, 3, x, y); + Atb1 += z * get_pixel(scaled_pixels, 3, x, y); + } + + float Atb2 = (levels - 1)*sum[0] - Atb1; + + float Cxx = sum[1] * sq(levels - 1) - 2 * (levels - 1)*sum_q + sum_qq; + float Cyy = sum_qq; + float Cxy = (levels - 1)*sum_q - sum_qq; + float scale = 1.0f / (Cxx*Cyy - Cxy*Cxy); + + float ep[8]; + ep[0 + 3] = (levels - 1)*(Atb1 * Cyy - Atb2 * Cxy)*scale; + ep[4 + 3] = (levels - 1)*(Atb2 * Cxx - Atb1 * Cxy)*scale; + + if (abs(Cxx*Cyy - Cxy*Cxy) < 0.001) + { + ep[0 + 3] = sum[0] / sum[1]; + ep[4 + 3] = ep[0 + 3]; + } + + endpoints[6 + 0] = ep[0 + 3]; + endpoints[6 + 1] = ep[4 + 3]; + } +} + +void ls_refine_pair(float endpoints[6], float scaled_pixels[], astc_block block[]) +{ + uniform int channels = block->channels; + int levels = get_levels(block->weight_range); + + float Atb1[4] = { 0, 0, 0, 0 }; + float sum_q = 0; + float sum_qq = 0; + float sum[5] = { 0, 0, 0, 0, 0 }; + + for (uniform int y = 0; y < block->height; y++) + for (uniform int x = 0; x < block->width; x++) + { + int q = block->weights[y * block->width + x]; + + int z = 
(levels - 1) - q; + + sum_q += q; + sum_qq += q*q; + + sum[4] += 1; + for (uniform int p = 0; p < channels; p++) sum[p] += get_pixel(scaled_pixels, p, x, y); + for (uniform int p = 0; p < channels; p++) Atb1[p] += z * get_pixel(scaled_pixels, p, x, y); + } + + float Atb2[4]; + for (uniform int p = 0; p < channels; p++) + { + Atb2[p] = (levels - 1)*sum[p] - Atb1[p]; + } + + float Cxx = sum[4] * sq(levels - 1) - 2 * (levels - 1)*sum_q + sum_qq; + float Cyy = sum_qq; + float Cxy = (levels - 1)*sum_q - sum_qq; + float scale = 1.0f / (Cxx*Cyy - Cxy*Cxy); + + float ep[8]; + for (uniform int p = 0; p < channels; p++) + { + ep[0 + p] = (levels - 1)*(Atb1[p] * Cyy - Atb2[p] * Cxy)*scale; + ep[4 + p] = (levels - 1)*(Atb2[p] * Cxx - Atb1[p] * Cxy)*scale; + } + + if (abs(Cxx*Cyy - Cxy*Cxy) < 0.001) + { + // flatten + for (int p = 0; p < channels; p++) + { + ep[0 + p] = sum[p] / sum[4]; + ep[4 + p] = ep[0 + p]; + } + } + + for (uniform int p = 0; p < channels; p++) + { + endpoints[2 * p + 0] = ep[0 + p]; + endpoints[2 * p + 1] = ep[4 + p]; + } +} + +void ls_refine(float endpoints[], float scaled_pixels[], astc_block block[]) +{ + if (block->color_endpoint_modes[0] % 4 == 2) + { + ls_refine_scale(endpoints, scaled_pixels, block); + } + else + { + ls_refine_pair(endpoints, scaled_pixels, block); + } +} + +float optimize_alt_plane(uint8_t alt_weights[], float scaled_pixels[], astc_block block[]) +{ + int ccs = block->color_component_selector; + + float ext[2] = { 1000, -1000 }; + + for (uniform int y = 0; y < block->height; y++) + for (uniform int x = 0; x < block->width; x++) + { + float value = get_pixel(scaled_pixels, 3, x, y); + ext[0] = min(ext[0], value); + ext[1] = max(ext[1], value); + } + + block->endpoints[3 * 2 + 0] = 0; + block->endpoints[3 * 2 + 1] = 255; + + float _rec_endpoints[8]; + dequant_decode_endpoints(_rec_endpoints, block->endpoints, block->color_endpoint_modes[0], block->endpoint_range); + + float endpoints[8]; + for (int p = 0; p < 3; p++) + { + 
endpoints[p * 2 + 0] = _rec_endpoints[0 + p]; + endpoints[p * 2 + 1] = _rec_endpoints[4 + p]; + } + + endpoints[3 * 2 + 0] = gather_float(endpoints, ccs * 2 + 0); + endpoints[3 * 2 + 1] = gather_float(endpoints, ccs * 2 + 1); + + scatter_float(endpoints, ccs * 2 + 0, ext[0]); + scatter_float(endpoints, ccs * 2 + 1, ext[1]); + + quantize_endpoints(block, endpoints); + + float rec_endpoints[8]; + dequant_decode_endpoints(rec_endpoints, block->endpoints, block->color_endpoint_modes[0], block->endpoint_range); + + float base = gather_float(rec_endpoints, 0 + ccs); + float dir = gather_float(rec_endpoints, 4 + ccs) - base; + float sq_norm = sq(dir) + 0.00001; + + int w_levels = get_levels(block->weight_range); + dir *= (w_levels - 1) / sq_norm; + + float err = 0; + for (uniform int y = 0; y < block->height; y++) + for (uniform int x = 0; x < block->width; x++) + { + float value = get_pixel(scaled_pixels, 3, x, y) - base; + + int q = clamp(value * dir + 0.5, 0, w_levels - 1); + + alt_weights[y * block->width + x] = q; + } + + if (dir < 0) + for (uniform int y = 0; y < block->height; y++) + for (uniform int x = 0; x < block->width; x++) + { + int q = block->weights[y * block->width + x]; + + block->weights[y * block->width + x] = w_levels - 1 - q; + } + + return err; +} + +void optimize_block(float scaled_pixels[], astc_block block[], astc_enc_state state[]) +{ + pixel_set pset; + pset.pixels = scaled_pixels; + pset.width = block->width; + pset.height = block->height; + + float ep[8]; + bool zero_based = (block->color_endpoint_modes[0] % 4) == 2; + compute_pca_endpoints(ep, &pset, zero_based, 4); + + quantize_endpoints(block, ep); + opt_weights(scaled_pixels, block); + + for (uniform int i = 0; i < state->refineIterations; i++) + { + ls_refine(ep, scaled_pixels, block); + quantize_endpoints(block, ep); + opt_weights(scaled_pixels, block); + } + + if (block->dual_plane) + { + uint8_t alt_weights[64]; + optimize_alt_plane(alt_weights, scaled_pixels, block); + + uint8_t 
block_weights[64]; + for (uniform int i = 0; i < block->width * block->height; i++) + { + block_weights[i] = block->weights[i]; + } + + for (uniform int i = 0; i < block->width * block->height; i++) + { + block->weights[i * 2 + 0] = block_weights[i]; + block->weights[i * 2 + 1] = alt_weights[i]; + } + } +} + +float measure_error(astc_block block[], astc_enc_state state[]) +{ + uniform int pitch = state->block_height * state->block_width; + assert(pitch <= 64); + + // dequant values + uniform int num_weights = block->width * block->height * (block->dual_plane ? 2 : 1); + + range_values weight_range_values = get_range_values(block->weight_range); + + int block_weights[64]; + for (int i = 0; i < num_weights; i++) + { + block_weights[i] = ((int)block->weights[i] * 64.0f / (weight_range_values.levels - 1) + 0.5); + } + + float rgba_endpoints[8]; + dequant_decode_endpoints(rgba_endpoints, block->endpoints, block->color_endpoint_modes[0], block->endpoint_range); + + uniform int stride = block->width; + uniform int Ds = (1024 + state->block_width / 2) / (state->block_width - 1); + uniform int Dt = (1024 + state->block_height / 2) / (state->block_height - 1); + + uint8_t main_weights[64]; + uint8_t alt_weights[64]; + + for (uniform int i = 0; i < num_weights; i++) main_weights[i] = block_weights[i]; + + if (block->dual_plane) + for (uniform int i = 0; i < num_weights/2; i++) + { + main_weights[i] = block_weights[i * 2 + 0]; + alt_weights[i] = block_weights[i * 2 + 1]; + } + + float sq_error = 0; + + for (uniform int y = 0; y < state->block_height; y++) + for (uniform int x = 0; x < state->block_width; x++) + { + uniform int gs = (x * Ds * (block->width - 1) + 32) >> 6; + uniform int gt = (y * Dt * (block->height - 1) + 32) >> 6; + + uniform int js = gs >> 4; + uniform int jt = gt >> 4; + + uniform int fs = gs & 0x0F; + uniform int ft = gt & 0x0F; + uniform int w11 = ((fs*ft + 8) >> 4); + + int filled_weight = 0; + int alt_filled_weight = 0; + + { + int acc = 0; + acc += 
main_weights[stride * (jt + 0) + js + 0] * (16 - ft - fs + w11); + acc += main_weights[stride * (jt + 0) + js + 1] * (fs - w11); + acc += main_weights[stride * (jt + 1) + js + 0] * (ft - w11); + acc += main_weights[stride * (jt + 1) + js + 1] * w11; + filled_weight = (acc + 8) >> 4; + } + + if (block->dual_plane) + { + int acc = 0; + acc += alt_weights[stride * (jt + 0) + js + 0] * (16 - ft - fs + w11); + acc += alt_weights[stride * (jt + 0) + js + 1] * (fs - w11); + acc += alt_weights[stride * (jt + 1) + js + 0] * (ft - w11); + acc += alt_weights[stride * (jt + 1) + js + 1] * w11; + alt_filled_weight = (acc + 8) >> 4; + } + + for (uniform int p = 0; p < block->channels; p++) + { + int C0 = rgba_endpoints[0 + p] * 256 + 128; + int C1 = rgba_endpoints[4 + p] * 256 + 128; + int w = filled_weight; + + if (block->dual_plane && block->color_component_selector == p) + { + w = alt_filled_weight; + } + + int C = (C0 * (64 - w) + C1 * w + 32) / 64; + + float diff = (C >> 8) - get_pixel(state->pixels, p, x, y); + sq_error += diff * diff; + } + } + + return sq_error; +} + +int code_value(int value, range_values range) +{ + int coded = value; + + if (range.levels_m != 2 && range.levels > 5) + { + int value2 = value; + if (value >= range.levels / 2) value2 = (range.levels - 1) - value; + int q = (value2 * range.levels_m_rcp) >> 16; + int r = value2 - q * range.levels_m; + coded = q + r * (1 << (range.levels_e - 1)); + coded = coded * 2 + ((value >= range.levels / 2) ? 1 : 0); + } + + return coded; +} + +void code_block(astc_block block[]) +{ + uniform int num_weights = block->width * block->height * (block->dual_plane ? 
2 : 1); + + range_values weight_range_values = get_range_values(block->weight_range); + for (uniform int i = 0; i < num_weights; i++) + { + block->weights[i] = code_value(block->weights[i], weight_range_values); + } + + range_values endpoint_range_values = get_range_values(block->endpoint_range); + for (uniform int i = 0; i < 2 * block->color_endpoint_pairs; i++) + { + block->endpoints[i] = code_value(block->endpoints[i], endpoint_range_values); + } +} + +extern "C" void pack_block_c(uniform uint32_t data[4], uniform astc_block block[]); + +void pack_block(astc_block block[], astc_enc_state state[]) +{ + code_block(block); + + foreach_active (instance) + { + uniform astc_block ublock; + + ublock.width = block->width; + ublock.height = block->height; + ublock.dual_plane = block->dual_plane; + ublock.partitions = block->partitions; + ublock.color_endpoint_pairs = block->color_endpoint_pairs; + + ublock.weight_range = extract(block->weight_range, instance); + ublock.color_component_selector = extract(block->color_component_selector, instance); + ublock.partition_id = extract(block->partition_id, instance); + ublock.endpoint_range = extract(block->endpoint_range, instance); + ublock.color_endpoint_modes[0] = extract(block->color_endpoint_modes[0], instance); + + uniform int num_weights = block->width * block->height * (block->dual_plane ? 
2 : 1); + for (uniform int i = 0; i < num_weights; i++) + ublock.weights[i] = extract(block->weights[i], instance); + + for (uniform int i = 0; i < 8; i++) + ublock.endpoints[i] = extract(block->endpoints[i], instance); + + uniform uint32_t data[4]; + pack_block_c(data, &ublock); + + for (uniform int i = 0; i < 4; i++) state->data[i] = insert(state->data[i], instance, data[i]); + } +} + +int get_bits(uint32_t value, uniform int from, uniform int to) +{ + return (value >> from) & ((1 << (to + 1 - from)) - 1); +} + +void load_block_parameters(astc_block block[], uint32_t mode, uniform astc_enc_context ctx[]) +{ + // uniform parameters + block->width = ctx->width; + block->height = ctx->height; + block->dual_plane = ctx->dual_plane; + block->partitions = ctx->partitions; + block->color_endpoint_pairs = ctx->color_endpoint_pairs; + block->channels = ctx->channels; + + // varying parameters + block->weight_range = get_bits(mode, 0, 3); // 0..11 <= 2^4 + block->color_component_selector = get_bits(mode, 4, 5); // 0..2 <= 2^2 + block->partition_id = 0; + block->color_endpoint_modes[0] = get_bits(mode, 6, 7) * 2 + 6; // 6, 8, 10 or 12 + block->endpoint_range = get_bits(mode, 8, 12); // 0..20 <= 2^5 +} + +export void astc_encode_ispc(uniform rgba_surface src[], uniform float block_scores[], uniform uint8_t dst[], uniform uint64_t list[], uniform astc_enc_context list_context[], uniform astc_enc_settings settings[]) +{ + uint64_t entry = list[programIndex]; + uint32_t offset = entry >> 32; + uint32_t mode = (entry & 0xFFFFFFFF); + if (mode == 0) return; + int yy = offset >> 16; + int xx = offset & 0xFFFF; + + int tex_width = src->width / settings->block_width; + + astc_enc_state _state; + varying astc_enc_state* uniform state = &_state; + + state->block_width = settings->block_width; + state->block_height = settings->block_height; + state->refineIterations = settings->refineIterations; + + load_block_interleaved(state->pixels, src, xx, yy, state->block_width, 
state->block_height); + + astc_block _block; + varying astc_block* uniform block = &_block; + + load_block_parameters(block, mode, list_context); + + scale_pixels(state, list_context); + if (block->channels == 3) clear_alpha(state->scaled_pixels, block->width, block->height); + + if (block->dual_plane) + { + pixel_set pset; + pset.pixels = state->scaled_pixels; + pset.width = block->width; + pset.height = block->height; + + rotate_plane(&pset, block->color_component_selector); + } + + optimize_block(state->scaled_pixels, block, state); + float error = measure_error(block, state); + + if (error < gather_float(block_scores, yy * tex_width + xx)) + { + pack_block(block, state); + + scatter_float(block_scores, yy * tex_width + xx, error); + + for (uniform int i = 0; i < 4; i++) + scatter_uint((uint32_t*)dst, (yy * tex_width + xx) * 4 + i, state->data[i]); + } +} diff --git a/Source/meson.build b/Source/meson.build new file mode 100644 index 0000000..cd61c27 --- /dev/null +++ b/Source/meson.build @@ -0,0 +1,26 @@ +sources = files([ + 'createdfd.cpp', + 'HalfFloat.cpp', + 'Main.cpp', + 'stb_image_resize.cpp', + 'stb_image.cpp', + 'vk2dfd.cpp' +]) + +incdirs = include_directories([ +]) + +dependencies = [ +] + +ispc_kernel = custom_target('ipsc_kernel', input: ['ispc_texcomp/kernel.ispc'], output: ['kernel_ispc.o', 'kernel_ispc_avx2.o', 'kernel_ispc_sse4.o', 'kernel_ispc.h'], command: ['ispc', '-O3', '--arch=x86_64', '--target=sse4,avx2', '--opt=fast-math', '--pic', '@INPUT@', '-h', '@OUTDIR@/kernel_ispc.h', '-o', '@OUTPUT0@']) + +ispc_sources = [ + ispc_kernel, + 'ispc_texcomp/ispc_texcomp.cpp', + 'ispc_texcomp/ispc_texcomp.h' +] + +ispc_texcomp = static_library('ispc_texcomp', ispc_sources) + +executable('TextureTaffy', sources, dependencies: dependencies, include_directories: incdirs, install: true, install_dir: '', install_tag: 'exe', link_with: ispc_texcomp) \ No newline at end of file diff --git a/Source/stb_image.cpp b/Source/stb_image.cpp new file mode 100644 
index 0000000..badb3ef --- /dev/null +++ b/Source/stb_image.cpp @@ -0,0 +1,2 @@ +#define STB_IMAGE_IMPLEMENTATION +#include "stb_image.h" \ No newline at end of file diff --git a/Source/stb_image.h b/Source/stb_image.h new file mode 100644 index 0000000..5e807a0 --- /dev/null +++ b/Source/stb_image.h @@ -0,0 +1,7987 @@ +/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb + no warranty implied; use at your own risk + + Do this: + #define STB_IMAGE_IMPLEMENTATION + before you include this file in *one* C or C++ file to create the implementation. + + // i.e. it should look like this: + #include ... + #include ... + #include ... + #define STB_IMAGE_IMPLEMENTATION + #include "stb_image.h" + + You can #define STBI_ASSERT(x) before the #include to avoid using assert.h. + And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free + + + QUICK NOTES: + Primarily of interest to game developers and other people who can + avoid problematic images and only need the trivial interface + + JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib) + PNG 1/2/4/8/16-bit-per-channel + + TGA (not sure what subset, if a subset) + BMP non-1bpp, non-RLE + PSD (composited view only, no extra channels, 8/16 bit-per-channel) + + GIF (*comp always reports as 4-channel) + HDR (radiance rgbE format) + PIC (Softimage PIC) + PNM (PPM and PGM binary only) + + Animated GIF still needs a proper API, but here's one way to do it: + http://gist.github.com/urraka/685d9a6340b26b830d49 + + - decode from memory or through FILE (define STBI_NO_STDIO to remove code) + - decode from arbitrary I/O callbacks + - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON) + + Full documentation under "DOCUMENTATION" below. + + +LICENSE + + See end of file for license information. 
+ +RECENT REVISION HISTORY: + + 2.28 (2023-01-29) many error fixes, security errors, just tons of stuff + 2.27 (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes + 2.26 (2020-07-13) many minor fixes + 2.25 (2020-02-02) fix warnings + 2.24 (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically + 2.23 (2019-08-11) fix clang static analysis warning + 2.22 (2019-03-04) gif fixes, fix warnings + 2.21 (2019-02-25) fix typo in comment + 2.20 (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs + 2.19 (2018-02-11) fix warning + 2.18 (2018-01-30) fix warnings + 2.17 (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings + 2.16 (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes + 2.15 (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC + 2.14 (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs + 2.13 (2016-12-04) experimental 16-bit API, only for PNG so far; fixes + 2.12 (2016-04-02) fix typo in 2.11 PSD fix that caused crashes + 2.11 (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64 + RGB-format JPEG; remove white matting in PSD; + allocate large structures on the stack; + correct channel count for PNG & BMP + 2.10 (2016-01-22) avoid warning introduced in 2.09 + 2.09 (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED + + See end of file for full revision history. 
+ + + ============================ Contributors ========================= + + Image formats Extensions, features + Sean Barrett (jpeg, png, bmp) Jetro Lauha (stbi_info) + Nicolas Schulz (hdr, psd) Martin "SpartanJ" Golini (stbi_info) + Jonathan Dummer (tga) James "moose2000" Brown (iPhone PNG) + Jean-Marc Lienher (gif) Ben "Disch" Wenger (io callbacks) + Tom Seddon (pic) Omar Cornut (1/2/4-bit PNG) + Thatcher Ulrich (psd) Nicolas Guillemot (vertical flip) + Ken Miller (pgm, ppm) Richard Mitton (16-bit PSD) + github:urraka (animated gif) Junggon Kim (PNM comments) + Christopher Forseth (animated gif) Daniel Gibson (16-bit TGA) + socks-the-fox (16-bit PNG) + Jeremy Sawicki (handle all ImageNet JPGs) + Optimizations & bugfixes Mikhail Morozov (1-bit BMP) + Fabian "ryg" Giesen Anael Seghezzi (is-16-bit query) + Arseny Kapoulkine Simon Breuss (16-bit PNM) + John-Mark Allen + Carmelo J Fdez-Aguera + + Bug & warning fixes + Marc LeBlanc David Woo Guillaume George Martins Mozeiko + Christpher Lloyd Jerry Jansson Joseph Thomson Blazej Dariusz Roszkowski + Phil Jordan Dave Moore Roy Eltham + Hayaki Saito Nathan Reed Won Chun + Luke Graham Johan Duparc Nick Verigakis the Horde3D community + Thomas Ruf Ronny Chevalier github:rlyeh + Janez Zemva John Bartholomew Michal Cichon github:romigrou + Jonathan Blow Ken Hamada Tero Hanninen github:svdijk + Eugene Golushkov Laurent Gomila Cort Stratton github:snagar + Aruelien Pocheville Sergio Gonzalez Thibault Reuille github:Zelex + Cass Everitt Ryamond Barbiero github:grim210 + Paul Du Bois Engin Manap Aldo Culquicondor github:sammyhw + Philipp Wiesemann Dale Weiler Oriol Ferrer Mesia github:phprus + Josh Tobin Neil Bickford Matthew Gregan github:poppolopoppo + Julian Raschke Gregory Mullen Christian Floisand github:darealshinji + Baldur Karlsson Kevin Schmidt JR Smith github:Michaelangel007 + Brad Weinberger Matvey Cherevko github:mosra + Luca Sas Alexander Veselov Zack Middleton [reserved] + Ryan C. 
Gordon [reserved] [reserved] + DO NOT ADD YOUR NAME HERE + + Jacko Dirks + + To add your name to the credits, pick a random blank space in the middle and fill it. + 80% of merge conflicts on stb PRs are due to people adding their name at the end + of the credits. +*/ + +#ifndef STBI_INCLUDE_STB_IMAGE_H +#define STBI_INCLUDE_STB_IMAGE_H + +// DOCUMENTATION +// +// Limitations: +// - no 12-bit-per-channel JPEG +// - no JPEGs with arithmetic coding +// - GIF always returns *comp=4 +// +// Basic usage (see HDR discussion below for HDR usage): +// int x,y,n; +// unsigned char *data = stbi_load(filename, &x, &y, &n, 0); +// // ... process data if not NULL ... +// // ... x = width, y = height, n = # 8-bit components per pixel ... +// // ... replace '0' with '1'..'4' to force that many components per pixel +// // ... but 'n' will always be the number that it would have been if you said 0 +// stbi_image_free(data); +// +// Standard parameters: +// int *x -- outputs image width in pixels +// int *y -- outputs image height in pixels +// int *channels_in_file -- outputs # of image components in image file +// int desired_channels -- if non-zero, # of image components requested in result +// +// The return value from an image loader is an 'unsigned char *' which points +// to the pixel data, or NULL on an allocation failure or if the image is +// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels, +// with each pixel consisting of N interleaved 8-bit components; the first +// pixel pointed to is top-left-most in the image. There is no padding between +// image scanlines or between pixels, regardless of format. The number of +// components N is 'desired_channels' if desired_channels is non-zero, or +// *channels_in_file otherwise. If desired_channels is non-zero, +// *channels_in_file has the number of components that _would_ have been +// output otherwise. E.g. 
if you set desired_channels to 4, you will always +// get RGBA output, but you can check *channels_in_file to see if it's trivially +// opaque because e.g. there were only 3 channels in the source image. +// +// An output image with N components has the following components interleaved +// in this order in each pixel: +// +// N=#comp components +// 1 grey +// 2 grey, alpha +// 3 red, green, blue +// 4 red, green, blue, alpha +// +// If image loading fails for any reason, the return value will be NULL, +// and *x, *y, *channels_in_file will be unchanged. The function +// stbi_failure_reason() can be queried for an extremely brief, end-user +// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS +// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly +// more user-friendly ones. +// +// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized. +// +// To query the width, height and component count of an image without having to +// decode the full file, you can use the stbi_info family of functions: +// +// int x,y,n,ok; +// ok = stbi_info(filename, &x, &y, &n); +// // returns ok=1 and sets x, y, n if image is a supported format, +// // 0 otherwise. +// +// Note that stb_image pervasively uses ints in its public API for sizes, +// including sizes of memory buffers. This is now part of the API and thus +// hard to change without causing breakage. As a result, the various image +// loaders all have certain limits on image size; these differ somewhat +// by format but generally boil down to either just under 2GB or just under +// 1GB. When the decoded image would be larger than this, stb_image decoding +// will fail. +// +// Additionally, stb_image will reject image files that have any of their +// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS, +// which defaults to 2**24 = 16777216 pixels. 
Due to the above memory limit, +// the only way to have an image with such dimensions load correctly +// is for it to have a rather extreme aspect ratio. Either way, the +// assumption here is that such larger images are likely to be malformed +// or malicious. If you do need to load an image with individual dimensions +// larger than that, and it still fits in the overall size limit, you can +// #define STBI_MAX_DIMENSIONS on your own to be something larger. +// +// =========================================================================== +// +// UNICODE: +// +// If compiling for Windows and you wish to use Unicode filenames, compile +// with +// #define STBI_WINDOWS_UTF8 +// and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert +// Windows wchar_t filenames to utf8. +// +// =========================================================================== +// +// Philosophy +// +// stb libraries are designed with the following priorities: +// +// 1. easy to use +// 2. easy to maintain +// 3. good performance +// +// Sometimes I let "good performance" creep up in priority over "easy to maintain", +// and for best performance I may provide less-easy-to-use APIs that give higher +// performance, in addition to the easy-to-use ones. Nevertheless, it's important +// to keep in mind that from the standpoint of you, a client of this library, +// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all. +// +// Some secondary priorities arise directly from the first two, some of which +// provide more explicit reasons why performance can't be emphasized. +// +// - Portable ("ease of use") +// - Small source code footprint ("easy to maintain") +// - No dependencies ("ease of use") +// +// =========================================================================== +// +// I/O callbacks +// +// I/O callbacks allow you to read from arbitrary sources, like packaged +// files or some other source. 
Data read from callbacks are processed +// through a small internal buffer (currently 128 bytes) to try to reduce +// overhead. +// +// The three functions you must define are "read" (reads some bytes of data), +// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end). +// +// =========================================================================== +// +// SIMD support +// +// The JPEG decoder will try to automatically use SIMD kernels on x86 when +// supported by the compiler. For ARM Neon support, you must explicitly +// request it. +// +// (The old do-it-yourself SIMD API is no longer supported in the current +// code.) +// +// On x86, SSE2 will automatically be used when available based on a run-time +// test; if not, the generic C versions are used as a fall-back. On ARM targets, +// the typical path is to have separate builds for NEON and non-NEON devices +// (at least this is true for iOS and Android). Therefore, the NEON support is +// toggled by a build flag: define STBI_NEON to get NEON loops. +// +// If for some reason you do not want to use any of SIMD code, or if +// you have issues compiling it, you can disable it entirely by +// defining STBI_NO_SIMD. +// +// =========================================================================== +// +// HDR image support (disable by defining STBI_NO_HDR) +// +// stb_image supports loading HDR images in general, and currently the Radiance +// .HDR file format specifically. You can still load any file through the existing +// interface; if you attempt to load an HDR file, it will be automatically remapped +// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1; +// both of these constants can be reconfigured through this interface: +// +// stbi_hdr_to_ldr_gamma(2.2f); +// stbi_hdr_to_ldr_scale(1.0f); +// +// (note, do not use _inverse_ constants; stbi_image will invert them +// appropriately). 
+// +// Additionally, there is a new, parallel interface for loading files as +// (linear) floats to preserve the full dynamic range: +// +// float *data = stbi_loadf(filename, &x, &y, &n, 0); +// +// If you load LDR images through this interface, those images will +// be promoted to floating point values, run through the inverse of +// constants corresponding to the above: +// +// stbi_ldr_to_hdr_scale(1.0f); +// stbi_ldr_to_hdr_gamma(2.2f); +// +// Finally, given a filename (or an open file or memory block--see header +// file for details) containing image data, you can query for the "most +// appropriate" interface to use (that is, whether the image is HDR or +// not), using: +// +// stbi_is_hdr(char *filename); +// +// =========================================================================== +// +// iPhone PNG support: +// +// We optionally support converting iPhone-formatted PNGs (which store +// premultiplied BGRA) back to RGB, even though they're internally encoded +// differently. To enable this conversion, call +// stbi_convert_iphone_png_to_rgb(1). +// +// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per +// pixel to remove any premultiplied alpha *only* if the image file explicitly +// says there's premultiplied data (currently only happens in iPhone images, +// and only if iPhone convert-to-rgb processing is on). +// +// =========================================================================== +// +// ADDITIONAL CONFIGURATION +// +// - You can suppress implementation of any of the decoders to reduce +// your code footprint by #defining one or more of the following +// symbols before creating the implementation. 
+// +// STBI_NO_JPEG +// STBI_NO_PNG +// STBI_NO_BMP +// STBI_NO_PSD +// STBI_NO_TGA +// STBI_NO_GIF +// STBI_NO_HDR +// STBI_NO_PIC +// STBI_NO_PNM (.ppm and .pgm) +// +// - You can request *only* certain decoders and suppress all other ones +// (this will be more forward-compatible, as addition of new decoders +// doesn't require you to disable them explicitly): +// +// STBI_ONLY_JPEG +// STBI_ONLY_PNG +// STBI_ONLY_BMP +// STBI_ONLY_PSD +// STBI_ONLY_TGA +// STBI_ONLY_GIF +// STBI_ONLY_HDR +// STBI_ONLY_PIC +// STBI_ONLY_PNM (.ppm and .pgm) +// +// - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still +// want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB +// +// - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater +// than that size (in either width or height) without further processing. +// This is to let programs in the wild set an upper bound to prevent +// denial-of-service attacks on untrusted data, as one could generate a +// valid image of gigantic dimensions and force stb_image to allocate a +// huge block of memory and spend disproportionate time decoding it. By +// default this is set to (1 << 24), which is 16777216, but that's still +// very big. + +#ifndef STBI_NO_STDIO +#include +#endif // STBI_NO_STDIO + +#define STBI_VERSION 1 + +enum +{ + STBI_default = 0, // only used for desired_channels + + STBI_grey = 1, + STBI_grey_alpha = 2, + STBI_rgb = 3, + STBI_rgb_alpha = 4 +}; + +#include +typedef unsigned char stbi_uc; +typedef unsigned short stbi_us; + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef STBIDEF +#ifdef STB_IMAGE_STATIC +#define STBIDEF static +#else +#define STBIDEF extern +#endif +#endif + +////////////////////////////////////////////////////////////////////////////// +// +// PRIMARY API - works on images of any type +// + +// +// load image by filename, open file, or memory buffer +// + +typedef struct +{ + int (*read) (void *user,char *data,int size); // fill 'data' with 'size' bytes. 
return number of bytes actually read + void (*skip) (void *user,int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative + int (*eof) (void *user); // returns nonzero if we are at end of file/data +} stbi_io_callbacks; + +//////////////////////////////////// +// +// 8-bits-per-channel interface +// + +STBIDEF stbi_uc *stbi_load_from_memory (stbi_uc const *buffer, int len , int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk , void *user, int *x, int *y, int *channels_in_file, int desired_channels); + +#ifndef STBI_NO_STDIO +STBIDEF stbi_uc *stbi_load (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_uc *stbi_load_from_file (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels); +// for stbi_load_from_file, file pointer is left pointing immediately after image +#endif + +#ifndef STBI_NO_GIF +STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp); +#endif + +#ifdef STBI_WINDOWS_UTF8 +STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input); +#endif + +//////////////////////////////////// +// +// 16-bits-per-channel interface +// + +STBIDEF stbi_us *stbi_load_16_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels); + +#ifndef STBI_NO_STDIO +STBIDEF stbi_us *stbi_load_16 (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels); +#endif + +//////////////////////////////////// +// +// float-per-channel interface +// +#ifndef STBI_NO_LINEAR + STBIDEF 
float *stbi_loadf_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels); + STBIDEF float *stbi_loadf_from_callbacks (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels); + + #ifndef STBI_NO_STDIO + STBIDEF float *stbi_loadf (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels); + STBIDEF float *stbi_loadf_from_file (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels); + #endif +#endif + +#ifndef STBI_NO_HDR + STBIDEF void stbi_hdr_to_ldr_gamma(float gamma); + STBIDEF void stbi_hdr_to_ldr_scale(float scale); +#endif // STBI_NO_HDR + +#ifndef STBI_NO_LINEAR + STBIDEF void stbi_ldr_to_hdr_gamma(float gamma); + STBIDEF void stbi_ldr_to_hdr_scale(float scale); +#endif // STBI_NO_LINEAR + +// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user); +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len); +#ifndef STBI_NO_STDIO +STBIDEF int stbi_is_hdr (char const *filename); +STBIDEF int stbi_is_hdr_from_file(FILE *f); +#endif // STBI_NO_STDIO + + +// get a VERY brief reason for failure +// on most compilers (and ALL modern mainstream compilers) this is threadsafe +STBIDEF const char *stbi_failure_reason (void); + +// free the loaded image -- this is just free() +STBIDEF void stbi_image_free (void *retval_from_stbi_load); + +// get image dimensions & components without fully decoding +STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp); +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp); +STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len); +STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user); + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_info (char const *filename, 
int *x, int *y, int *comp); +STBIDEF int stbi_info_from_file (FILE *f, int *x, int *y, int *comp); +STBIDEF int stbi_is_16_bit (char const *filename); +STBIDEF int stbi_is_16_bit_from_file(FILE *f); +#endif + + + +// for image formats that explicitly notate that they have premultiplied alpha, +// we just return the colors as stored in the file. set this flag to force +// unpremultiplication. results are undefined if the unpremultiply overflow. +STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply); + +// indicate whether we should process iphone images back to canonical format, +// or just pass them through "as-is" +STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert); + +// flip the image vertically, so the first pixel in the output array is the bottom left +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip); + +// as above, but only applies to images loaded on the thread that calls the function +// this function is only available if your compiler supports thread-local variables; +// calling it will fail to link if your compiler doesn't +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply); +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert); +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip); + +// ZLIB client - used by PNG, available for other purposes + +STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen); +STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header); +STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen); +STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); + +STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen); +STBIDEF int 
stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); + + +#ifdef __cplusplus +} +#endif + +// +// +//// end header file ///////////////////////////////////////////////////// +#endif // STBI_INCLUDE_STB_IMAGE_H + +#ifdef STB_IMAGE_IMPLEMENTATION + +#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \ + || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \ + || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \ + || defined(STBI_ONLY_ZLIB) + #ifndef STBI_ONLY_JPEG + #define STBI_NO_JPEG + #endif + #ifndef STBI_ONLY_PNG + #define STBI_NO_PNG + #endif + #ifndef STBI_ONLY_BMP + #define STBI_NO_BMP + #endif + #ifndef STBI_ONLY_PSD + #define STBI_NO_PSD + #endif + #ifndef STBI_ONLY_TGA + #define STBI_NO_TGA + #endif + #ifndef STBI_ONLY_GIF + #define STBI_NO_GIF + #endif + #ifndef STBI_ONLY_HDR + #define STBI_NO_HDR + #endif + #ifndef STBI_ONLY_PIC + #define STBI_NO_PIC + #endif + #ifndef STBI_ONLY_PNM + #define STBI_NO_PNM + #endif +#endif + +#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB) +#define STBI_NO_ZLIB +#endif + + +#include +#include // ptrdiff_t on osx +#include +#include +#include + +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) +#include // ldexp, pow +#endif + +#ifndef STBI_NO_STDIO +#include +#endif + +#ifndef STBI_ASSERT +#include +#define STBI_ASSERT(x) assert(x) +#endif + +#ifdef __cplusplus +#define STBI_EXTERN extern "C" +#else +#define STBI_EXTERN extern +#endif + + +#ifndef _MSC_VER + #ifdef __cplusplus + #define stbi_inline inline + #else + #define stbi_inline + #endif +#else + #define stbi_inline __forceinline +#endif + +#ifndef STBI_NO_THREAD_LOCALS + #if defined(__cplusplus) && __cplusplus >= 201103L + #define STBI_THREAD_LOCAL thread_local + #elif defined(__GNUC__) && __GNUC__ < 5 + #define STBI_THREAD_LOCAL __thread + #elif defined(_MSC_VER) + #define STBI_THREAD_LOCAL __declspec(thread) 
+ #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) + #define STBI_THREAD_LOCAL _Thread_local + #endif + + #ifndef STBI_THREAD_LOCAL + #if defined(__GNUC__) + #define STBI_THREAD_LOCAL __thread + #endif + #endif +#endif + +#if defined(_MSC_VER) || defined(__SYMBIAN32__) +typedef unsigned short stbi__uint16; +typedef signed short stbi__int16; +typedef unsigned int stbi__uint32; +typedef signed int stbi__int32; +#else +#include +typedef uint16_t stbi__uint16; +typedef int16_t stbi__int16; +typedef uint32_t stbi__uint32; +typedef int32_t stbi__int32; +#endif + +// should produce compiler error if size is wrong +typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1]; + +#ifdef _MSC_VER +#define STBI_NOTUSED(v) (void)(v) +#else +#define STBI_NOTUSED(v) (void)sizeof(v) +#endif + +#ifdef _MSC_VER +#define STBI_HAS_LROTL +#endif + +#ifdef STBI_HAS_LROTL + #define stbi_lrot(x,y) _lrotl(x,y) +#else + #define stbi_lrot(x,y) (((x) << (y)) | ((x) >> (-(y) & 31))) +#endif + +#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED)) +// ok +#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED) +// ok +#else +#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)." 
+#endif + +#ifndef STBI_MALLOC +#define STBI_MALLOC(sz) malloc(sz) +#define STBI_REALLOC(p,newsz) realloc(p,newsz) +#define STBI_FREE(p) free(p) +#endif + +#ifndef STBI_REALLOC_SIZED +#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz) +#endif + +// x86/x64 detection +#if defined(__x86_64__) || defined(_M_X64) +#define STBI__X64_TARGET +#elif defined(__i386) || defined(_M_IX86) +#define STBI__X86_TARGET +#endif + +#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD) +// gcc doesn't support sse2 intrinsics unless you compile with -msse2, +// which in turn means it gets to use SSE2 everywhere. This is unfortunate, +// but previous attempts to provide the SSE2 functions with runtime +// detection caused numerous issues. The way architecture extensions are +// exposed in GCC/Clang is, sadly, not really suited for one-file libs. +// New behavior: if compiled with -msse2, we use SSE2 without any +// detection; if not, we don't use it at all. +#define STBI_NO_SIMD +#endif + +#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD) +// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET +// +// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the +// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant. +// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not +// simultaneously enabling "-mstackrealign". +// +// See https://github.com/nothings/stb/issues/81 for more information. +// +// So default to no SSE2 on 32-bit MinGW. If you've read this far and added +// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2. 
+#define STBI_NO_SIMD +#endif + +#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET)) +#define STBI_SSE2 +#include + +#ifdef _MSC_VER + +#if _MSC_VER >= 1400 // not VC6 +#include // __cpuid +static int stbi__cpuid3(void) +{ + int info[4]; + __cpuid(info,1); + return info[3]; +} +#else +static int stbi__cpuid3(void) +{ + int res; + __asm { + mov eax,1 + cpuid + mov res,edx + } + return res; +} +#endif + +#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name + +#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2) +static int stbi__sse2_available(void) +{ + int info3 = stbi__cpuid3(); + return ((info3 >> 26) & 1) != 0; +} +#endif + +#else // assume GCC-style if not VC++ +#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) + +#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2) +static int stbi__sse2_available(void) +{ + // If we're even attempting to compile this on GCC/Clang, that means + // -msse2 is on, which means the compiler is allowed to use SSE2 + // instructions at will, and so are we. 
+ return 1; +} +#endif + +#endif +#endif + +// ARM NEON +#if defined(STBI_NO_SIMD) && defined(STBI_NEON) +#undef STBI_NEON +#endif + +#ifdef STBI_NEON +#include +#ifdef _MSC_VER +#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name +#else +#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) +#endif +#endif + +#ifndef STBI_SIMD_ALIGN +#define STBI_SIMD_ALIGN(type, name) type name +#endif + +#ifndef STBI_MAX_DIMENSIONS +#define STBI_MAX_DIMENSIONS (1 << 24) +#endif + +/////////////////////////////////////////////// +// +// stbi__context struct and start_xxx functions + +// stbi__context structure is our basic context used by all images, so it +// contains all the IO context, plus some basic image information +typedef struct +{ + stbi__uint32 img_x, img_y; + int img_n, img_out_n; + + stbi_io_callbacks io; + void *io_user_data; + + int read_from_callbacks; + int buflen; + stbi_uc buffer_start[128]; + int callback_already_read; + + stbi_uc *img_buffer, *img_buffer_end; + stbi_uc *img_buffer_original, *img_buffer_original_end; +} stbi__context; + + +static void stbi__refill_buffer(stbi__context *s); + +// initialize a memory-decode context +static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len) +{ + s->io.read = NULL; + s->read_from_callbacks = 0; + s->callback_already_read = 0; + s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer; + s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len; +} + +// initialize a callback-based context +static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user) +{ + s->io = *c; + s->io_user_data = user; + s->buflen = sizeof(s->buffer_start); + s->read_from_callbacks = 1; + s->callback_already_read = 0; + s->img_buffer = s->img_buffer_original = s->buffer_start; + stbi__refill_buffer(s); + s->img_buffer_original_end = s->img_buffer_end; +} + +#ifndef STBI_NO_STDIO + +static int stbi__stdio_read(void *user, char *data, int size) 
+{ + return (int) fread(data,1,size,(FILE*) user); +} + +static void stbi__stdio_skip(void *user, int n) +{ + int ch; + fseek((FILE*) user, n, SEEK_CUR); + ch = fgetc((FILE*) user); /* have to read a byte to reset feof()'s flag */ + if (ch != EOF) { + ungetc(ch, (FILE *) user); /* push byte back onto stream if valid. */ + } +} + +static int stbi__stdio_eof(void *user) +{ + return feof((FILE*) user) || ferror((FILE *) user); +} + +static stbi_io_callbacks stbi__stdio_callbacks = +{ + stbi__stdio_read, + stbi__stdio_skip, + stbi__stdio_eof, +}; + +static void stbi__start_file(stbi__context *s, FILE *f) +{ + stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f); +} + +//static void stop_file(stbi__context *s) { } + +#endif // !STBI_NO_STDIO + +static void stbi__rewind(stbi__context *s) +{ + // conceptually rewind SHOULD rewind to the beginning of the stream, + // but we just rewind to the beginning of the initial buffer, because + // we only use it after doing 'test', which only ever looks at at most 92 bytes + s->img_buffer = s->img_buffer_original; + s->img_buffer_end = s->img_buffer_original_end; +} + +enum +{ + STBI_ORDER_RGB, + STBI_ORDER_BGR +}; + +typedef struct +{ + int bits_per_channel; + int num_channels; + int channel_order; +} stbi__result_info; + +#ifndef STBI_NO_JPEG +static int stbi__jpeg_test(stbi__context *s); +static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PNG +static int stbi__png_test(stbi__context *s); +static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__png_is16(stbi__context *s); +#endif + +#ifndef STBI_NO_BMP +static int stbi__bmp_test(stbi__context *s); +static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, 
int req_comp, stbi__result_info *ri); +static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_TGA +static int stbi__tga_test(stbi__context *s); +static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PSD +static int stbi__psd_test(stbi__context *s); +static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc); +static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__psd_is16(stbi__context *s); +#endif + +#ifndef STBI_NO_HDR +static int stbi__hdr_test(stbi__context *s); +static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PIC +static int stbi__pic_test(stbi__context *s); +static void *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_GIF +static int stbi__gif_test(stbi__context *s); +static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp); +static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PNM +static int stbi__pnm_test(stbi__context *s); +static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__pnm_is16(stbi__context *s); +#endif + +static +#ifdef STBI_THREAD_LOCAL +STBI_THREAD_LOCAL +#endif +const char 
*stbi__g_failure_reason; + +STBIDEF const char *stbi_failure_reason(void) +{ + return stbi__g_failure_reason; +} + +#ifndef STBI_NO_FAILURE_STRINGS +static int stbi__err(const char *str) +{ + stbi__g_failure_reason = str; + return 0; +} +#endif + +static void *stbi__malloc(size_t size) +{ + return STBI_MALLOC(size); +} + +// stb_image uses ints pervasively, including for offset calculations. +// therefore the largest decoded image size we can support with the +// current code, even on 64-bit targets, is INT_MAX. this is not a +// significant limitation for the intended use case. +// +// we do, however, need to make sure our size calculations don't +// overflow. hence a few helper functions for size calculations that +// multiply integers together, making sure that they're non-negative +// and no overflow occurs. + +// return 1 if the sum is valid, 0 on overflow. +// negative terms are considered invalid. +static int stbi__addsizes_valid(int a, int b) +{ + if (b < 0) return 0; + // now 0 <= b <= INT_MAX, hence also + // 0 <= INT_MAX - b <= INTMAX. + // And "a + b <= INT_MAX" (which might overflow) is the + // same as a <= INT_MAX - b (no overflow) + return a <= INT_MAX - b; +} + +// returns 1 if the product is valid, 0 on overflow. +// negative factors are considered invalid. 
+static int stbi__mul2sizes_valid(int a, int b) +{ + if (a < 0 || b < 0) return 0; + if (b == 0) return 1; // mul-by-0 is always safe + // portable way to check for no overflows in a*b + return a <= INT_MAX/b; +} + +#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) +// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow +static int stbi__mad2sizes_valid(int a, int b, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add); +} +#endif + +// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow +static int stbi__mad3sizes_valid(int a, int b, int c, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && + stbi__addsizes_valid(a*b*c, add); +} + +// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) +static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && + stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add); +} +#endif + +#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) +// mallocs with size overflow checking +static void *stbi__malloc_mad2(int a, int b, int add) +{ + if (!stbi__mad2sizes_valid(a, b, add)) return NULL; + return stbi__malloc(a*b + add); +} +#endif + +static void *stbi__malloc_mad3(int a, int b, int c, int add) +{ + if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL; + return stbi__malloc(a*b*c + add); +} + +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) +static void *stbi__malloc_mad4(int a, int b, int c, int d, int add) +{ + if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL; + return stbi__malloc(a*b*c*d + add); +} +#endif + +// returns 1 if the sum of two signed ints is valid (between -2^31 
and 2^31-1 inclusive), 0 on overflow. +static int stbi__addints_valid(int a, int b) +{ + if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so no overflow + if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0. + return a <= INT_MAX - b; +} + +// returns 1 if the product of two signed shorts is valid, 0 on overflow. +static int stbi__mul2shorts_valid(short a, short b) +{ + if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow + if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid + if (b < 0) return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN + return a >= SHRT_MIN / b; +} + +// stbi__err - error +// stbi__errpf - error returning pointer to float +// stbi__errpuc - error returning pointer to unsigned char + +#ifdef STBI_NO_FAILURE_STRINGS + #define stbi__err(x,y) 0 +#elif defined(STBI_FAILURE_USERMSG) + #define stbi__err(x,y) stbi__err(y) +#else + #define stbi__err(x,y) stbi__err(x) +#endif + +#define stbi__errpf(x,y) ((float *)(size_t) (stbi__err(x,y)?NULL:NULL)) +#define stbi__errpuc(x,y) ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL)) + +STBIDEF void stbi_image_free(void *retval_from_stbi_load) +{ + STBI_FREE(retval_from_stbi_load); +} + +#ifndef STBI_NO_LINEAR +static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp); +#endif + +#ifndef STBI_NO_HDR +static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp); +#endif + +static int stbi__vertically_flip_on_load_global = 0; + +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) +{ + stbi__vertically_flip_on_load_global = flag_true_if_should_flip; +} + +#ifndef STBI_THREAD_LOCAL +#define stbi__vertically_flip_on_load stbi__vertically_flip_on_load_global +#else +static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set; + +STBIDEF 
void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip) +{ + stbi__vertically_flip_on_load_local = flag_true_if_should_flip; + stbi__vertically_flip_on_load_set = 1; +} + +#define stbi__vertically_flip_on_load (stbi__vertically_flip_on_load_set \ + ? stbi__vertically_flip_on_load_local \ + : stbi__vertically_flip_on_load_global) +#endif // STBI_THREAD_LOCAL + +static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) +{ + memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields + ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed + ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order + ri->num_channels = 0; + + // test the formats with a very explicit header first (at least a FOURCC + // or distinctive magic number first) + #ifndef STBI_NO_PNG + if (stbi__png_test(s)) return stbi__png_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_BMP + if (stbi__bmp_test(s)) return stbi__bmp_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_GIF + if (stbi__gif_test(s)) return stbi__gif_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_PSD + if (stbi__psd_test(s)) return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc); + #else + STBI_NOTUSED(bpc); + #endif + #ifndef STBI_NO_PIC + if (stbi__pic_test(s)) return stbi__pic_load(s,x,y,comp,req_comp, ri); + #endif + + // then the formats that can end up attempting to load with just 1 or 2 + // bytes matching expectations; these are prone to false positives, so + // try them later + #ifndef STBI_NO_JPEG + if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_PNM + if (stbi__pnm_test(s)) return stbi__pnm_load(s,x,y,comp,req_comp, ri); + #endif + + #ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri); + return 
stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); + } + #endif + + #ifndef STBI_NO_TGA + // test tga last because it's a crappy test! + if (stbi__tga_test(s)) + return stbi__tga_load(s,x,y,comp,req_comp, ri); + #endif + + return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt"); +} + +static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels) +{ + int i; + int img_len = w * h * channels; + stbi_uc *reduced; + + reduced = (stbi_uc *) stbi__malloc(img_len); + if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling + + STBI_FREE(orig); + return reduced; +} + +static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels) +{ + int i; + int img_len = w * h * channels; + stbi__uint16 *enlarged; + + enlarged = (stbi__uint16 *) stbi__malloc(img_len*2); + if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff + + STBI_FREE(orig); + return enlarged; +} + +static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel) +{ + int row; + size_t bytes_per_row = (size_t)w * bytes_per_pixel; + stbi_uc temp[2048]; + stbi_uc *bytes = (stbi_uc *)image; + + for (row = 0; row < (h>>1); row++) { + stbi_uc *row0 = bytes + row*bytes_per_row; + stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row; + // swap row0 with row1 + size_t bytes_left = bytes_per_row; + while (bytes_left) { + size_t bytes_copy = (bytes_left < sizeof(temp)) ? 
bytes_left : sizeof(temp); + memcpy(temp, row0, bytes_copy); + memcpy(row0, row1, bytes_copy); + memcpy(row1, temp, bytes_copy); + row0 += bytes_copy; + row1 += bytes_copy; + bytes_left -= bytes_copy; + } + } +} + +#ifndef STBI_NO_GIF +static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel) +{ + int slice; + int slice_size = w * h * bytes_per_pixel; + + stbi_uc *bytes = (stbi_uc *)image; + for (slice = 0; slice < z; ++slice) { + stbi__vertical_flip(bytes, w, h, bytes_per_pixel); + bytes += slice_size; + } +} +#endif + +static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + stbi__result_info ri; + void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 8) { + result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 8; + } + + // @TODO: move stbi__convert_format to here + + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc)); + } + + return (unsigned char *) result; +} + +static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + stbi__result_info ri; + void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 16) { + result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? 
*comp : req_comp); + ri.bits_per_channel = 16; + } + + // @TODO: move stbi__convert_format16 to here + // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision + + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16)); + } + + return (stbi__uint16 *) result; +} + +#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR) +static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp) +{ + if (stbi__vertically_flip_on_load && result != NULL) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(float)); + } +} +#endif + +#ifndef STBI_NO_STDIO + +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) +STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide); +STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default); +#endif + +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) +STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) +{ + return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); +} +#endif + +static FILE *stbi__fopen(char const *filename, char const *mode) +{ + FILE *f; +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) + wchar_t wMode[64]; + wchar_t wFilename[1024]; + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) + return 0; + + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode))) + return 0; + +#if defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != _wfopen_s(&f, wFilename, wMode)) + f = 0; +#else + f = 
_wfopen(wFilename, wMode); +#endif + +#elif defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != fopen_s(&f, filename, mode)) + f=0; +#else + f = fopen(filename, mode); +#endif + return f; +} + + +STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + unsigned char *result; + if (!f) return stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file(f,x,y,comp,req_comp); + fclose(f); + return result; +} + +STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *result; + stbi__context s; + stbi__start_file(&s,f); + result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; +} + +STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi__uint16 *result; + stbi__context s; + stbi__start_file(&s,f); + result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; +} + +STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + stbi__uint16 *result; + if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file_16(f,x,y,comp,req_comp); + fclose(f); + return result; +} + + +#endif //!STBI_NO_STDIO + +STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); +} + +STBIDEF stbi_us 
*stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); + return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); +} + +STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); +} + +STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); +} + +#ifndef STBI_NO_GIF +STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp) +{ + unsigned char *result; + stbi__context s; + stbi__start_mem(&s,buffer,len); + + result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp); + if (stbi__vertically_flip_on_load) { + stbi__vertical_flip_slices( result, *x, *y, *z, *comp ); + } + + return result; +} +#endif + +#ifndef STBI_NO_LINEAR +static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *data; + #ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + stbi__result_info ri; + float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri); + if (hdr_data) + stbi__float_postprocess(hdr_data,x,y,comp,req_comp); + return hdr_data; + } + #endif + data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp); + if (data) + return stbi__ldr_to_hdr(data, *x, *y, req_comp ? 
req_comp : *comp); + return stbi__errpf("unknown image type", "Image not of any known type, or corrupt"); +} + +STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} + +STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} + +#ifndef STBI_NO_STDIO +STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + float *result; + FILE *f = stbi__fopen(filename, "rb"); + if (!f) return stbi__errpf("can't fopen", "Unable to open file"); + result = stbi_loadf_from_file(f,x,y,comp,req_comp); + fclose(f); + return result; +} + +STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_file(&s,f); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} +#endif // !STBI_NO_STDIO + +#endif // !STBI_NO_LINEAR + +// these is-hdr-or-not is defined independent of whether STBI_NO_LINEAR is +// defined, for API simplicity; if STBI_NO_LINEAR is defined, it always +// reports false! 
+ +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len) +{ + #ifndef STBI_NO_HDR + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__hdr_test(&s); + #else + STBI_NOTUSED(buffer); + STBI_NOTUSED(len); + return 0; + #endif +} + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_is_hdr (char const *filename) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result=0; + if (f) { + result = stbi_is_hdr_from_file(f); + fclose(f); + } + return result; +} + +STBIDEF int stbi_is_hdr_from_file(FILE *f) +{ + #ifndef STBI_NO_HDR + long pos = ftell(f); + int res; + stbi__context s; + stbi__start_file(&s,f); + res = stbi__hdr_test(&s); + fseek(f, pos, SEEK_SET); + return res; + #else + STBI_NOTUSED(f); + return 0; + #endif +} +#endif // !STBI_NO_STDIO + +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user) +{ + #ifndef STBI_NO_HDR + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__hdr_test(&s); + #else + STBI_NOTUSED(clbk); + STBI_NOTUSED(user); + return 0; + #endif +} + +#ifndef STBI_NO_LINEAR +static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f; + +STBIDEF void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; } +STBIDEF void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; } +#endif + +static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f; + +STBIDEF void stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; } +STBIDEF void stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; } + + +////////////////////////////////////////////////////////////////////////////// +// +// Common code used by all image loaders +// + +enum +{ + STBI__SCAN_load=0, + STBI__SCAN_type, + STBI__SCAN_header +}; + +static void stbi__refill_buffer(stbi__context *s) +{ + int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen); + s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original); + if (n == 0) { + // at end of 
file, treat same as if from memory, but need to handle case + // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file + s->read_from_callbacks = 0; + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start+1; + *s->img_buffer = 0; + } else { + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + n; + } +} + +stbi_inline static stbi_uc stbi__get8(stbi__context *s) +{ + if (s->img_buffer < s->img_buffer_end) + return *s->img_buffer++; + if (s->read_from_callbacks) { + stbi__refill_buffer(s); + return *s->img_buffer++; + } + return 0; +} + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +stbi_inline static int stbi__at_eof(stbi__context *s) +{ + if (s->io.read) { + if (!(s->io.eof)(s->io_user_data)) return 0; + // if feof() is true, check if buffer = end + // special case: we've only got the special 0 character at the end + if (s->read_from_callbacks == 0) return 1; + } + + return s->img_buffer >= s->img_buffer_end; +} +#endif + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) +// nothing +#else +static void stbi__skip(stbi__context *s, int n) +{ + if (n == 0) return; // already there! 
+ if (n < 0) { + s->img_buffer = s->img_buffer_end; + return; + } + if (s->io.read) { + int blen = (int) (s->img_buffer_end - s->img_buffer); + if (blen < n) { + s->img_buffer = s->img_buffer_end; + (s->io.skip)(s->io_user_data, n - blen); + return; + } + } + s->img_buffer += n; +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM) +// nothing +#else +static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n) +{ + if (s->io.read) { + int blen = (int) (s->img_buffer_end - s->img_buffer); + if (blen < n) { + int res, count; + + memcpy(buffer, s->img_buffer, blen); + + count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen); + res = (count == (n-blen)); + s->img_buffer = s->img_buffer_end; + return res; + } + } + + if (s->img_buffer+n <= s->img_buffer_end) { + memcpy(buffer, s->img_buffer, n); + s->img_buffer += n; + return 1; + } else + return 0; +} +#endif + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else +static int stbi__get16be(stbi__context *s) +{ + int z = stbi__get8(s); + return (z << 8) + stbi__get8(s); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else +static stbi__uint32 stbi__get32be(stbi__context *s) +{ + stbi__uint32 z = stbi__get16be(s); + return (z << 16) + stbi__get16be(s); +} +#endif + +#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) +// nothing +#else +static int stbi__get16le(stbi__context *s) +{ + int z = stbi__get8(s); + return z + (stbi__get8(s) << 8); +} +#endif + +#ifndef STBI_NO_BMP +static stbi__uint32 stbi__get32le(stbi__context *s) +{ + stbi__uint32 z = stbi__get16le(s); + z += (stbi__uint32)stbi__get16le(s) << 16; + return z; +} +#endif + +#define STBI__BYTECAST(x) ((stbi_uc) ((x) & 255)) // truncate int to byte without warnings + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && 
defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +////////////////////////////////////////////////////////////////////////////// +// +// generic converter from built-in img_n to req_comp +// individual types do this automatically as much as possible (e.g. jpeg +// does all cases internally since it needs to colorspace convert anyway, +// and it never has alpha, so very few cases ). png can automatically +// interleave an alpha=255 channel, but falls back to this for other cases +// +// assume data buffer is malloced, so malloc a new one and free that one +// only failure mode is malloc failing + +static stbi_uc stbi__compute_y(int r, int g, int b) +{ + return (stbi_uc) (((r*77) + (g*150) + (29*b)) >> 8); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y) +{ + int i,j; + unsigned char *good; + + if (req_comp == img_n) return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0); + if (good == NULL) { + STBI_FREE(data); + return stbi__errpuc("outofmem", "Out of memory"); + } + + for (j=0; j < (int) y; ++j) { + unsigned char *src = data + j * x * img_n ; + unsigned char *dest = good + j * x * req_comp; + + #define STBI__COMBO(a,b) ((a)*8+(b)) + #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255; } break; + STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + 
STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255; } break; + STBI__CASE(2,1) { dest[0]=src[0]; } break; + STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1]; } break; + STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255; } break; + STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; + STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255; } break; + STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; + STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break; + STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; + default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion"); + } + #undef STBI__CASE + } + + STBI_FREE(data); + return good; +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else +static stbi__uint16 stbi__compute_y_16(int r, int g, int b) +{ + return (stbi__uint16) (((r*77) + (g*150) + (29*b)) >> 8); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else +static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y) +{ + int i,j; + stbi__uint16 *good; + + if (req_comp == img_n) return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2); + if (good == NULL) { + STBI_FREE(data); + return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); + } + + for (j=0; j < (int) y; ++j) { + stbi__uint16 *src = data + j * x * img_n ; + stbi__uint16 *dest = good + j * x * req_comp; + + #define STBI__COMBO(a,b) ((a)*8+(b)) + #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp 
components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff; } break; + STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff; } break; + STBI__CASE(2,1) { dest[0]=src[0]; } break; + STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1]; } break; + STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff; } break; + STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; + STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break; + STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; + STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break; + STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; + default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion"); + } + #undef STBI__CASE + } + + STBI_FREE(data); + return good; +} +#endif + +#ifndef STBI_NO_LINEAR +static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp) +{ + int i,k,n; + float *output; + if (!data) return NULL; + output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); } + // compute number of non-alpha components + if (comp & 1) n = comp; else n = comp-1; + for (i=0; i < x*y; ++i) { + for (k=0; k < n; ++k) { + output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale); + } + } + if (n < comp) { + for (i=0; i < x*y; ++i) { + output[i*comp + n] = data[i*comp + n]/255.0f; + } + } + STBI_FREE(data); + return output; +} +#endif + +#ifndef STBI_NO_HDR +#define 
stbi__float2int(x) ((int) (x)) +static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp) +{ + int i,k,n; + stbi_uc *output; + if (!data) return NULL; + output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); } + // compute number of non-alpha components + if (comp & 1) n = comp; else n = comp-1; + for (i=0; i < x*y; ++i) { + for (k=0; k < n; ++k) { + float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f; + if (z < 0) z = 0; + if (z > 255) z = 255; + output[i*comp + k] = (stbi_uc) stbi__float2int(z); + } + if (k < comp) { + float z = data[i*comp+k] * 255 + 0.5f; + if (z < 0) z = 0; + if (z > 255) z = 255; + output[i*comp + k] = (stbi_uc) stbi__float2int(z); + } + } + STBI_FREE(data); + return output; +} +#endif + +////////////////////////////////////////////////////////////////////////////// +// +// "baseline" JPEG/JFIF decoder +// +// simple implementation +// - doesn't support delayed output of y-dimension +// - simple interface (only one output format: 8-bit interleaved RGB) +// - doesn't try to recover corrupt jpegs +// - doesn't allow partial loading, loading multiple at once +// - still fast on x86 (copying globals into locals doesn't help x86) +// - allocates lots of intermediate memory (full size of all components) +// - non-interleaved case requires this anyway +// - allows good upsampling (see next) +// high-quality +// - upsampled channels are bilinearly interpolated, even across blocks +// - quality integer IDCT derived from IJG's 'slow' +// performance +// - fast huffman; reasonable integer IDCT +// - some SIMD kernels for common paths on targets with SSE2/NEON +// - uses a lot of intermediate memory, could cache poorly + +#ifndef STBI_NO_JPEG + +// huffman decoding acceleration +#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache + +typedef struct +{ + stbi_uc fast[1 << FAST_BITS]; + // 
weirdly, repacking this into AoS is a 10% speed loss, instead of a win + stbi__uint16 code[256]; + stbi_uc values[256]; + stbi_uc size[257]; + unsigned int maxcode[18]; + int delta[17]; // old 'firstsymbol' - old 'firstcode' +} stbi__huffman; + +typedef struct +{ + stbi__context *s; + stbi__huffman huff_dc[4]; + stbi__huffman huff_ac[4]; + stbi__uint16 dequant[4][64]; + stbi__int16 fast_ac[4][1 << FAST_BITS]; + +// sizes for components, interleaved MCUs + int img_h_max, img_v_max; + int img_mcu_x, img_mcu_y; + int img_mcu_w, img_mcu_h; + +// definition of jpeg image component + struct + { + int id; + int h,v; + int tq; + int hd,ha; + int dc_pred; + + int x,y,w2,h2; + stbi_uc *data; + void *raw_data, *raw_coeff; + stbi_uc *linebuf; + short *coeff; // progressive only + int coeff_w, coeff_h; // number of 8x8 coefficient blocks + } img_comp[4]; + + stbi__uint32 code_buffer; // jpeg entropy-coded buffer + int code_bits; // number of valid bits + unsigned char marker; // marker seen while filling entropy buffer + int nomore; // flag if we saw a marker so must stop + + int progressive; + int spec_start; + int spec_end; + int succ_high; + int succ_low; + int eob_run; + int jfif; + int app14_color_transform; // Adobe APP14 tag + int rgb; + + int scan_n, order[4]; + int restart_interval, todo; + +// kernels + void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]); + void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step); + stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs); +} stbi__jpeg; + +static int stbi__build_huffman(stbi__huffman *h, int *count) +{ + int i,j,k=0; + unsigned int code; + // build size list for each symbol (from JPEG spec) + for (i=0; i < 16; ++i) { + for (j=0; j < count[i]; ++j) { + h->size[k++] = (stbi_uc) (i+1); + if(k >= 257) return stbi__err("bad size list","Corrupt JPEG"); + } + } + h->size[k] = 0; + + // compute 
actual symbols (from jpeg spec) + code = 0; + k = 0; + for(j=1; j <= 16; ++j) { + // compute delta to add to code to compute symbol id + h->delta[j] = k - code; + if (h->size[k] == j) { + while (h->size[k] == j) + h->code[k++] = (stbi__uint16) (code++); + if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG"); + } + // compute largest code + 1 for this size, preshifted as needed later + h->maxcode[j] = code << (16-j); + code <<= 1; + } + h->maxcode[j] = 0xffffffff; + + // build non-spec acceleration table; 255 is flag for not-accelerated + memset(h->fast, 255, 1 << FAST_BITS); + for (i=0; i < k; ++i) { + int s = h->size[i]; + if (s <= FAST_BITS) { + int c = h->code[i] << (FAST_BITS-s); + int m = 1 << (FAST_BITS-s); + for (j=0; j < m; ++j) { + h->fast[c+j] = (stbi_uc) i; + } + } + } + return 1; +} + +// build a table that decodes both magnitude and value of small ACs in +// one go. +static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h) +{ + int i; + for (i=0; i < (1 << FAST_BITS); ++i) { + stbi_uc fast = h->fast[i]; + fast_ac[i] = 0; + if (fast < 255) { + int rs = h->values[fast]; + int run = (rs >> 4) & 15; + int magbits = rs & 15; + int len = h->size[fast]; + + if (magbits && len + magbits <= FAST_BITS) { + // magnitude code followed by receive_extend code + int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); + int m = 1 << (magbits - 1); + if (k < m) k += (~0U << magbits) + 1; + // if the result is small enough, we can fit it in fast_ac table + if (k >= -128 && k <= 127) + fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits)); + } + } + } +} + +static void stbi__grow_buffer_unsafe(stbi__jpeg *j) +{ + do { + unsigned int b = j->nomore ? 
0 : stbi__get8(j->s); + if (b == 0xff) { + int c = stbi__get8(j->s); + while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes + if (c != 0) { + j->marker = (unsigned char) c; + j->nomore = 1; + return; + } + } + j->code_buffer |= b << (24 - j->code_bits); + j->code_bits += 8; + } while (j->code_bits <= 24); +} + +// (1 << n) - 1 +static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535}; + +// decode a jpeg huffman value from the bitstream +stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) +{ + unsigned int temp; + int c,k; + + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + + // look at the top FAST_BITS and determine what symbol ID it is, + // if the code is <= FAST_BITS + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + k = h->fast[c]; + if (k < 255) { + int s = h->size[k]; + if (s > j->code_bits) + return -1; + j->code_buffer <<= s; + j->code_bits -= s; + return h->values[k]; + } + + // naive test is to shift the code_buffer down so k bits are + // valid, then test against maxcode. To speed this up, we've + // preshifted maxcode left so that it has (16-k) 0s at the + // end; in other words, regardless of the number of bits, it + // wants to be compared against something shifted to have 16; + // that way we don't need to shift inside the loop. + temp = j->code_buffer >> 16; + for (k=FAST_BITS+1 ; ; ++k) + if (temp < h->maxcode[k]) + break; + if (k == 17) { + // error! code not found + j->code_bits -= 16; + return -1; + } + + if (k > j->code_bits) + return -1; + + // convert the huffman code to the symbol id + c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k]; + if(c < 0 || c >= 256) // symbol id out of bounds! 
+ return -1; + STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]); + + // convert the id to a symbol + j->code_bits -= k; + j->code_buffer <<= k; + return h->values[c]; +} + +// bias[n] = (-1<code_bits < n) stbi__grow_buffer_unsafe(j); + if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing + + sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative) + k = stbi_lrot(j->code_buffer, n); + j->code_buffer = k & ~stbi__bmask[n]; + k &= stbi__bmask[n]; + j->code_bits -= n; + return k + (stbi__jbias[n] & (sgn - 1)); +} + +// get some unsigned bits +stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n) +{ + unsigned int k; + if (j->code_bits < n) stbi__grow_buffer_unsafe(j); + if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing + k = stbi_lrot(j->code_buffer, n); + j->code_buffer = k & ~stbi__bmask[n]; + k &= stbi__bmask[n]; + j->code_bits -= n; + return k; +} + +stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j) +{ + unsigned int k; + if (j->code_bits < 1) stbi__grow_buffer_unsafe(j); + if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s intead of continuing + k = j->code_buffer; + j->code_buffer <<= 1; + --j->code_bits; + return k & 0x80000000; +} + +// given a value that's at position X in the zigzag stream, +// where does it appear in the 8x8 matrix coded as row-major? 
+// zigzag -> natural (row-major) order index map, padded with trailing 63s
+// so that corrupt streams which overrun the coefficient index k still read
+// in bounds ("let corrupt input sample past end" below).
+static const stbi_uc stbi__jpeg_dezigzag[64+15] =
+{
+    0,  1,  8, 16,  9,  2,  3, 10,
+   17, 24, 32, 25, 18, 11,  4,  5,
+   12, 19, 26, 33, 40, 48, 41, 34,
+   27, 20, 13,  6,  7, 14, 21, 28,
+   35, 42, 49, 56, 57, 50, 43, 36,
+   29, 22, 15, 23, 30, 37, 44, 51,
+   58, 59, 52, 45, 38, 31, 39, 46,
+   53, 60, 61, 54, 47, 55, 62, 63,
+   // let corrupt input sample past end
+   63, 63, 63, 63, 63, 63, 63, 63,
+   63, 63, 63, 63, 63, 63, 63
+};
+
+// decode one 64-entry block--
+// Baseline (sequential) scan: decodes one DC delta via 'hdc' (predicted
+// against img_comp[b].dc_pred), then AC run/size pairs via 'hac', using the
+// fast-AC lookup table 'fac' to short-circuit short codes. Coefficients are
+// dequantized with 'dequant' and written de-zigzagged into data[64].
+// Returns 1 on success, 0 (via stbi__err) on corrupt input.
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
+{
+   int diff,dc,k;
+   int t;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+   t = stbi__jpeg_huff_decode(j, hdc);
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
+
+   // 0 all the ac values now so we can do it 32-bits at a time
+   memset(data,0,64*sizeof(data[0]));
+
+   diff = t ? stbi__extend_receive(j, t) : 0;
+   // overflow checks keep the DC predictor and the dc*dequant product in range
+   if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG");
+   dc = j->img_comp[b].dc_pred + diff;
+   j->img_comp[b].dc_pred = dc;
+   if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+   data[0] = (short) (dc * dequant[0]);
+
+   // decode AC components, see JPEG spec
+   k = 1;
+   do {
+      unsigned int zig;
+      int c,r,s;
+      if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+      r = fac[c];
+      if (r) { // fast-AC path
+         // fac entry packs: low 4 bits = combined code+value length,
+         // bits 4..7 = zero run, bits 8.. = the decoded coefficient value
+         k += (r >> 4) & 15; // run
+         s = r & 15; // combined length
+         if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
+         j->code_buffer <<= s;
+         j->code_bits -= s;
+         // decode into unzigzag'd location
+         zig = stbi__jpeg_dezigzag[k++];
+         data[zig] = (short) ((r >> 8) * dequant[zig]);
+      } else {
+         int rs = stbi__jpeg_huff_decode(j, hac);
+         if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+         s = rs & 15;
+         r = rs >> 4;
+         if (s == 0) {
+            if (rs != 0xf0) break; // end block
+            k += 16; // rs == 0xf0: ZRL, run of 16 zeros
+         } else {
+            k += r;
+            // decode into unzigzag'd location
+            zig = stbi__jpeg_dezigzag[k++];
+            data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]);
+         }
+      }
+   } while (k < 64);
+   return 1;
+}
+
+// Progressive-mode DC pass for one block. First scan (succ_high == 0):
+// decodes the DC delta and stores it scaled by 1 << succ_low; refinement
+// scans add one more bit of precision. Returns 1 on success, 0 on error.
+static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b)
+{
+   int diff,dc;
+   int t;
+   if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+
+   if (j->succ_high == 0) {
+      // first scan for DC coefficient, must be first
+      memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
+      t = stbi__jpeg_huff_decode(j, hdc);
+      if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+      diff = t ? stbi__extend_receive(j, t) : 0;
+
+      if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG");
+      dc = j->img_comp[b].dc_pred + diff;
+      j->img_comp[b].dc_pred = dc;
+      if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+      data[0] = (short) (dc * (1 << j->succ_low));
+   } else {
+      // refinement scan for DC coefficient
+      if (stbi__jpeg_get_bit(j))
+         data[0] += (short) (1 << j->succ_low);
+   }
+   return 1;
+}
+
+// @OPTIMIZE: store non-zigzagged during the decode passes,
+// and only de-zigzag when dequantizing
+// Progressive-mode AC pass for one block, covering both the first scan
+// (succ_high == 0) and the bit-refinement scan; eob_run carries end-of-band
+// runs shared across consecutive blocks. Returns 1 on success, 0 on error.
+static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac)
+{
+   int k;
+   if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+   if (j->succ_high == 0) {
+      int shift = j->succ_low;
+
+      if (j->eob_run) {
+         // inside an end-of-band run: this whole block has no new AC bits
+         --j->eob_run;
+         return 1;
+      }
+
+      k = j->spec_start;
+      do {
+         unsigned int zig;
+         int c,r,s;
+         if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+         r = fac[c];
+         if (r) { // fast-AC path
+            k += (r >> 4) & 15; // run
+            s = r & 15; // combined length
+            if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
+            j->code_buffer <<= s;
+            j->code_bits -= s;
+            zig = stbi__jpeg_dezigzag[k++];
+            data[zig] = (short) ((r >> 8) * (1 << shift));
+         } else {
+            int rs = stbi__jpeg_huff_decode(j, hac);
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  // EOBn: end-of-band run of (1 << r) + extra bits blocks
+                  j->eob_run = (1 << r);
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  --j->eob_run;
+                  break;
+               }
+               k += 16; // ZRL
+            } else {
+               k += r;
+               zig = stbi__jpeg_dezigzag[k++];
+               data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift));
+            }
+         }
+      } while (k <= j->spec_end);
+   } else {
+      // refinement scan for these AC coefficients
+
+      short bit = (short) (1 << j->succ_low);
+
+      if (j->eob_run) {
+         // within an EOB run: only already-nonzero coefficients get a
+         // correction bit, read one per nonzero coefficient in the band
+         --j->eob_run;
+         for (k = j->spec_start; k <= j->spec_end; ++k) {
+            short *p = &data[stbi__jpeg_dezigzag[k]];
+            if (*p != 0)
+               if (stbi__jpeg_get_bit(j))
+                  if ((*p & bit)==0) {
+                     // move the magnitude away from zero by one new bit
+                     if (*p > 0)
+                        *p += bit;
+                     else
+                        *p -= bit;
+                  }
+         }
+      } else {
+         k = j->spec_start;
+         do {
+            int r,s;
+            int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  j->eob_run = (1 << r) - 1;
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  r = 64; // force end of block
+               } else {
+                  // r=15 s=0 should write 16 0s, so we just do
+                  // a run of 15 0s and then write s (which is 0),
+                  // so we don't have to do anything special here
+               }
+            } else {
+               if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG");
+               // sign bit
+               if (stbi__jpeg_get_bit(j))
+                  s = bit;
+               else
+                  s = -bit;
+            }
+
+            // advance by r
+            // skips r zero coefficients, reading a correction bit for each
+            // nonzero coefficient passed over, then writes the new value
+            while (k <= j->spec_end) {
+               short *p = &data[stbi__jpeg_dezigzag[k++]];
+               if (*p != 0) {
+                  if (stbi__jpeg_get_bit(j))
+                     if ((*p & bit)==0) {
+                        if (*p > 0)
+                           *p += bit;
+                        else
+                           *p -= bit;
+                     }
+               } else {
+                  if (r == 0) {
+                     *p = (short) s;
+                     break;
+                  }
+                  --r;
+               }
+            }
+         } while (k <= j->spec_end);
+      }
+   }
+   return 1;
+}
+
+// take a -128..127 value and stbi__clamp it and convert to 0..255
+stbi_inline static stbi_uc stbi__clamp(int x)
+{
+   // trick to use a single test to catch both cases
+   // (negative x wraps to a huge unsigned value, so one compare suffices)
+   if ((unsigned int) x > 255) {
+      if (x < 0) return 0;
+      if (x > 255) return 255;
+   }
+   return (stbi_uc) x;
+}
+
+// fixed-point conversion: float constant scaled by 1<<12, rounded
+#define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
+// integer value scaled by the same 1<<12 factor
+#define stbi__fsh(x)  ((x) * 4096)
+
+// derived from jidctint -- DCT_ISLOW
+// one 8-point 1D IDCT in fixed point; declares its own temporaries and
+// leaves results in x0..x3 (even part) and t0..t3 (odd part). Used for
+// both the column and row passes of stbi__idct_block below.
+#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
+   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
+   p2 = s2;                                    \
+   p3 = s6;                                    \
+   p1 = (p2+p3) * stbi__f2f(0.5411961f);       \
+   t2 = p1 + p3*stbi__f2f(-1.847759065f);      \
+   t3 = p1 + p2*stbi__f2f( 0.765366865f);      \
+   p2 = s0;                                    \
+   p3 = s4;                                    \
+   t0 = stbi__fsh(p2+p3);                      \
+   t1 = stbi__fsh(p2-p3);                      \
+   x0 = t0+t3;                                 \
+   x3 = t0-t3;                                 \
+   x1 = t1+t2;                                 \
+   x2 = t1-t2;                                 \
+   t0 = s7;                                    \
+   t1 = s5;                                    \
+   t2 = s3;                                    \
+   t3 = s1;                                    \
+   p3 = t0+t2;                                 \
+   p4 = t1+t3;                                 \
+   p1 = t0+t3;                                 \
+   p2 = t1+t2;                                 \
+   p5 = (p3+p4)*stbi__f2f( 1.175875602f);      \
+   t0 = t0*stbi__f2f( 0.298631336f);           \
+   t1 = t1*stbi__f2f( 2.053119869f);           \
+   t2 = t2*stbi__f2f( 3.072711026f);           \
+   t3 = t3*stbi__f2f( 1.501321110f);           \
+   p1 = p5 + p1*stbi__f2f(-0.899976223f);      \
+   p2 = p5 + p2*stbi__f2f(-2.562915447f);      \
+   p3 = p3*stbi__f2f(-1.961570560f);           \
+   p4 = p4*stbi__f2f(-0.390180644f);           \
+   t3 += p1+p4;                                \
+   t2 += p2+p3;                                \
+   t1 += p2+p4;                                \
+   t0 += p1+p3;
+
+// Generic (non-SIMD) fixed-point 8x8 inverse DCT: column pass into val[],
+// then row pass writing clamped 0..255 pixels to 'out' with row pitch
+// 'out_stride' (8 bytes written per row).
+static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
+{
+   int i,val[64],*v=val;
+   stbi_uc *o;
+   short *d = data;
+
+   // columns
+   for (i=0; i < 8; ++i,++d, ++v) {
+      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
+      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
+           && d[40]==0 && d[48]==0 && d[56]==0) {
+         //    no shortcut                 0     seconds
+         //    (1|2|3|4|5|6|7)==0          0     seconds
+         //    all separate               -0.047 seconds
+         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
+         int dcterm = d[0]*4;
+         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
+      } else {
+         STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
+         // constants scaled things up by 1<<12; let's bring them back
+         // down, but keep 2 extra bits of precision
+         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
+         v[ 0] = (x0+t3) >> 10;
+         v[56] = (x0-t3) >> 10;
+         v[ 8] = (x1+t2) >> 10;
+         v[48] = (x1-t2) >> 10;
+         v[16] = (x2+t1) >> 10;
+         v[40] = (x2-t1) >> 10;
+         v[24] = (x3+t0) >> 10;
+         v[32] = (x3-t0) >> 10;
+      }
+   }
+
+   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) {
+      // no fast case since the first 1D IDCT spread components out
+      STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
+      // constants scaled things up by 1<<12, plus we had 1<<2 from first
+      // loop, plus horizontal and vertical each scale by sqrt(8) so together
+      // we've got an extra 1<<3, so 1<<17 total we need to remove.
+      // so we want to round that, which means adding 0.5 * 1<<17,
+      // aka 65536. Also, we'll end up with -128 to 127 that we want
+      // to encode as 0..255 by adding 128, so we'll add that before the shift
+      x0 += 65536 + (128<<17);
+      x1 += 65536 + (128<<17);
+      x2 += 65536 + (128<<17);
+      x3 += 65536 + (128<<17);
+      // tried computing the shifts into temps, or'ing the temps to see
+      // if any were out of range, but that was slower
+      o[0] = stbi__clamp((x0+t3) >> 17);
+      o[7] = stbi__clamp((x0-t3) >> 17);
+      o[1] = stbi__clamp((x1+t2) >> 17);
+      o[6] = stbi__clamp((x1-t2) >> 17);
+      o[2] = stbi__clamp((x2+t1) >> 17);
+      o[5] = stbi__clamp((x2-t1) >> 17);
+      o[3] = stbi__clamp((x3+t0) >> 17);
+      o[4] = stbi__clamp((x3-t0) >> 17);
+   }
+}
+
+#ifdef STBI_SSE2
+// sse2 integer IDCT. not the fastest possible implementation but it
+// produces bit-identical results to the generic C version so it's
+// fully "transparent".
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) +{ + // This is constructed to match our regular (generic) integer IDCT exactly. + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i tmp; + + // dot product constant: even elems=x, odd elems=y + #define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y)) + + // out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) + // out(1) = c1[even]*x + c1[odd]*y + #define dct_rot(out0,out1, x,y,c0,c1) \ + __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \ + __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \ + __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ + __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ + __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ + __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) + + // out = in << 12 (in 16-bit, out 32-bit) + #define dct_widen(out, in) \ + __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ + __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) + + // wide add + #define dct_wadd(out, a, b) \ + __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_add_epi32(a##_h, b##_h) + + // wide sub + #define dct_wsub(out, a, b) \ + __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) + + // butterfly a/b, add bias, then shift by "s" and pack + #define dct_bfly32o(out0, out1, a,b,bias,s) \ + { \ + __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ + __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ + dct_wadd(sum, abiased, b); \ + dct_wsub(dif, abiased, b); \ + out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ + out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ + } + + // 8-bit interleave step (for transposes) + #define dct_interleave8(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi8(a, b); \ + b = _mm_unpackhi_epi8(tmp, b) + + // 16-bit interleave step (for transposes) + 
#define dct_interleave16(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi16(a, b); \ + b = _mm_unpackhi_epi16(tmp, b) + + #define dct_pass(bias,shift) \ + { \ + /* even part */ \ + dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \ + __m128i sum04 = _mm_add_epi16(row0, row4); \ + __m128i dif04 = _mm_sub_epi16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \ + dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \ + __m128i sum17 = _mm_add_epi16(row1, row7); \ + __m128i sum35 = _mm_add_epi16(row3, row5); \ + dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \ + dct_wadd(x4, y0o, y4o); \ + dct_wadd(x5, y1o, y5o); \ + dct_wadd(x6, y2o, y5o); \ + dct_wadd(x7, y3o, y4o); \ + dct_bfly32o(row0,row7, x0,x7,bias,shift); \ + dct_bfly32o(row1,row6, x1,x6,bias,shift); \ + dct_bfly32o(row2,row5, x2,x5,bias,shift); \ + dct_bfly32o(row3,row4, x3,x4,bias,shift); \ + } + + __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f)); + __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f)); + __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f)); + __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f)); + __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f)); + __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f)); + __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f)); + __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f)); + + // rounding biases in column/row passes, see stbi__idct_block for explanation. 
+ __m128i bias_0 = _mm_set1_epi32(512); + __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17)); + + // load + row0 = _mm_load_si128((const __m128i *) (data + 0*8)); + row1 = _mm_load_si128((const __m128i *) (data + 1*8)); + row2 = _mm_load_si128((const __m128i *) (data + 2*8)); + row3 = _mm_load_si128((const __m128i *) (data + 3*8)); + row4 = _mm_load_si128((const __m128i *) (data + 4*8)); + row5 = _mm_load_si128((const __m128i *) (data + 5*8)); + row6 = _mm_load_si128((const __m128i *) (data + 6*8)); + row7 = _mm_load_si128((const __m128i *) (data + 7*8)); + + // column pass + dct_pass(bias_0, 10); + + { + // 16bit 8x8 transpose pass 1 + dct_interleave16(row0, row4); + dct_interleave16(row1, row5); + dct_interleave16(row2, row6); + dct_interleave16(row3, row7); + + // transpose pass 2 + dct_interleave16(row0, row2); + dct_interleave16(row1, row3); + dct_interleave16(row4, row6); + dct_interleave16(row5, row7); + + // transpose pass 3 + dct_interleave16(row0, row1); + dct_interleave16(row2, row3); + dct_interleave16(row4, row5); + dct_interleave16(row6, row7); + } + + // row pass + dct_pass(bias_1, 17); + + { + // pack + __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 + __m128i p1 = _mm_packus_epi16(row2, row3); + __m128i p2 = _mm_packus_epi16(row4, row5); + __m128i p3 = _mm_packus_epi16(row6, row7); + + // 8bit 8x8 transpose pass 1 + dct_interleave8(p0, p2); // a0e0a1e1... + dct_interleave8(p1, p3); // c0g0c1g1... + + // transpose pass 2 + dct_interleave8(p0, p1); // a0c0e0g0... + dct_interleave8(p2, p3); // b0d0f0h0... + + // transpose pass 3 + dct_interleave8(p0, p2); // a0b0c0d0... + dct_interleave8(p1, p3); // a4b4c4d4... 
+ + // store + _mm_storel_epi64((__m128i *) out, p0); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p2); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p1); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p3); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e)); + } + +#undef dct_const +#undef dct_rot +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_interleave8 +#undef dct_interleave16 +#undef dct_pass +} + +#endif // STBI_SSE2 + +#ifdef STBI_NEON + +// NEON integer IDCT. should produce bit-identical +// results to the generic C version. +static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) +{ + int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; + + int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f)); + int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f)); + int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f)); + int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f)); + int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f)); + int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f)); + int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f)); + int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f)); + int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f)); + int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f)); + int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f)); + int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f)); + +#define dct_long_mul(out, inq, coeff) \ + int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff) + +#define dct_long_mac(out, acc, inq, coeff) \ + int32x4_t out##_l = vmlal_s16(acc##_l, 
vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff) + +#define dct_widen(out, inq) \ + int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \ + int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12) + +// wide add +#define dct_wadd(out, a, b) \ + int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vaddq_s32(a##_h, b##_h) + +// wide sub +#define dct_wsub(out, a, b) \ + int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vsubq_s32(a##_h, b##_h) + +// butterfly a/b, then shift using "shiftop" by "s" and pack +#define dct_bfly32o(out0,out1, a,b,shiftop,s) \ + { \ + dct_wadd(sum, a, b); \ + dct_wsub(dif, a, b); \ + out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \ + out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \ + } + +#define dct_pass(shiftop, shift) \ + { \ + /* even part */ \ + int16x8_t sum26 = vaddq_s16(row2, row6); \ + dct_long_mul(p1e, sum26, rot0_0); \ + dct_long_mac(t2e, p1e, row6, rot0_1); \ + dct_long_mac(t3e, p1e, row2, rot0_2); \ + int16x8_t sum04 = vaddq_s16(row0, row4); \ + int16x8_t dif04 = vsubq_s16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + int16x8_t sum15 = vaddq_s16(row1, row5); \ + int16x8_t sum17 = vaddq_s16(row1, row7); \ + int16x8_t sum35 = vaddq_s16(row3, row5); \ + int16x8_t sum37 = vaddq_s16(row3, row7); \ + int16x8_t sumodd = vaddq_s16(sum17, sum35); \ + dct_long_mul(p5o, sumodd, rot1_0); \ + dct_long_mac(p1o, p5o, sum17, rot1_1); \ + dct_long_mac(p2o, p5o, sum35, rot1_2); \ + dct_long_mul(p3o, sum37, rot2_0); \ + dct_long_mul(p4o, sum15, rot2_1); \ + dct_wadd(sump13o, p1o, p3o); \ + dct_wadd(sump24o, p2o, p4o); \ + dct_wadd(sump23o, p2o, p3o); \ + dct_wadd(sump14o, p1o, p4o); \ + dct_long_mac(x4, sump13o, row7, rot3_0); \ + dct_long_mac(x5, sump24o, row5, rot3_1); \ + 
dct_long_mac(x6, sump23o, row3, rot3_2); \ + dct_long_mac(x7, sump14o, row1, rot3_3); \ + dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \ + dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \ + dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \ + dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \ + } + + // load + row0 = vld1q_s16(data + 0*8); + row1 = vld1q_s16(data + 1*8); + row2 = vld1q_s16(data + 2*8); + row3 = vld1q_s16(data + 3*8); + row4 = vld1q_s16(data + 4*8); + row5 = vld1q_s16(data + 5*8); + row6 = vld1q_s16(data + 6*8); + row7 = vld1q_s16(data + 7*8); + + // add DC bias + row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); + + // column pass + dct_pass(vrshrn_n_s32, 10); + + // 16bit 8x8 transpose + { +// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively. +// whether compilers actually get this is another story, sadly. +#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); } +#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); } + + // pass 1 + dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 + dct_trn16(row2, row3); + dct_trn16(row4, row5); + dct_trn16(row6, row7); + + // pass 2 + dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 + dct_trn32(row1, row3); + dct_trn32(row4, row6); + dct_trn32(row5, row7); + + // pass 3 + dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 + dct_trn64(row1, row5); + dct_trn64(row2, row6); + dct_trn64(row3, row7); + +#undef dct_trn16 +#undef dct_trn32 +#undef dct_trn64 + } + + // row pass + // vrshrn_n_s32 only supports shifts up to 16, we need + // 17. so do a non-rounding shift of 16 first then follow + // up with a rounding shift by 1. 
+ dct_pass(vshrn_n_s32, 16); + + { + // pack and round + uint8x8_t p0 = vqrshrun_n_s16(row0, 1); + uint8x8_t p1 = vqrshrun_n_s16(row1, 1); + uint8x8_t p2 = vqrshrun_n_s16(row2, 1); + uint8x8_t p3 = vqrshrun_n_s16(row3, 1); + uint8x8_t p4 = vqrshrun_n_s16(row4, 1); + uint8x8_t p5 = vqrshrun_n_s16(row5, 1); + uint8x8_t p6 = vqrshrun_n_s16(row6, 1); + uint8x8_t p7 = vqrshrun_n_s16(row7, 1); + + // again, these can translate into one instruction, but often don't. +#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); } +#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); } + + // sadly can't use interleaved stores here since we only write + // 8 bytes to each scan line! 
+ + // 8x8 8-bit transpose pass 1 + dct_trn8_8(p0, p1); + dct_trn8_8(p2, p3); + dct_trn8_8(p4, p5); + dct_trn8_8(p6, p7); + + // pass 2 + dct_trn8_16(p0, p2); + dct_trn8_16(p1, p3); + dct_trn8_16(p4, p6); + dct_trn8_16(p5, p7); + + // pass 3 + dct_trn8_32(p0, p4); + dct_trn8_32(p1, p5); + dct_trn8_32(p2, p6); + dct_trn8_32(p3, p7); + + // store + vst1_u8(out, p0); out += out_stride; + vst1_u8(out, p1); out += out_stride; + vst1_u8(out, p2); out += out_stride; + vst1_u8(out, p3); out += out_stride; + vst1_u8(out, p4); out += out_stride; + vst1_u8(out, p5); out += out_stride; + vst1_u8(out, p6); out += out_stride; + vst1_u8(out, p7); + +#undef dct_trn8_8 +#undef dct_trn8_16 +#undef dct_trn8_32 + } + +#undef dct_long_mul +#undef dct_long_mac +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_pass +} + +#endif // STBI_NEON + +#define STBI__MARKER_none 0xff +// if there's a pending marker from the entropy stream, return that +// otherwise, fetch from the stream and get a marker. if there's no +// marker, return 0xff, which is never a valid marker value +static stbi_uc stbi__get_marker(stbi__jpeg *j) +{ + stbi_uc x; + if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; } + x = stbi__get8(j->s); + if (x != 0xff) return STBI__MARKER_none; + while (x == 0xff) + x = stbi__get8(j->s); // consume repeated 0xff fill bytes + return x; +} + +// in each scan, we'll have scan_n components, and the order +// of the components is specified by order[] +#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7) + +// after a restart interval, stbi__jpeg_reset the entropy decoder and +// the dc prediction +static void stbi__jpeg_reset(stbi__jpeg *j) +{ + j->code_bits = 0; + j->code_buffer = 0; + j->nomore = 0; + j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0; + j->marker = STBI__MARKER_none; + j->todo = j->restart_interval ? 
j->restart_interval : 0x7fffffff; + j->eob_run = 0; + // no more than 1<<31 MCUs if no restart_interal? that's plenty safe, + // since we don't even allow 1<<30 pixels +} + +static int stbi__parse_entropy_coded_data(stbi__jpeg *z) +{ + stbi__jpeg_reset(z); + if (!z->progressive) { + if (z->scan_n == 1) { + int i,j; + STBI_SIMD_ALIGN(short, data[64]); + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + // if it's NOT a restart, then just bail, so we get corrupt data + // rather than no data + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } else { // interleaved + int i,j,k,x,y; + STBI_SIMD_ALIGN(short, data[64]); + for (j=0; j < z->img_mcu_y; ++j) { + for (i=0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... 
process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x)*8; + int y2 = (j*z->img_comp[n].v + y)*8; + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data); + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } + } else { + if (z->scan_n == 1) { + int i,j; + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + if (z->spec_start == 0) { + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } else { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) + return 0; + } + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } else { // interleaved + int 
i,j,k,x,y; + for (j=0; j < z->img_mcu_y; ++j) { + for (i=0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x); + int y2 = (j*z->img_comp[n].v + y); + short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } + } +} + +static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant) +{ + int i; + for (i=0; i < 64; ++i) + data[i] *= dequant[i]; +} + +static void stbi__jpeg_finish(stbi__jpeg *z) +{ + if (z->progressive) { + // dequantize and idct the data + int i,j,n; + for (n=0; n < z->s->img_n; ++n) { + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); + } + } + } + } +} + +static int stbi__process_marker(stbi__jpeg *z, int m) +{ + int L; + switch (m) { + case STBI__MARKER_none: // no marker found + return stbi__err("expected marker","Corrupt JPEG"); + + case 0xDD: // DRI - specify restart interval + if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG"); + z->restart_interval = 
stbi__get16be(z->s); + return 1; + + case 0xDB: // DQT - define quantization table + L = stbi__get16be(z->s)-2; + while (L > 0) { + int q = stbi__get8(z->s); + int p = q >> 4, sixteen = (p != 0); + int t = q & 15,i; + if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG"); + if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG"); + + for (i=0; i < 64; ++i) + z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s)); + L -= (sixteen ? 129 : 65); + } + return L==0; + + case 0xC4: // DHT - define huffman table + L = stbi__get16be(z->s)-2; + while (L > 0) { + stbi_uc *v; + int sizes[16],i,n=0; + int q = stbi__get8(z->s); + int tc = q >> 4; + int th = q & 15; + if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG"); + for (i=0; i < 16; ++i) { + sizes[i] = stbi__get8(z->s); + n += sizes[i]; + } + if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values! + L -= 17; + if (tc == 0) { + if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0; + v = z->huff_dc[th].values; + } else { + if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0; + v = z->huff_ac[th].values; + } + for (i=0; i < n; ++i) + v[i] = stbi__get8(z->s); + if (tc != 0) + stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); + L -= n; + } + return L==0; + } + + // check for comment block or APP blocks + if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { + L = stbi__get16be(z->s); + if (L < 2) { + if (m == 0xFE) + return stbi__err("bad COM len","Corrupt JPEG"); + else + return stbi__err("bad APP len","Corrupt JPEG"); + } + L -= 2; + + if (m == 0xE0 && L >= 5) { // JFIF APP0 segment + static const unsigned char tag[5] = {'J','F','I','F','\0'}; + int ok = 1; + int i; + for (i=0; i < 5; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 5; + if (ok) + z->jfif = 1; + } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment + static const unsigned char tag[6] = 
{'A','d','o','b','e','\0'}; + int ok = 1; + int i; + for (i=0; i < 6; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 6; + if (ok) { + stbi__get8(z->s); // version + stbi__get16be(z->s); // flags0 + stbi__get16be(z->s); // flags1 + z->app14_color_transform = stbi__get8(z->s); // color transform + L -= 6; + } + } + + stbi__skip(z->s, L); + return 1; + } + + return stbi__err("unknown marker","Corrupt JPEG"); +} + +// after we see SOS +static int stbi__process_scan_header(stbi__jpeg *z) +{ + int i; + int Ls = stbi__get16be(z->s); + z->scan_n = stbi__get8(z->s); + if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG"); + if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG"); + for (i=0; i < z->scan_n; ++i) { + int id = stbi__get8(z->s), which; + int q = stbi__get8(z->s); + for (which = 0; which < z->s->img_n; ++which) + if (z->img_comp[which].id == id) + break; + if (which == z->s->img_n) return 0; // no match + z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG"); + z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG"); + z->order[i] = which; + } + + { + int aa; + z->spec_start = stbi__get8(z->s); + z->spec_end = stbi__get8(z->s); // should be 63, but might be 0 + aa = stbi__get8(z->s); + z->succ_high = (aa >> 4); + z->succ_low = (aa & 15); + if (z->progressive) { + if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) + return stbi__err("bad SOS", "Corrupt JPEG"); + } else { + if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG"); + if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG"); + z->spec_end = 63; + } + } + + return 1; +} + +static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why) +{ + int i; + for (i=0; i < ncomp; ++i) { + if 
(z->img_comp[i].raw_data) { + STBI_FREE(z->img_comp[i].raw_data); + z->img_comp[i].raw_data = NULL; + z->img_comp[i].data = NULL; + } + if (z->img_comp[i].raw_coeff) { + STBI_FREE(z->img_comp[i].raw_coeff); + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].coeff = 0; + } + if (z->img_comp[i].linebuf) { + STBI_FREE(z->img_comp[i].linebuf); + z->img_comp[i].linebuf = NULL; + } + } + return why; +} + +static int stbi__process_frame_header(stbi__jpeg *z, int scan) +{ + stbi__context *s = z->s; + int Lf,p,i,q, h_max=1,v_max=1,c; + Lf = stbi__get16be(s); if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG + p = stbi__get8(s); if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline + s->img_y = stbi__get16be(s); if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG + s->img_x = stbi__get16be(s); if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + c = stbi__get8(s); + if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG"); + s->img_n = c; + for (i=0; i < c; ++i) { + z->img_comp[i].data = NULL; + z->img_comp[i].linebuf = NULL; + } + + if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG"); + + z->rgb = 0; + for (i=0; i < s->img_n; ++i) { + static const unsigned char rgb[3] = { 'R', 'G', 'B' }; + z->img_comp[i].id = stbi__get8(s); + if (s->img_n == 3 && z->img_comp[i].id == rgb[i]) + ++z->rgb; + q = stbi__get8(s); + z->img_comp[i].h = (q >> 4); if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG"); + z->img_comp[i].v = q & 15; if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt 
JPEG"); + z->img_comp[i].tq = stbi__get8(s); if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG"); + } + + if (scan != STBI__SCAN_load) return 1; + + if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode"); + + for (i=0; i < s->img_n; ++i) { + if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h; + if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; + } + + // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios + // and I've never seen a non-corrupted JPEG file actually use them + for (i=0; i < s->img_n; ++i) { + if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG"); + if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG"); + } + + // compute interleaved mcu info + z->img_h_max = h_max; + z->img_v_max = v_max; + z->img_mcu_w = h_max * 8; + z->img_mcu_h = v_max * 8; + // these sizes can't be more than 17 bits + z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w; + z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h; + + for (i=0; i < s->img_n; ++i) { + // number of effective pixels (e.g. for non-interleaved MCU) + z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max; + z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max; + // to simplify generation, we'll allocate enough memory to decode + // the bogus oversized data from using interleaved MCUs and their + // big blocks (e.g. 
a 16x16 iMCU on an image of width 33); we won't + // discard the extra data until colorspace conversion + // + // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier) + // so these muls can't overflow with 32-bit ints (which we require) + z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8; + z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8; + z->img_comp[i].coeff = 0; + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].linebuf = NULL; + z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); + if (z->img_comp[i].raw_data == NULL) + return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); + // align blocks for idct using mmx/sse + z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15); + if (z->progressive) { + // w2, h2 are multiples of 8 (see above) + z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8; + z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8; + z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15); + if (z->img_comp[i].raw_coeff == NULL) + return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); + z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15); + } + } + + return 1; +} + +// use comparisons since in some cases we handle more than one case (e.g. 
SOF) +#define stbi__DNL(x) ((x) == 0xdc) +#define stbi__SOI(x) ((x) == 0xd8) +#define stbi__EOI(x) ((x) == 0xd9) +#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) +#define stbi__SOS(x) ((x) == 0xda) + +#define stbi__SOF_progressive(x) ((x) == 0xc2) + +static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) +{ + int m; + z->jfif = 0; + z->app14_color_transform = -1; // valid values are 0,1,2 + z->marker = STBI__MARKER_none; // initialize cached marker to empty + m = stbi__get_marker(z); + if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG"); + if (scan == STBI__SCAN_type) return 1; + m = stbi__get_marker(z); + while (!stbi__SOF(m)) { + if (!stbi__process_marker(z,m)) return 0; + m = stbi__get_marker(z); + while (m == STBI__MARKER_none) { + // some files have extra padding after their blocks, so ok, we'll scan + if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG"); + m = stbi__get_marker(z); + } + } + z->progressive = stbi__SOF_progressive(m); + if (!stbi__process_frame_header(z, scan)) return 0; + return 1; +} + +static int stbi__skip_jpeg_junk_at_end(stbi__jpeg *j) +{ + // some JPEGs have junk at end, skip over it but if we find what looks + // like a valid marker, resume there + while (!stbi__at_eof(j->s)) { + int x = stbi__get8(j->s); + while (x == 255) { // might be a marker + if (stbi__at_eof(j->s)) return STBI__MARKER_none; + x = stbi__get8(j->s); + if (x != 0x00 && x != 0xff) { + // not a stuffed zero or lead-in to another marker, looks + // like an actual marker, return it + return x; + } + // stuffed zero has x=0 now which ends the loop, meaning we go + // back to regular scan loop. + // repeated 0xff keeps trying to read the next byte of the marker. 
+ } + } + return STBI__MARKER_none; +} + +// decode image to YCbCr format +static int stbi__decode_jpeg_image(stbi__jpeg *j) +{ + int m; + for (m = 0; m < 4; m++) { + j->img_comp[m].raw_data = NULL; + j->img_comp[m].raw_coeff = NULL; + } + j->restart_interval = 0; + if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0; + m = stbi__get_marker(j); + while (!stbi__EOI(m)) { + if (stbi__SOS(m)) { + if (!stbi__process_scan_header(j)) return 0; + if (!stbi__parse_entropy_coded_data(j)) return 0; + if (j->marker == STBI__MARKER_none ) { + j->marker = stbi__skip_jpeg_junk_at_end(j); + // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0 + } + m = stbi__get_marker(j); + if (STBI__RESTART(m)) + m = stbi__get_marker(j); + } else if (stbi__DNL(m)) { + int Ld = stbi__get16be(j->s); + stbi__uint32 NL = stbi__get16be(j->s); + if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG"); + if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG"); + m = stbi__get_marker(j); + } else { + if (!stbi__process_marker(j, m)) return 1; + m = stbi__get_marker(j); + } + } + if (j->progressive) + stbi__jpeg_finish(j); + return 1; +} + +// static jfif-centered resampling (across block boundaries) + +typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1, + int w, int hs); + +#define stbi__div4(x) ((stbi_uc) ((x) >> 2)) + +static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + STBI_NOTUSED(out); + STBI_NOTUSED(in_far); + STBI_NOTUSED(w); + STBI_NOTUSED(hs); + return in_near; +} + +static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate two samples vertically for every one in input + int i; + STBI_NOTUSED(hs); + for (i=0; i < w; ++i) + out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2); + return out; +} + +static stbi_uc* stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, 
stbi_uc *in_far, int w, int hs) +{ + // need to generate two samples horizontally for every one in input + int i; + stbi_uc *input = in_near; + + if (w == 1) { + // if only one sample, can't do any interpolation + out[0] = out[1] = input[0]; + return out; + } + + out[0] = input[0]; + out[1] = stbi__div4(input[0]*3 + input[1] + 2); + for (i=1; i < w-1; ++i) { + int n = 3*input[i]+2; + out[i*2+0] = stbi__div4(n+input[i-1]); + out[i*2+1] = stbi__div4(n+input[i+1]); + } + out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 2); + out[i*2+1] = input[w-1]; + + STBI_NOTUSED(in_far); + STBI_NOTUSED(hs); + + return out; +} + +#define stbi__div16(x) ((stbi_uc) ((x) >> 4)) + +static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate 2x2 samples for every one in input + int i,t0,t1; + if (w == 1) { + out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + out[0] = stbi__div4(t1+2); + for (i=1; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = stbi__div16(3*t0 + t1 + 8); + out[i*2 ] = stbi__div16(3*t1 + t0 + 8); + } + out[w*2-1] = stbi__div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate 2x2 samples for every one in input + int i=0,t0,t1; + + if (w == 1) { + out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + // process groups of 8 pixels for as long as we can. + // note we can't handle the last pixel in a row in this loop + // because we need to handle the filter boundary conditions. 
+ for (; i < ((w-1) & ~7); i += 8) { +#if defined(STBI_SSE2) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + __m128i zero = _mm_setzero_si128(); + __m128i farb = _mm_loadl_epi64((__m128i *) (in_far + i)); + __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i)); + __m128i farw = _mm_unpacklo_epi8(farb, zero); + __m128i nearw = _mm_unpacklo_epi8(nearb, zero); + __m128i diff = _mm_sub_epi16(farw, nearw); + __m128i nears = _mm_slli_epi16(nearw, 2); + __m128i curr = _mm_add_epi16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + __m128i prv0 = _mm_slli_si128(curr, 2); + __m128i nxt0 = _mm_srli_si128(curr, 2); + __m128i prev = _mm_insert_epi16(prv0, t1, 0); + __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. + __m128i bias = _mm_set1_epi16(8); + __m128i curs = _mm_slli_epi16(curr, 2); + __m128i prvd = _mm_sub_epi16(prev, curr); + __m128i nxtd = _mm_sub_epi16(next, curr); + __m128i curb = _mm_add_epi16(curs, bias); + __m128i even = _mm_add_epi16(prvd, curb); + __m128i odd = _mm_add_epi16(nxtd, curb); + + // interleave even and odd pixels, then undo scaling. 
+ __m128i int0 = _mm_unpacklo_epi16(even, odd); + __m128i int1 = _mm_unpackhi_epi16(even, odd); + __m128i de0 = _mm_srli_epi16(int0, 4); + __m128i de1 = _mm_srli_epi16(int1, 4); + + // pack and write output + __m128i outv = _mm_packus_epi16(de0, de1); + _mm_storeu_si128((__m128i *) (out + i*2), outv); +#elif defined(STBI_NEON) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + uint8x8_t farb = vld1_u8(in_far + i); + uint8x8_t nearb = vld1_u8(in_near + i); + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); + int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); + int16x8_t curr = vaddq_s16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + int16x8_t prv0 = vextq_s16(curr, curr, 7); + int16x8_t nxt0 = vextq_s16(curr, curr, 1); + int16x8_t prev = vsetq_lane_s16(t1, prv0, 0); + int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. 
+ int16x8_t curs = vshlq_n_s16(curr, 2); + int16x8_t prvd = vsubq_s16(prev, curr); + int16x8_t nxtd = vsubq_s16(next, curr); + int16x8_t even = vaddq_s16(curs, prvd); + int16x8_t odd = vaddq_s16(curs, nxtd); + + // undo scaling and round, then store with even/odd phases interleaved + uint8x8x2_t o; + o.val[0] = vqrshrun_n_s16(even, 4); + o.val[1] = vqrshrun_n_s16(odd, 4); + vst2_u8(out + i*2, o); +#endif + + // "previous" value for next iter + t1 = 3*in_near[i+7] + in_far[i+7]; + } + + t0 = t1; + t1 = 3*in_near[i] + in_far[i]; + out[i*2] = stbi__div16(3*t1 + t0 + 8); + + for (++i; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = stbi__div16(3*t0 + t1 + 8); + out[i*2 ] = stbi__div16(3*t1 + t0 + 8); + } + out[w*2-1] = stbi__div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} +#endif + +static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // resample with nearest-neighbor + int i,j; + STBI_NOTUSED(in_far); + for (i=0; i < w; ++i) + for (j=0; j < hs; ++j) + out[i*hs+j] = in_near[i]; + return out; +} + +// this is a reduced-precision calculation of YCbCr-to-RGB introduced +// to make sure the code produces the same results in both SIMD and scalar +#define stbi__float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8) +static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step) +{ + int i; + for (i=0; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr* stbi__float2fixed(1.40200f); + g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 
0; else b = 255; } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step) +{ + int i = 0; + +#ifdef STBI_SSE2 + // step == 3 is pretty ugly on the final interleave, and i'm not convinced + // it's useful in practice (you wouldn't use it for textures, for example). + // so just accelerate step == 4 case. + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. + __m128i signflip = _mm_set1_epi8(-0x80); + __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f)); + __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f)); + __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f)); + __m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f)); + __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128); + __m128i xw = _mm_set1_epi16(255); // alpha channel + + for (; i+7 < count; i += 8) { + // load + __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i)); + __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i)); + __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i)); + __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 + __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 + + // unpack to short (and left-shift cr, cb by 8) + __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); + __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); + __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); + + // color transform + __m128i yws = _mm_srli_epi16(yw, 4); + __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); + __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); + __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); + __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1); + __m128i rws = _mm_add_epi16(cr0, yws); + __m128i gwt = 
_mm_add_epi16(cb0, yws); + __m128i bws = _mm_add_epi16(yws, cb1); + __m128i gws = _mm_add_epi16(gwt, cr1); + + // descale + __m128i rw = _mm_srai_epi16(rws, 4); + __m128i bw = _mm_srai_epi16(bws, 4); + __m128i gw = _mm_srai_epi16(gws, 4); + + // back to byte, set up for transpose + __m128i brb = _mm_packus_epi16(rw, bw); + __m128i gxb = _mm_packus_epi16(gw, xw); + + // transpose to interleave channels + __m128i t0 = _mm_unpacklo_epi8(brb, gxb); + __m128i t1 = _mm_unpackhi_epi8(brb, gxb); + __m128i o0 = _mm_unpacklo_epi16(t0, t1); + __m128i o1 = _mm_unpackhi_epi16(t0, t1); + + // store + _mm_storeu_si128((__m128i *) (out + 0), o0); + _mm_storeu_si128((__m128i *) (out + 16), o1); + out += 32; + } + } +#endif + +#ifdef STBI_NEON + // in this version, step=3 support would be easy to add. but is there demand? + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. + uint8x8_t signflip = vdup_n_u8(0x80); + int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f)); + int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f)); + int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f)); + int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f)); + + for (; i+7 < count; i += 8) { + // load + uint8x8_t y_bytes = vld1_u8(y + i); + uint8x8_t cr_bytes = vld1_u8(pcr + i); + uint8x8_t cb_bytes = vld1_u8(pcb + i); + int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip)); + int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip)); + + // expand to s16 + int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4)); + int16x8_t crw = vshll_n_s8(cr_biased, 7); + int16x8_t cbw = vshll_n_s8(cb_biased, 7); + + // color transform + int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0); + int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0); + int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1); + int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1); + int16x8_t rws = vaddq_s16(yws, cr0); + int16x8_t gws = 
vaddq_s16(vaddq_s16(yws, cb0), cr1); + int16x8_t bws = vaddq_s16(yws, cb1); + + // undo scaling, round, convert to byte + uint8x8x4_t o; + o.val[0] = vqrshrun_n_s16(rws, 4); + o.val[1] = vqrshrun_n_s16(gws, 4); + o.val[2] = vqrshrun_n_s16(bws, 4); + o.val[3] = vdup_n_u8(255); + + // store, interleaving r/g/b/a + vst4_u8(out, o); + out += 8*4; + } + } +#endif + + for (; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr* stbi__float2fixed(1.40200f); + g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} +#endif + +// set up the kernels +static void stbi__setup_jpeg(stbi__jpeg *j) +{ + j->idct_block_kernel = stbi__idct_block; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; + +#ifdef STBI_SSE2 + if (stbi__sse2_available()) { + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; + } +#endif + +#ifdef STBI_NEON + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; +#endif +} + +// clean up the temporary component buffers +static void stbi__cleanup_jpeg(stbi__jpeg *j) +{ + stbi__free_jpeg_components(j, j->s->img_n, 0); +} + +typedef struct +{ + resample_row_func resample; + stbi_uc *line0,*line1; + int hs,vs; // expansion factor in each axis + int w_lores; // horizontal pixels pre-expansion + int 
ystep; // how far through vertical expansion we are + int ypos; // which pre-expansion row we're on +} stbi__resample; + +// fast 0..255 * 0..255 => 0..255 rounded multiplication +static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) +{ + unsigned int t = x*y + 128; + return (stbi_uc) ((t + (t >>8)) >> 8); +} + +static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp) +{ + int n, decode_n, is_rgb; + z->s->img_n = 0; // make stbi__cleanup_jpeg safe + + // validate req_comp + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); + + // load a jpeg image from whichever source, but leave in YCbCr format + if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; } + + // determine actual number of components to generate + n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1; + + is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif)); + + if (z->s->img_n == 3 && n < 3 && !is_rgb) + decode_n = 1; + else + decode_n = z->s->img_n; + + // nothing to do if no components requested; check this now to avoid + // accessing uninitialized coutput[0] later + if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; } + + // resample and color-convert + { + int k; + unsigned int i,j; + stbi_uc *output; + stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL }; + + stbi__resample res_comp[4]; + + for (k=0; k < decode_n; ++k) { + stbi__resample *r = &res_comp[k]; + + // allocate line buffer big enough for upsampling off the edges + // with upsample factor of 4 + z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3); + if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } + + r->hs = z->img_h_max / z->img_comp[k].h; + r->vs = z->img_v_max / z->img_comp[k].v; + r->ystep = r->vs >> 1; + r->w_lores = (z->s->img_x + r->hs-1) / r->hs; + r->ypos = 0; + r->line0 = r->line1 = z->img_comp[k].data; + + if (r->hs == 1 && 
r->vs == 1) r->resample = resample_row_1; + else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2; + else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2; + else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel; + else r->resample = stbi__resample_row_generic; + } + + // can't error after this so, this is safe + output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); + if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } + + // now go ahead and resample + for (j=0; j < z->s->img_y; ++j) { + stbi_uc *out = output + n * z->s->img_x * j; + for (k=0; k < decode_n; ++k) { + stbi__resample *r = &res_comp[k]; + int y_bot = r->ystep >= (r->vs >> 1); + coutput[k] = r->resample(z->img_comp[k].linebuf, + y_bot ? r->line1 : r->line0, + y_bot ? r->line0 : r->line1, + r->w_lores, r->hs); + if (++r->ystep >= r->vs) { + r->ystep = 0; + r->line0 = r->line1; + if (++r->ypos < z->img_comp[k].y) + r->line1 += z->img_comp[k].w2; + } + } + if (n >= 3) { + stbi_uc *y = coutput[0]; + if (z->s->img_n == 3) { + if (is_rgb) { + for (i=0; i < z->s->img_x; ++i) { + out[0] = y[i]; + out[1] = coutput[1][i]; + out[2] = coutput[2][i]; + out[3] = 255; + out += n; + } + } else { + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else if (z->s->img_n == 4) { + if (z->app14_color_transform == 0) { // CMYK + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(coutput[0][i], m); + out[1] = stbi__blinn_8x8(coutput[1][i], m); + out[2] = stbi__blinn_8x8(coutput[2][i], m); + out[3] = 255; + out += n; + } + } else if (z->app14_color_transform == 2) { // YCCK + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(255 - out[0], m); + out[1] = stbi__blinn_8x8(255 - out[1], m); + out[2] = stbi__blinn_8x8(255 - out[2], m); + 
out += n; + } + } else { // YCbCr + alpha? Ignore the fourth channel for now + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else + for (i=0; i < z->s->img_x; ++i) { + out[0] = out[1] = out[2] = y[i]; + out[3] = 255; // not used if n==3 + out += n; + } + } else { + if (is_rgb) { + if (n == 1) + for (i=0; i < z->s->img_x; ++i) + *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + else { + for (i=0; i < z->s->img_x; ++i, out += 2) { + out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + out[1] = 255; + } + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 0) { + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + stbi_uc r = stbi__blinn_8x8(coutput[0][i], m); + stbi_uc g = stbi__blinn_8x8(coutput[1][i], m); + stbi_uc b = stbi__blinn_8x8(coutput[2][i], m); + out[0] = stbi__compute_y(r, g, b); + out[1] = 255; + out += n; + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 2) { + for (i=0; i < z->s->img_x; ++i) { + out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]); + out[1] = 255; + out += n; + } + } else { + stbi_uc *y = coutput[0]; + if (n == 1) + for (i=0; i < z->s->img_x; ++i) out[i] = y[i]; + else + for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; } + } + } + } + stbi__cleanup_jpeg(z); + *out_x = z->s->img_x; + *out_y = z->s->img_y; + if (comp) *comp = z->s->img_n >= 3 ? 
3 : 1; // report original components, not output + return output; + } +} + +static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + unsigned char* result; + stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg)); + if (!j) return stbi__errpuc("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + STBI_NOTUSED(ri); + j->s = s; + stbi__setup_jpeg(j); + result = load_jpeg_image(j, x,y,comp,req_comp); + STBI_FREE(j); + return result; +} + +static int stbi__jpeg_test(stbi__context *s) +{ + int r; + stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg)); + if (!j) return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + j->s = s; + stbi__setup_jpeg(j); + r = stbi__decode_jpeg_header(j, STBI__SCAN_type); + stbi__rewind(s); + STBI_FREE(j); + return r; +} + +static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp) +{ + if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) { + stbi__rewind( j->s ); + return 0; + } + if (x) *x = j->s->img_x; + if (y) *y = j->s->img_y; + if (comp) *comp = j->s->img_n >= 3 ? 
3 : 1; + return 1; +} + +static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) +{ + int result; + stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg))); + if (!j) return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + j->s = s; + result = stbi__jpeg_info_raw(j, x, y, comp); + STBI_FREE(j); + return result; +} +#endif + +// public domain zlib decode v0.2 Sean Barrett 2006-11-18 +// simple implementation +// - all input must be provided in an upfront buffer +// - all output is written to a single output buffer (can malloc/realloc) +// performance +// - fast huffman + +#ifndef STBI_NO_ZLIB + +// fast-way is faster to check than jpeg huffman, but slow way is slower +#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables +#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1) +#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet + +// zlib-style huffman encoding +// (jpegs packs from left, zlib from right, so can't share code) +typedef struct +{ + stbi__uint16 fast[1 << STBI__ZFAST_BITS]; + stbi__uint16 firstcode[16]; + int maxcode[17]; + stbi__uint16 firstsymbol[16]; + stbi_uc size[STBI__ZNSYMS]; + stbi__uint16 value[STBI__ZNSYMS]; +} stbi__zhuffman; + +stbi_inline static int stbi__bitreverse16(int n) +{ + n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); + n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); + n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); + n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); + return n; +} + +stbi_inline static int stbi__bit_reverse(int v, int bits) +{ + STBI_ASSERT(bits <= 16); + // to bit reverse n bits, reverse 16 and shift + // e.g. 
11 bits, bit reverse and shift away 5 + return stbi__bitreverse16(v) >> (16-bits); +} + +static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num) +{ + int i,k=0; + int code, next_code[16], sizes[17]; + + // DEFLATE spec for generating codes + memset(sizes, 0, sizeof(sizes)); + memset(z->fast, 0, sizeof(z->fast)); + for (i=0; i < num; ++i) + ++sizes[sizelist[i]]; + sizes[0] = 0; + for (i=1; i < 16; ++i) + if (sizes[i] > (1 << i)) + return stbi__err("bad sizes", "Corrupt PNG"); + code = 0; + for (i=1; i < 16; ++i) { + next_code[i] = code; + z->firstcode[i] = (stbi__uint16) code; + z->firstsymbol[i] = (stbi__uint16) k; + code = (code + sizes[i]); + if (sizes[i]) + if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG"); + z->maxcode[i] = code << (16-i); // preshift for inner loop + code <<= 1; + k += sizes[i]; + } + z->maxcode[16] = 0x10000; // sentinel + for (i=0; i < num; ++i) { + int s = sizelist[i]; + if (s) { + int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; + stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i); + z->size [c] = (stbi_uc ) s; + z->value[c] = (stbi__uint16) i; + if (s <= STBI__ZFAST_BITS) { + int j = stbi__bit_reverse(next_code[s],s); + while (j < (1 << STBI__ZFAST_BITS)) { + z->fast[j] = fastv; + j += (1 << s); + } + } + ++next_code[s]; + } + } + return 1; +} + +// zlib-from-memory implementation for PNG reading +// because PNG allows splitting the zlib stream arbitrarily, +// and it's annoying structurally to have PNG call ZLIB call PNG, +// we require PNG read all the IDATs and combine them into a single +// memory buffer + +typedef struct +{ + stbi_uc *zbuffer, *zbuffer_end; + int num_bits; + stbi__uint32 code_buffer; + + char *zout; + char *zout_start; + char *zout_end; + int z_expandable; + + stbi__zhuffman z_length, z_distance; +} stbi__zbuf; + +stbi_inline static int stbi__zeof(stbi__zbuf *z) +{ + return (z->zbuffer >= z->zbuffer_end); +} + +stbi_inline static stbi_uc 
stbi__zget8(stbi__zbuf *z) +{ + return stbi__zeof(z) ? 0 : *z->zbuffer++; +} + +static void stbi__fill_bits(stbi__zbuf *z) +{ + do { + if (z->code_buffer >= (1U << z->num_bits)) { + z->zbuffer = z->zbuffer_end; /* treat this as EOF so we fail. */ + return; + } + z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits; + z->num_bits += 8; + } while (z->num_bits <= 24); +} + +stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n) +{ + unsigned int k; + if (z->num_bits < n) stbi__fill_bits(z); + k = z->code_buffer & ((1 << n) - 1); + z->code_buffer >>= n; + z->num_bits -= n; + return k; +} + +static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) +{ + int b,s,k; + // not resolved by fast table, so compute it the slow way + // use jpeg approach, which requires MSbits at top + k = stbi__bit_reverse(a->code_buffer, 16); + for (s=STBI__ZFAST_BITS+1; ; ++s) + if (k < z->maxcode[s]) + break; + if (s >= 16) return -1; // invalid code! + // code size is s, so: + b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s]; + if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere! + if (z->size[b] != s) return -1; // was originally an assert, but report failure instead. + a->code_buffer >>= s; + a->num_bits -= s; + return z->value[b]; +} + +stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) +{ + int b,s; + if (a->num_bits < 16) { + if (stbi__zeof(a)) { + return -1; /* report error for unexpected end of data. 
*/ + } + stbi__fill_bits(a); + } + b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; + if (b) { + s = b >> 9; + a->code_buffer >>= s; + a->num_bits -= s; + return b & 511; + } + return stbi__zhuffman_decode_slowpath(a, z); +} + +static int stbi__zexpand(stbi__zbuf *z, char *zout, int n) // need to make room for n bytes +{ + char *q; + unsigned int cur, limit, old_limit; + z->zout = zout; + if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG"); + cur = (unsigned int) (z->zout - z->zout_start); + limit = old_limit = (unsigned) (z->zout_end - z->zout_start); + if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory"); + while (cur + n > limit) { + if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory"); + limit *= 2; + } + q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); + STBI_NOTUSED(old_limit); + if (q == NULL) return stbi__err("outofmem", "Out of memory"); + z->zout_start = q; + z->zout = q + cur; + z->zout_end = q + limit; + return 1; +} + +static const int stbi__zlength_base[31] = { + 3,4,5,6,7,8,9,10,11,13, + 15,17,19,23,27,31,35,43,51,59, + 67,83,99,115,131,163,195,227,258,0,0 }; + +static const int stbi__zlength_extra[31]= +{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 }; + +static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, +257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0}; + +static const int stbi__zdist_extra[32] = +{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13}; + +static int stbi__parse_huffman_block(stbi__zbuf *a) +{ + char *zout = a->zout; + for(;;) { + int z = stbi__zhuffman_decode(a, &a->z_length); + if (z < 256) { + if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes + if (zout >= a->zout_end) { + if (!stbi__zexpand(a, zout, 1)) return 0; + zout = a->zout; + } + *zout++ = (char) z; + } else { + stbi_uc *p; + int len,dist; + if (z == 256) { 
+ a->zout = zout; + return 1; + } + if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data + z -= 257; + len = stbi__zlength_base[z]; + if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]); + z = stbi__zhuffman_decode(a, &a->z_distance); + if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data + dist = stbi__zdist_base[z]; + if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]); + if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG"); + if (zout + len > a->zout_end) { + if (!stbi__zexpand(a, zout, len)) return 0; + zout = a->zout; + } + p = (stbi_uc *) (zout - dist); + if (dist == 1) { // run of one byte; common in images. + stbi_uc v = *p; + if (len) { do *zout++ = v; while (--len); } + } else { + if (len) { do *zout++ = *p++; while (--len); } + } + } + } +} + +static int stbi__compute_huffman_codes(stbi__zbuf *a) +{ + static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; + stbi__zhuffman z_codelength; + stbi_uc lencodes[286+32+137];//padding for maximum single op + stbi_uc codelength_sizes[19]; + int i,n; + + int hlit = stbi__zreceive(a,5) + 257; + int hdist = stbi__zreceive(a,5) + 1; + int hclen = stbi__zreceive(a,4) + 4; + int ntot = hlit + hdist; + + memset(codelength_sizes, 0, sizeof(codelength_sizes)); + for (i=0; i < hclen; ++i) { + int s = stbi__zreceive(a,3); + codelength_sizes[length_dezigzag[i]] = (stbi_uc) s; + } + if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0; + + n = 0; + while (n < ntot) { + int c = stbi__zhuffman_decode(a, &z_codelength); + if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG"); + if (c < 16) + lencodes[n++] = (stbi_uc) c; + else { + stbi_uc fill = 0; + if (c == 16) { + c = stbi__zreceive(a,2)+3; + if (n 
== 0) return stbi__err("bad codelengths", "Corrupt PNG"); + fill = lencodes[n-1]; + } else if (c == 17) { + c = stbi__zreceive(a,3)+3; + } else if (c == 18) { + c = stbi__zreceive(a,7)+11; + } else { + return stbi__err("bad codelengths", "Corrupt PNG"); + } + if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG"); + memset(lencodes+n, fill, c); + n += c; + } + } + if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG"); + if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0; + if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0; + return 1; +} + +static int stbi__parse_uncompressed_block(stbi__zbuf *a) +{ + stbi_uc header[4]; + int len,nlen,k; + if (a->num_bits & 7) + stbi__zreceive(a, a->num_bits & 7); // discard + // drain the bit-packed data into header + k = 0; + while (a->num_bits > 0) { + header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check + a->code_buffer >>= 8; + a->num_bits -= 8; + } + if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG"); + // now fill header the normal way + while (k < 4) + header[k++] = stbi__zget8(a); + len = header[1] * 256 + header[0]; + nlen = header[3] * 256 + header[2]; + if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG"); + if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG"); + if (a->zout + len > a->zout_end) + if (!stbi__zexpand(a, a->zout, len)) return 0; + memcpy(a->zout, a->zbuffer, len); + a->zbuffer += len; + a->zout += len; + return 1; +} + +static int stbi__parse_zlib_header(stbi__zbuf *a) +{ + int cmf = stbi__zget8(a); + int cm = cmf & 15; + /* int cinfo = cmf >> 4; */ + int flg = stbi__zget8(a); + if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec + if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec + if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary 
not allowed in png + if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png + // window = 1 << (8 + cinfo)... but who cares, we fully buffer output + return 1; +} + +static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] = +{ + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8 +}; +static const stbi_uc stbi__zdefault_distance[32] = +{ + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 +}; +/* +Init algorithm: +{ + int i; // use <= to match clearly with spec + for (i=0; i <= 143; ++i) stbi__zdefault_length[i] = 8; + for ( ; i <= 255; ++i) stbi__zdefault_length[i] = 9; + for ( ; i <= 279; ++i) stbi__zdefault_length[i] = 7; + for ( ; i <= 287; ++i) stbi__zdefault_length[i] = 8; + + for (i=0; i <= 31; ++i) stbi__zdefault_distance[i] = 5; +} +*/ + +static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) +{ + int final, type; + if (parse_header) + if (!stbi__parse_zlib_header(a)) return 0; + a->num_bits = 0; + a->code_buffer = 0; + do { + final = stbi__zreceive(a,1); + type = stbi__zreceive(a,2); + if (type == 0) { + if (!stbi__parse_uncompressed_block(a)) return 0; + } else if (type == 3) { + return 0; + } else { + if (type == 1) { + // use fixed code lengths + if (!stbi__zbuild_huffman(&a->z_length , stbi__zdefault_length , STBI__ZNSYMS)) return 0; + if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) return 0; + } else { + if 
(!stbi__compute_huffman_codes(a)) return 0; + } + if (!stbi__parse_huffman_block(a)) return 0; + } + } while (!final); + return 1; +} + +static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header) +{ + a->zout_start = obuf; + a->zout = obuf; + a->zout_end = obuf + olen; + a->z_expandable = exp; + + return stbi__parse_zlib(a, parse_header); +} + +STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, 1)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen) +{ + return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen); +} + +STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen) +{ + stbi__zbuf a; + a.zbuffer = (stbi_uc *) ibuffer; + a.zbuffer_end = (stbi_uc *) ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 1)) + return (int) (a.zout - a.zout_start); + else + return -1; +} + +STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(16384); + if (p == NULL) 
return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer+len; + if (stbi__do_zlib(&a, p, 16384, 1, 0)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen) +{ + stbi__zbuf a; + a.zbuffer = (stbi_uc *) ibuffer; + a.zbuffer_end = (stbi_uc *) ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 0)) + return (int) (a.zout - a.zout_start); + else + return -1; +} +#endif + +// public domain "baseline" PNG decoder v0.10 Sean Barrett 2006-11-18 +// simple implementation +// - only 8-bit samples +// - no CRC checking +// - allocates lots of intermediate memory +// - avoids problem of streaming data between subsystems +// - avoids explicit window management +// performance +// - uses stb_zlib, a PD zlib implementation with fast huffman decoding + +#ifndef STBI_NO_PNG +typedef struct +{ + stbi__uint32 length; + stbi__uint32 type; +} stbi__pngchunk; + +static stbi__pngchunk stbi__get_chunk_header(stbi__context *s) +{ + stbi__pngchunk c; + c.length = stbi__get32be(s); + c.type = stbi__get32be(s); + return c; +} + +static int stbi__check_png_header(stbi__context *s) +{ + static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 }; + int i; + for (i=0; i < 8; ++i) + if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG"); + return 1; +} + +typedef struct +{ + stbi__context *s; + stbi_uc *idata, *expanded, *out; + int depth; +} stbi__png; + + +enum { + STBI__F_none=0, + STBI__F_sub=1, + STBI__F_up=2, + STBI__F_avg=3, + STBI__F_paeth=4, + // synthetic filters used for first scanline to avoid needing a dummy row of 0s + STBI__F_avg_first, + STBI__F_paeth_first +}; + +static stbi_uc first_row_filter[5] = +{ + STBI__F_none, + STBI__F_sub, + STBI__F_none, + STBI__F_avg_first, + STBI__F_paeth_first +}; + +static int stbi__paeth(int a, int b, 
int c) +{ + int p = a + b - c; + int pa = abs(p-a); + int pb = abs(p-b); + int pc = abs(p-c); + if (pa <= pb && pa <= pc) return a; + if (pb <= pc) return b; + return c; +} + +static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; + +// create the png data from post-deflated data +static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color) +{ + int bytes = (depth == 16? 2 : 1); + stbi__context *s = a->s; + stbi__uint32 i,j,stride = x*out_n*bytes; + stbi__uint32 img_len, img_width_bytes; + int k; + int img_n = s->img_n; // copy it into a local for later + + int output_bytes = out_n*bytes; + int filter_bytes = img_n*bytes; + int width = x; + + STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1); + a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into + if (!a->out) return stbi__err("outofmem", "Out of memory"); + + if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG"); + img_width_bytes = (((img_n * x * depth) + 7) >> 3); + img_len = (img_width_bytes + 1) * y; + + // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, + // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), + // so just check for raw_len < img_len always. 
+ if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG"); + + for (j=0; j < y; ++j) { + stbi_uc *cur = a->out + stride*j; + stbi_uc *prior; + int filter = *raw++; + + if (filter > 4) + return stbi__err("invalid filter","Corrupt PNG"); + + if (depth < 8) { + if (img_width_bytes > x) return stbi__err("invalid width","Corrupt PNG"); + cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place + filter_bytes = 1; + width = img_width_bytes; + } + prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above + + // if first row, use special filter that doesn't sample previous row + if (j == 0) filter = first_row_filter[filter]; + + // handle first byte explicitly + for (k=0; k < filter_bytes; ++k) { + switch (filter) { + case STBI__F_none : cur[k] = raw[k]; break; + case STBI__F_sub : cur[k] = raw[k]; break; + case STBI__F_up : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break; + case STBI__F_avg : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break; + case STBI__F_paeth : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break; + case STBI__F_avg_first : cur[k] = raw[k]; break; + case STBI__F_paeth_first: cur[k] = raw[k]; break; + } + } + + if (depth == 8) { + if (img_n != out_n) + cur[img_n] = 255; // first pixel + raw += img_n; + cur += out_n; + prior += out_n; + } else if (depth == 16) { + if (img_n != out_n) { + cur[filter_bytes] = 255; // first pixel top byte + cur[filter_bytes+1] = 255; // first pixel bottom byte + } + raw += filter_bytes; + cur += output_bytes; + prior += output_bytes; + } else { + raw += 1; + cur += 1; + prior += 1; + } + + // this is a little gross, so that we don't switch per-pixel or per-component + if (depth < 8 || img_n == out_n) { + int nk = (width - 1)*filter_bytes; + #define STBI__CASE(f) \ + case f: \ + for (k=0; k < nk; ++k) + switch (filter) { + // "none" filter turns into a memcpy here; make that explicit. 
+ case STBI__F_none: memcpy(cur, raw, nk); break; + STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break; + STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break; + STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break; + STBI__CASE(STBI__F_paeth) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break; + STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break; + STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break; + } + #undef STBI__CASE + raw += nk; + } else { + STBI_ASSERT(img_n+1 == out_n); + #define STBI__CASE(f) \ + case f: \ + for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \ + for (k=0; k < filter_bytes; ++k) + switch (filter) { + STBI__CASE(STBI__F_none) { cur[k] = raw[k]; } break; + STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break; + STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break; + STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break; + STBI__CASE(STBI__F_paeth) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break; + STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break; + STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break; + } + #undef STBI__CASE + + // the loop above sets the high byte of the pixels' alpha, but for + // 16 bit png files we also need the low byte set. we'll do that here. 
+ if (depth == 16) { + cur = a->out + stride*j; // start at the beginning of the row again + for (i=0; i < x; ++i,cur+=output_bytes) { + cur[filter_bytes+1] = 255; + } + } + } + } + + // we make a separate pass to expand bits to pixels; for performance, + // this could run two scanlines behind the above code, so it won't + // intefere with filtering but will still be in the cache. + if (depth < 8) { + for (j=0; j < y; ++j) { + stbi_uc *cur = a->out + stride*j; + stbi_uc *in = a->out + stride*j + x*out_n - img_width_bytes; + // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit + // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop + stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range + + // note that the final byte might overshoot and write more data than desired. + // we can allocate enough data that this never writes out of memory, but it + // could also overwrite the next scanline. can it overwrite non-empty data + // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel. 
+ // so we need to explicitly clamp the final ones + + if (depth == 4) { + for (k=x*img_n; k >= 2; k-=2, ++in) { + *cur++ = scale * ((*in >> 4) ); + *cur++ = scale * ((*in ) & 0x0f); + } + if (k > 0) *cur++ = scale * ((*in >> 4) ); + } else if (depth == 2) { + for (k=x*img_n; k >= 4; k-=4, ++in) { + *cur++ = scale * ((*in >> 6) ); + *cur++ = scale * ((*in >> 4) & 0x03); + *cur++ = scale * ((*in >> 2) & 0x03); + *cur++ = scale * ((*in ) & 0x03); + } + if (k > 0) *cur++ = scale * ((*in >> 6) ); + if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03); + if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03); + } else if (depth == 1) { + for (k=x*img_n; k >= 8; k-=8, ++in) { + *cur++ = scale * ((*in >> 7) ); + *cur++ = scale * ((*in >> 6) & 0x01); + *cur++ = scale * ((*in >> 5) & 0x01); + *cur++ = scale * ((*in >> 4) & 0x01); + *cur++ = scale * ((*in >> 3) & 0x01); + *cur++ = scale * ((*in >> 2) & 0x01); + *cur++ = scale * ((*in >> 1) & 0x01); + *cur++ = scale * ((*in ) & 0x01); + } + if (k > 0) *cur++ = scale * ((*in >> 7) ); + if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01); + if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01); + if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01); + if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01); + if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01); + if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01); + } + if (img_n != out_n) { + int q; + // insert alpha = 255 + cur = a->out + stride*j; + if (img_n == 1) { + for (q=x-1; q >= 0; --q) { + cur[q*2+1] = 255; + cur[q*2+0] = cur[q]; + } + } else { + STBI_ASSERT(img_n == 3); + for (q=x-1; q >= 0; --q) { + cur[q*4+3] = 255; + cur[q*4+2] = cur[q*3+2]; + cur[q*4+1] = cur[q*3+1]; + cur[q*4+0] = cur[q*3+0]; + } + } + } + } + } else if (depth == 16) { + // force the image data from big-endian to platform-native. + // this is done in a separate pass due to the decoding relying + // on the data being untouched, but could probably be done + // per-line during decode if care is taken. 
+ stbi_uc *cur = a->out; + stbi__uint16 *cur16 = (stbi__uint16*)cur; + + for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) { + *cur16 = (cur[0] << 8) | cur[1]; + } + } + + return 1; +} + +static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced) +{ + int bytes = (depth == 16 ? 2 : 1); + int out_bytes = out_n * bytes; + stbi_uc *final; + int p; + if (!interlaced) + return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); + + // de-interlacing + final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + if (!final) return stbi__err("outofmem", "Out of memory"); + for (p=0; p < 7; ++p) { + int xorig[] = { 0,4,0,2,0,1,0 }; + int yorig[] = { 0,0,4,0,2,0,1 }; + int xspc[] = { 8,8,4,4,2,2,1 }; + int yspc[] = { 8,8,8,4,4,2,2 }; + int i,j,x,y; + // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 + x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p]; + y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p]; + if (x && y) { + stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; + if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) { + STBI_FREE(final); + return 0; + } + for (j=0; j < y; ++j) { + for (i=0; i < x; ++i) { + int out_y = j*yspc[p]+yorig[p]; + int out_x = i*xspc[p]+xorig[p]; + memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes, + a->out + (j*x+i)*out_bytes, out_bytes); + } + } + STBI_FREE(a->out); + image_data += img_len; + image_data_len -= img_len; + } + } + a->out = final; + + return 1; +} + +static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc *p = z->out; + + // compute color-based transparency, assuming we've + // already got 255 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) 
{ + for (i=0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 0 : 255); + p += 2; + } + } else { + for (i=0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi__uint16 *p = (stbi__uint16*) z->out; + + // compute color-based transparency, assuming we've + // already got 65535 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 0 : 65535); + p += 2; + } + } else { + for (i = 0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n) +{ + stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y; + stbi_uc *p, *temp_out, *orig = a->out; + + p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0); + if (p == NULL) return stbi__err("outofmem", "Out of memory"); + + // between here and free(out) below, exitting would leak + temp_out = p; + + if (pal_img_n == 3) { + for (i=0; i < pixel_count; ++i) { + int n = orig[i]*4; + p[0] = palette[n ]; + p[1] = palette[n+1]; + p[2] = palette[n+2]; + p += 3; + } + } else { + for (i=0; i < pixel_count; ++i) { + int n = orig[i]*4; + p[0] = palette[n ]; + p[1] = palette[n+1]; + p[2] = palette[n+2]; + p[3] = palette[n+3]; + p += 4; + } + } + STBI_FREE(a->out); + a->out = temp_out; + + STBI_NOTUSED(len); + + return 1; +} + +static int stbi__unpremultiply_on_load_global = 0; +static int stbi__de_iphone_flag_global = 0; + +STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) +{ + stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply; +} + +STBIDEF void 
stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) +{ + stbi__de_iphone_flag_global = flag_true_if_should_convert; +} + +#ifndef STBI_THREAD_LOCAL +#define stbi__unpremultiply_on_load stbi__unpremultiply_on_load_global +#define stbi__de_iphone_flag stbi__de_iphone_flag_global +#else +static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set; +static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set; + +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply) +{ + stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply; + stbi__unpremultiply_on_load_set = 1; +} + +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert) +{ + stbi__de_iphone_flag_local = flag_true_if_should_convert; + stbi__de_iphone_flag_set = 1; +} + +#define stbi__unpremultiply_on_load (stbi__unpremultiply_on_load_set \ + ? stbi__unpremultiply_on_load_local \ + : stbi__unpremultiply_on_load_global) +#define stbi__de_iphone_flag (stbi__de_iphone_flag_set \ + ? 
stbi__de_iphone_flag_local \ + : stbi__de_iphone_flag_global) +#endif // STBI_THREAD_LOCAL + +static void stbi__de_iphone(stbi__png *z) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc *p = z->out; + + if (s->img_out_n == 3) { // convert bgr to rgb + for (i=0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 3; + } + } else { + STBI_ASSERT(s->img_out_n == 4); + if (stbi__unpremultiply_on_load) { + // convert bgr to rgb and unpremultiply + for (i=0; i < pixel_count; ++i) { + stbi_uc a = p[3]; + stbi_uc t = p[0]; + if (a) { + stbi_uc half = a / 2; + p[0] = (p[2] * 255 + half) / a; + p[1] = (p[1] * 255 + half) / a; + p[2] = ( t * 255 + half) / a; + } else { + p[0] = p[2]; + p[2] = t; + } + p += 4; + } + } else { + // convert bgr to rgb + for (i=0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 4; + } + } + } +} + +#define STBI__PNG_TYPE(a,b,c,d) (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d)) + +static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) +{ + stbi_uc palette[1024], pal_img_n=0; + stbi_uc has_trans=0, tc[3]={0}; + stbi__uint16 tc16[3]; + stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0; + int first=1,k,interlace=0, color=0, is_iphone=0; + stbi__context *s = z->s; + + z->expanded = NULL; + z->idata = NULL; + z->out = NULL; + + if (!stbi__check_png_header(s)) return 0; + + if (scan == STBI__SCAN_type) return 1; + + for (;;) { + stbi__pngchunk c = stbi__get_chunk_header(s); + switch (c.type) { + case STBI__PNG_TYPE('C','g','B','I'): + is_iphone = 1; + stbi__skip(s, c.length); + break; + case STBI__PNG_TYPE('I','H','D','R'): { + int comp,filter; + if (!first) return stbi__err("multiple IHDR","Corrupt PNG"); + first = 0; + if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG"); + s->img_x = stbi__get32be(s); + s->img_y = stbi__get32be(s); + if (s->img_y > STBI_MAX_DIMENSIONS) return 
stbi__err("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + z->depth = stbi__get8(s); if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only"); + color = stbi__get8(s); if (color > 6) return stbi__err("bad ctype","Corrupt PNG"); + if (color == 3 && z->depth == 16) return stbi__err("bad ctype","Corrupt PNG"); + if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG"); + comp = stbi__get8(s); if (comp) return stbi__err("bad comp method","Corrupt PNG"); + filter= stbi__get8(s); if (filter) return stbi__err("bad filter method","Corrupt PNG"); + interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG"); + if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG"); + if (!pal_img_n) { + s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); + if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode"); + } else { + // if paletted, then pal_n is our final components, and + // img_n is # components to decompress/filter. 
+ s->img_n = 1; + if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG"); + } + // even with SCAN_header, have to scan to see if we have a tRNS + break; + } + + case STBI__PNG_TYPE('P','L','T','E'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG"); + pal_len = c.length / 3; + if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG"); + for (i=0; i < pal_len; ++i) { + palette[i*4+0] = stbi__get8(s); + palette[i*4+1] = stbi__get8(s); + palette[i*4+2] = stbi__get8(s); + palette[i*4+3] = 255; + } + break; + } + + case STBI__PNG_TYPE('t','R','N','S'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG"); + if (pal_img_n) { + if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; } + if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG"); + if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG"); + pal_img_n = 4; + for (i=0; i < c.length; ++i) + palette[i*4+3] = stbi__get8(s); + } else { + if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG"); + if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG"); + has_trans = 1; + // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now. 
+ if (scan == STBI__SCAN_header) { ++s->img_n; return 1; } + if (z->depth == 16) { + for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is + } else { + for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger + } + } + break; + } + + case STBI__PNG_TYPE('I','D','A','T'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG"); + if (scan == STBI__SCAN_header) { + // header scan definitely stops at first IDAT + if (pal_img_n) + s->img_n = pal_img_n; + return 1; + } + if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes"); + if ((int)(ioff + c.length) < (int)ioff) return 0; + if (ioff + c.length > idata_limit) { + stbi__uint32 idata_limit_old = idata_limit; + stbi_uc *p; + if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096; + while (ioff + c.length > idata_limit) + idata_limit *= 2; + STBI_NOTUSED(idata_limit_old); + p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory"); + z->idata = p; + } + if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG"); + ioff += c.length; + break; + } + + case STBI__PNG_TYPE('I','E','N','D'): { + stbi__uint32 raw_len, bpl; + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (scan != STBI__SCAN_load) return 1; + if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG"); + // initial guess for decoded data size to avoid unnecessary reallocs + bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component + raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; + z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, 
!is_iphone); + if (z->expanded == NULL) return 0; // zlib should set error + STBI_FREE(z->idata); z->idata = NULL; + if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans) + s->img_out_n = s->img_n+1; + else + s->img_out_n = s->img_n; + if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0; + if (has_trans) { + if (z->depth == 16) { + if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0; + } else { + if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0; + } + } + if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2) + stbi__de_iphone(z); + if (pal_img_n) { + // pal_img_n == 3 or 4 + s->img_n = pal_img_n; // record the actual colors we had + s->img_out_n = pal_img_n; + if (req_comp >= 3) s->img_out_n = req_comp; + if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) + return 0; + } else if (has_trans) { + // non-paletted image with tRNS -> source image has (constant) alpha + ++s->img_n; + } + STBI_FREE(z->expanded); z->expanded = NULL; + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + return 1; + } + + default: + // if critical, fail + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if ((c.type & (1 << 29)) == 0) { + #ifndef STBI_NO_FAILURE_STRINGS + // not threadsafe + static char invalid_chunk[] = "XXXX PNG chunk not known"; + invalid_chunk[0] = STBI__BYTECAST(c.type >> 24); + invalid_chunk[1] = STBI__BYTECAST(c.type >> 16); + invalid_chunk[2] = STBI__BYTECAST(c.type >> 8); + invalid_chunk[3] = STBI__BYTECAST(c.type >> 0); + #endif + return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); + } + stbi__skip(s, c.length); + break; + } + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + } +} + +static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri) +{ + void *result=NULL; + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal 
error"); + if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) { + if (p->depth <= 8) + ri->bits_per_channel = 8; + else if (p->depth == 16) + ri->bits_per_channel = 16; + else + return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth"); + result = p->out; + p->out = NULL; + if (req_comp && req_comp != p->s->img_out_n) { + if (ri->bits_per_channel == 8) + result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + else + result = stbi__convert_format16((stbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + p->s->img_out_n = req_comp; + if (result == NULL) return result; + } + *x = p->s->img_x; + *y = p->s->img_y; + if (n) *n = p->s->img_n; + } + STBI_FREE(p->out); p->out = NULL; + STBI_FREE(p->expanded); p->expanded = NULL; + STBI_FREE(p->idata); p->idata = NULL; + + return result; +} + +static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi__png p; + p.s = s; + return stbi__do_png(&p, x,y,comp,req_comp, ri); +} + +static int stbi__png_test(stbi__context *s) +{ + int r; + r = stbi__check_png_header(s); + stbi__rewind(s); + return r; +} + +static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp) +{ + if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) { + stbi__rewind( p->s ); + return 0; + } + if (x) *x = p->s->img_x; + if (y) *y = p->s->img_y; + if (comp) *comp = p->s->img_n; + return 1; +} + +static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp) +{ + stbi__png p; + p.s = s; + return stbi__png_info_raw(&p, x, y, comp); +} + +static int stbi__png_is16(stbi__context *s) +{ + stbi__png p; + p.s = s; + if (!stbi__png_info_raw(&p, NULL, NULL, NULL)) + return 0; + if (p.depth != 16) { + stbi__rewind(p.s); + return 0; + } + return 1; +} +#endif + +// Microsoft/Windows BMP image + +#ifndef STBI_NO_BMP +static int stbi__bmp_test_raw(stbi__context *s) +{ + int r; + 
   int sz;
+   if (stbi__get8(s) != 'B') return 0;
+   if (stbi__get8(s) != 'M') return 0;
+   stbi__get32le(s); // discard filesize
+   stbi__get16le(s); // discard reserved
+   stbi__get16le(s); // discard reserved
+   stbi__get32le(s); // discard data offset
+   sz = stbi__get32le(s);
+   r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
+   return r;
+}
+
+static int stbi__bmp_test(stbi__context *s)
+{
+   int r = stbi__bmp_test_raw(s);
+   stbi__rewind(s);
+   return r;
+}
+
+
+// returns 0..31 for the highest set bit
+static int stbi__high_bit(unsigned int z)
+{
+   int n=0;
+   if (z == 0) return -1;
+   if (z >= 0x10000) { n += 16; z >>= 16; }
+   if (z >= 0x00100) { n += 8; z >>= 8; }
+   if (z >= 0x00010) { n += 4; z >>= 4; }
+   if (z >= 0x00004) { n += 2; z >>= 2; }
+   if (z >= 0x00002) { n += 1;/* >>= 1;*/ }
+   return n;
+}
+
+static int stbi__bitcount(unsigned int a)
+{
+   a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2
+   a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4
+   a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits
+   a = (a + (a >> 8)); // max 16 per 8 bits
+   a = (a + (a >> 16)); // max 32 per 8 bits
+   return a & 0xff;
+}
+
+// extract an arbitrarily-aligned N-bit value (N=bits)
+// from v, and then make it 8-bits long and fractionally
+// extend it to full range.
+static int stbi__shiftsigned(unsigned int v, int shift, int bits) +{ + static unsigned int mul_table[9] = { + 0, + 0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/, + 0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/, + }; + static unsigned int shift_table[9] = { + 0, 0,0,1,0,2,4,6,0, + }; + if (shift < 0) + v <<= -shift; + else + v >>= shift; + STBI_ASSERT(v < 256); + v >>= (8-bits); + STBI_ASSERT(bits >= 0 && bits <= 8); + return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits]; +} + +typedef struct +{ + int bpp, offset, hsz; + unsigned int mr,mg,mb,ma, all_a; + int extra_read; +} stbi__bmp_data; + +static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress) +{ + // BI_BITFIELDS specifies masks explicitly, don't override + if (compress == 3) + return 1; + + if (compress == 0) { + if (info->bpp == 16) { + info->mr = 31u << 10; + info->mg = 31u << 5; + info->mb = 31u << 0; + } else if (info->bpp == 32) { + info->mr = 0xffu << 16; + info->mg = 0xffu << 8; + info->mb = 0xffu << 0; + info->ma = 0xffu << 24; + info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 + } else { + // otherwise, use defaults, which is all-0 + info->mr = info->mg = info->mb = info->ma = 0; + } + return 1; + } + return 0; // error +} + +static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) +{ + int hsz; + if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP"); + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + info->offset = stbi__get32le(s); + info->hsz = hsz = stbi__get32le(s); + info->mr = info->mg = info->mb = info->ma = 0; + info->extra_read = 14; + + if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP"); + + if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not 
supported: unknown"); + if (hsz == 12) { + s->img_x = stbi__get16le(s); + s->img_y = stbi__get16le(s); + } else { + s->img_x = stbi__get32le(s); + s->img_y = stbi__get32le(s); + } + if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP"); + info->bpp = stbi__get16le(s); + if (hsz != 12) { + int compress = stbi__get32le(s); + if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE"); + if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes + if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel + stbi__get32le(s); // discard sizeof + stbi__get32le(s); // discard hres + stbi__get32le(s); // discard vres + stbi__get32le(s); // discard colorsused + stbi__get32le(s); // discard max important + if (hsz == 40 || hsz == 56) { + if (hsz == 56) { + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + } + if (info->bpp == 16 || info->bpp == 32) { + if (compress == 0) { + stbi__bmp_set_mask_defaults(info, compress); + } else if (compress == 3) { + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->extra_read += 12; + // not documented, but generated by photoshop and handled by mspaint + if (info->mr == info->mg && info->mg == info->mb) { + // ?!?!? 
+ return stbi__errpuc("bad BMP", "bad BMP"); + } + } else + return stbi__errpuc("bad BMP", "bad BMP"); + } + } else { + // V4/V5 header + int i; + if (hsz != 108 && hsz != 124) + return stbi__errpuc("bad BMP", "bad BMP"); + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->ma = stbi__get32le(s); + if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs + stbi__bmp_set_mask_defaults(info, compress); + stbi__get32le(s); // discard color space + for (i=0; i < 12; ++i) + stbi__get32le(s); // discard color space parameters + if (hsz == 124) { + stbi__get32le(s); // discard rendering intent + stbi__get32le(s); // discard offset of profile data + stbi__get32le(s); // discard size of profile data + stbi__get32le(s); // discard reserved + } + } + } + return (void *) 1; +} + + +static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *out; + unsigned int mr=0,mg=0,mb=0,ma=0, all_a; + stbi_uc pal[256][4]; + int psize=0,i,j,width; + int flip_vertically, pad, target; + stbi__bmp_data info; + STBI_NOTUSED(ri); + + info.all_a = 255; + if (stbi__bmp_parse_header(s, &info) == NULL) + return NULL; // error code already set + + flip_vertically = ((int) s->img_y) > 0; + s->img_y = abs((int) s->img_y); + + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + mr = info.mr; + mg = info.mg; + mb = info.mb; + ma = info.ma; + all_a = info.all_a; + + if (info.hsz == 12) { + if (info.bpp < 24) + psize = (info.offset - info.extra_read - 24) / 3; + } else { + if (info.bpp < 16) + psize = (info.offset - info.extra_read - info.hsz) >> 2; + } + if (psize == 0) { + // accept some number of extra bytes after the header, but if the offset points either to before + // the header ends or implies a large amount of 
extra data, reject the file as malformed + int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original); + int header_limit = 1024; // max we actually read is below 256 bytes currently. + int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size. + if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) { + return stbi__errpuc("bad header", "Corrupt BMP"); + } + // we established that bytes_read_so_far is positive and sensible. + // the first half of this test rejects offsets that are either too small positives, or + // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn + // ensures the number computed in the second half of the test can't overflow. + if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) { + return stbi__errpuc("bad offset", "Corrupt BMP"); + } else { + stbi__skip(s, info.offset - bytes_read_so_far); + } + } + + if (info.bpp == 24 && ma == 0xff000000) + s->img_n = 3; + else + s->img_n = ma ? 4 : 3; + if (req_comp && req_comp >= 3) // we can directly decode 3 or 4 + target = req_comp; + else + target = s->img_n; // if they want monochrome, we'll post-convert + + // sanity-check size + if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) + return stbi__errpuc("too large", "Corrupt BMP"); + + out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + if (info.bpp < 16) { + int z=0; + if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); } + for (i=0; i < psize; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + if (info.hsz != 12) stbi__get8(s); + pal[i][3] = 255; + } + stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 
3 : 4)); + if (info.bpp == 1) width = (s->img_x + 7) >> 3; + else if (info.bpp == 4) width = (s->img_x + 1) >> 1; + else if (info.bpp == 8) width = s->img_x; + else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); } + pad = (-width)&3; + if (info.bpp == 1) { + for (j=0; j < (int) s->img_y; ++j) { + int bit_offset = 7, v = stbi__get8(s); + for (i=0; i < (int) s->img_x; ++i) { + int color = (v>>bit_offset)&0x1; + out[z++] = pal[color][0]; + out[z++] = pal[color][1]; + out[z++] = pal[color][2]; + if (target == 4) out[z++] = 255; + if (i+1 == (int) s->img_x) break; + if((--bit_offset) < 0) { + bit_offset = 7; + v = stbi__get8(s); + } + } + stbi__skip(s, pad); + } + } else { + for (j=0; j < (int) s->img_y; ++j) { + for (i=0; i < (int) s->img_x; i += 2) { + int v=stbi__get8(s),v2=0; + if (info.bpp == 4) { + v2 = v & 15; + v >>= 4; + } + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) out[z++] = 255; + if (i+1 == (int) s->img_x) break; + v = (info.bpp == 8) ? 
stbi__get8(s) : v2; + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) out[z++] = 255; + } + stbi__skip(s, pad); + } + } + } else { + int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0; + int z = 0; + int easy=0; + stbi__skip(s, info.offset - info.extra_read - info.hsz); + if (info.bpp == 24) width = 3 * s->img_x; + else if (info.bpp == 16) width = 2*s->img_x; + else /* bpp = 32 and pad = 0 */ width=0; + pad = (-width) & 3; + if (info.bpp == 24) { + easy = 1; + } else if (info.bpp == 32) { + if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000) + easy = 2; + } + if (!easy) { + if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } + // right shift amt to put high bit in position #7 + rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr); + gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg); + bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb); + ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma); + if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } + } + for (j=0; j < (int) s->img_y; ++j) { + if (easy) { + for (i=0; i < (int) s->img_x; ++i) { + unsigned char a; + out[z+2] = stbi__get8(s); + out[z+1] = stbi__get8(s); + out[z+0] = stbi__get8(s); + z += 3; + a = (easy == 2 ? stbi__get8(s) : 255); + all_a |= a; + if (target == 4) out[z++] = a; + } + } else { + int bpp = info.bpp; + for (i=0; i < (int) s->img_x; ++i) { + stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s)); + unsigned int a; + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount)); + a = (ma ? 
stbi__shiftsigned(v & ma, ashift, acount) : 255); + all_a |= a; + if (target == 4) out[z++] = STBI__BYTECAST(a); + } + } + stbi__skip(s, pad); + } + } + + // if alpha channel is all 0s, replace with all 255s + if (target == 4 && all_a == 0) + for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4) + out[i] = 255; + + if (flip_vertically) { + stbi_uc t; + for (j=0; j < (int) s->img_y>>1; ++j) { + stbi_uc *p1 = out + j *s->img_x*target; + stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target; + for (i=0; i < (int) s->img_x*target; ++i) { + t = p1[i]; p1[i] = p2[i]; p2[i] = t; + } + } + } + + if (req_comp && req_comp != target) { + out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y); + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + + *x = s->img_x; + *y = s->img_y; + if (comp) *comp = s->img_n; + return out; +} +#endif + +// Targa Truevision - TGA +// by Jonathan Dummer +#ifndef STBI_NO_TGA +// returns STBI_rgb or whatever, 0 on error +static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16) +{ + // only RGB or RGBA (incl. 
16bit) or grey allowed + if (is_rgb16) *is_rgb16 = 0; + switch(bits_per_pixel) { + case 8: return STBI_grey; + case 16: if(is_grey) return STBI_grey_alpha; + // fallthrough + case 15: if(is_rgb16) *is_rgb16 = 1; + return STBI_rgb; + case 24: // fallthrough + case 32: return bits_per_pixel/8; + default: return 0; + } +} + +static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp) +{ + int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp; + int sz, tga_colormap_type; + stbi__get8(s); // discard Offset + tga_colormap_type = stbi__get8(s); // colormap type + if( tga_colormap_type > 1 ) { + stbi__rewind(s); + return 0; // only RGB or indexed allowed + } + tga_image_type = stbi__get8(s); // image type + if ( tga_colormap_type == 1 ) { // colormapped (paletted) image + if (tga_image_type != 1 && tga_image_type != 9) { + stbi__rewind(s); + return 0; + } + stbi__skip(s,4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) { + stbi__rewind(s); + return 0; + } + stbi__skip(s,4); // skip image x and y origin + tga_colormap_bpp = sz; + } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE + if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) { + stbi__rewind(s); + return 0; // only RGB or grey allowed, +/- RLE + } + stbi__skip(s,9); // skip colormap specification and image x/y origin + tga_colormap_bpp = 0; + } + tga_w = stbi__get16le(s); + if( tga_w < 1 ) { + stbi__rewind(s); + return 0; // test width + } + tga_h = stbi__get16le(s); + if( tga_h < 1 ) { + stbi__rewind(s); + return 0; // test height + } + tga_bits_per_pixel = stbi__get8(s); // bits per pixel + stbi__get8(s); // ignore alpha bits + if (tga_colormap_bpp != 0) { + if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) { + // when using a colormap, 
tga_bits_per_pixel is the size of the indexes + // I don't think anything but 8 or 16bit indexes makes sense + stbi__rewind(s); + return 0; + } + tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL); + } else { + tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL); + } + if(!tga_comp) { + stbi__rewind(s); + return 0; + } + if (x) *x = tga_w; + if (y) *y = tga_h; + if (comp) *comp = tga_comp; + return 1; // seems to have passed everything +} + +static int stbi__tga_test(stbi__context *s) +{ + int res = 0; + int sz, tga_color_type; + stbi__get8(s); // discard Offset + tga_color_type = stbi__get8(s); // color type + if ( tga_color_type > 1 ) goto errorEnd; // only RGB or indexed allowed + sz = stbi__get8(s); // image type + if ( tga_color_type == 1 ) { // colormapped (paletted) image + if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9 + stbi__skip(s,4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd; + stbi__skip(s,4); // skip image x and y origin + } else { // "normal" image w/o colormap + if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE + stbi__skip(s,9); // skip colormap specification and image x/y origin + } + if ( stbi__get16le(s) < 1 ) goto errorEnd; // test width + if ( stbi__get16le(s) < 1 ) goto errorEnd; // test height + sz = stbi__get8(s); // bits per pixel + if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd; + + res = 1; // if we got this far, everything's good and we can return 1 instead of 0 + +errorEnd: + stbi__rewind(s); + return res; +} + +// read 16bit value and convert to 24bit RGB 
+static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out) +{ + stbi__uint16 px = (stbi__uint16)stbi__get16le(s); + stbi__uint16 fiveBitMask = 31; + // we have 3 channels with 5bits each + int r = (px >> 10) & fiveBitMask; + int g = (px >> 5) & fiveBitMask; + int b = px & fiveBitMask; + // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later + out[0] = (stbi_uc)((r * 255)/31); + out[1] = (stbi_uc)((g * 255)/31); + out[2] = (stbi_uc)((b * 255)/31); + + // some people claim that the most significant bit might be used for alpha + // (possibly if an alpha-bit is set in the "image descriptor byte") + // but that only made 16bit test images completely translucent.. + // so let's treat all 15 and 16bit TGAs as RGB with no alpha. +} + +static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + // read in the TGA header stuff + int tga_offset = stbi__get8(s); + int tga_indexed = stbi__get8(s); + int tga_image_type = stbi__get8(s); + int tga_is_RLE = 0; + int tga_palette_start = stbi__get16le(s); + int tga_palette_len = stbi__get16le(s); + int tga_palette_bits = stbi__get8(s); + int tga_x_origin = stbi__get16le(s); + int tga_y_origin = stbi__get16le(s); + int tga_width = stbi__get16le(s); + int tga_height = stbi__get16le(s); + int tga_bits_per_pixel = stbi__get8(s); + int tga_comp, tga_rgb16=0; + int tga_inverted = stbi__get8(s); + // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?) 
+ // image data + unsigned char *tga_data; + unsigned char *tga_palette = NULL; + int i, j; + unsigned char raw_data[4] = {0}; + int RLE_count = 0; + int RLE_repeating = 0; + int read_next_pixel = 1; + STBI_NOTUSED(ri); + STBI_NOTUSED(tga_x_origin); // @TODO + STBI_NOTUSED(tga_y_origin); // @TODO + + if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + // do a tiny bit of precessing + if ( tga_image_type >= 8 ) + { + tga_image_type -= 8; + tga_is_RLE = 1; + } + tga_inverted = 1 - ((tga_inverted >> 5) & 1); + + // If I'm paletted, then I'll use the number of bits from the palette + if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16); + else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16); + + if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency + return stbi__errpuc("bad format", "Can't find out TGA pixelformat"); + + // tga info + *x = tga_width; + *y = tga_height; + if (comp) *comp = tga_comp; + + if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0)) + return stbi__errpuc("too large", "Corrupt TGA"); + + tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0); + if (!tga_data) return stbi__errpuc("outofmem", "Out of memory"); + + // skip to the data's starting position (offset usually = 0) + stbi__skip(s, tga_offset ); + + if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) { + for (i=0; i < tga_height; ++i) { + int row = tga_inverted ? tga_height -i - 1 : i; + stbi_uc *tga_row = tga_data + row*tga_width*tga_comp; + stbi__getn(s, tga_row, tga_width * tga_comp); + } + } else { + // do I need to load a palette? + if ( tga_indexed) + { + if (tga_palette_len == 0) { /* you have to have at least one entry! 
*/ + STBI_FREE(tga_data); + return stbi__errpuc("bad palette", "Corrupt TGA"); + } + + // any data to skip? (offset usually = 0) + stbi__skip(s, tga_palette_start ); + // load the palette + tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0); + if (!tga_palette) { + STBI_FREE(tga_data); + return stbi__errpuc("outofmem", "Out of memory"); + } + if (tga_rgb16) { + stbi_uc *pal_entry = tga_palette; + STBI_ASSERT(tga_comp == STBI_rgb); + for (i=0; i < tga_palette_len; ++i) { + stbi__tga_read_rgb16(s, pal_entry); + pal_entry += tga_comp; + } + } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) { + STBI_FREE(tga_data); + STBI_FREE(tga_palette); + return stbi__errpuc("bad palette", "Corrupt TGA"); + } + } + // load the data + for (i=0; i < tga_width * tga_height; ++i) + { + // if I'm in RLE mode, do I need to get a RLE stbi__pngchunk? + if ( tga_is_RLE ) + { + if ( RLE_count == 0 ) + { + // yep, get the next byte as a RLE command + int RLE_cmd = stbi__get8(s); + RLE_count = 1 + (RLE_cmd & 127); + RLE_repeating = RLE_cmd >> 7; + read_next_pixel = 1; + } else if ( !RLE_repeating ) + { + read_next_pixel = 1; + } + } else + { + read_next_pixel = 1; + } + // OK, if I need to read a pixel, do it now + if ( read_next_pixel ) + { + // load however much data we did have + if ( tga_indexed ) + { + // read in index, then perform the lookup + int pal_idx = (tga_bits_per_pixel == 8) ? 
stbi__get8(s) : stbi__get16le(s); + if ( pal_idx >= tga_palette_len ) { + // invalid index + pal_idx = 0; + } + pal_idx *= tga_comp; + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = tga_palette[pal_idx+j]; + } + } else if(tga_rgb16) { + STBI_ASSERT(tga_comp == STBI_rgb); + stbi__tga_read_rgb16(s, raw_data); + } else { + // read in the data raw + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = stbi__get8(s); + } + } + // clear the reading flag for the next pixel + read_next_pixel = 0; + } // end of reading a pixel + + // copy data + for (j = 0; j < tga_comp; ++j) + tga_data[i*tga_comp+j] = raw_data[j]; + + // in case we're in RLE mode, keep counting down + --RLE_count; + } + // do I need to invert the image? + if ( tga_inverted ) + { + for (j = 0; j*2 < tga_height; ++j) + { + int index1 = j * tga_width * tga_comp; + int index2 = (tga_height - 1 - j) * tga_width * tga_comp; + for (i = tga_width * tga_comp; i > 0; --i) + { + unsigned char temp = tga_data[index1]; + tga_data[index1] = tga_data[index2]; + tga_data[index2] = temp; + ++index1; + ++index2; + } + } + } + // clear my palette, if I had one + if ( tga_palette != NULL ) + { + STBI_FREE( tga_palette ); + } + } + + // swap RGB - if the source data was RGB16, it already is in the right order + if (tga_comp >= 3 && !tga_rgb16) + { + unsigned char* tga_pixel = tga_data; + for (i=0; i < tga_width * tga_height; ++i) + { + unsigned char temp = tga_pixel[0]; + tga_pixel[0] = tga_pixel[2]; + tga_pixel[2] = temp; + tga_pixel += tga_comp; + } + } + + // convert to target component count + if (req_comp && req_comp != tga_comp) + tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height); + + // the things I do to get rid of an error message, and yet keep + // Microsoft's C compilers happy... 
[8^( + tga_palette_start = tga_palette_len = tga_palette_bits = + tga_x_origin = tga_y_origin = 0; + STBI_NOTUSED(tga_palette_start); + // OK, done + return tga_data; +} +#endif + +// ************************************************************************************************* +// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB + +#ifndef STBI_NO_PSD +static int stbi__psd_test(stbi__context *s) +{ + int r = (stbi__get32be(s) == 0x38425053); + stbi__rewind(s); + return r; +} + +static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount) +{ + int count, nleft, len; + + count = 0; + while ((nleft = pixelCount - count) > 0) { + len = stbi__get8(s); + if (len == 128) { + // No-op. + } else if (len < 128) { + // Copy next len+1 bytes literally. + len++; + if (len > nleft) return 0; // corrupt data + count += len; + while (len) { + *p = stbi__get8(s); + p += 4; + len--; + } + } else if (len > 128) { + stbi_uc val; + // Next -len+1 bytes in the dest are replicated from next source byte. + // (Interpret len as a negative 8-bit int.) + len = 257 - len; + if (len > nleft) return 0; // corrupt data + val = stbi__get8(s); + count += len; + while (len) { + *p = val; + p += 4; + len--; + } + } + } + + return 1; +} + +static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) +{ + int pixelCount; + int channelCount, compression; + int channel, i; + int bitdepth; + int w,h; + stbi_uc *out; + STBI_NOTUSED(ri); + + // Check identifier + if (stbi__get32be(s) != 0x38425053) // "8BPS" + return stbi__errpuc("not PSD", "Corrupt PSD image"); + + // Check file type version. + if (stbi__get16be(s) != 1) + return stbi__errpuc("wrong version", "Unsupported version of PSD image"); + + // Skip 6 reserved bytes. + stbi__skip(s, 6 ); + + // Read the number of channels (R, G, B, A, etc). 
+ channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) + return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image"); + + // Read the rows and columns of the image. + h = stbi__get32be(s); + w = stbi__get32be(s); + + if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + // Make sure the depth is 8 bits. + bitdepth = stbi__get16be(s); + if (bitdepth != 8 && bitdepth != 16) + return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit"); + + // Make sure the color mode is RGB. + // Valid options are: + // 0: Bitmap + // 1: Grayscale + // 2: Indexed color + // 3: RGB color + // 4: CMYK color + // 7: Multichannel + // 8: Duotone + // 9: Lab color + if (stbi__get16be(s) != 3) + return stbi__errpuc("wrong color format", "PSD is not in RGB color format"); + + // Skip the Mode Data. (It's the palette for indexed color; other info for other modes.) + stbi__skip(s,stbi__get32be(s) ); + + // Skip the image resources. (resolution, pen tool paths, etc) + stbi__skip(s, stbi__get32be(s) ); + + // Skip the reserved data. + stbi__skip(s, stbi__get32be(s) ); + + // Find out if the data is compressed. + // Known values: + // 0: no compression + // 1: RLE compressed + compression = stbi__get16be(s); + if (compression > 1) + return stbi__errpuc("bad compression", "PSD has an unknown compression format"); + + // Check size + if (!stbi__mad3sizes_valid(4, w, h, 0)) + return stbi__errpuc("too large", "Corrupt PSD"); + + // Create the destination image. + + if (!compression && bitdepth == 16 && bpc == 16) { + out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0); + ri->bits_per_channel = 16; + } else + out = (stbi_uc *) stbi__malloc(4 * w*h); + + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + pixelCount = w*h; + + // Initialize the data to zero. 
+ //memset( out, 0, pixelCount * 4 ); + + // Finally, the image data. + if (compression) { + // RLE as used by .PSD and .TIFF + // Loop until you get the number of unpacked bytes you are expecting: + // Read the next source byte into n. + // If n is between 0 and 127 inclusive, copy the next n+1 bytes literally. + // Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times. + // Else if n is 128, noop. + // Endloop + + // The RLE-compressed data is preceded by a 2-byte data count for each row in the data, + // which we're going to just skip. + stbi__skip(s, h * channelCount * 2 ); + + // Read the RLE data by channel. + for (channel = 0; channel < 4; channel++) { + stbi_uc *p; + + p = out+channel; + if (channel >= channelCount) { + // Fill this channel with default data. + for (i = 0; i < pixelCount; i++, p += 4) + *p = (channel == 3 ? 255 : 0); + } else { + // Read the RLE data. + if (!stbi__psd_decode_rle(s, p, pixelCount)) { + STBI_FREE(out); + return stbi__errpuc("corrupt", "bad RLE data"); + } + } + } + + } else { + // We're at the raw image data. It's each channel in order (Red, Green, Blue, Alpha, ...) + // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image. + + // Read the data by channel. + for (channel = 0; channel < 4; channel++) { + if (channel >= channelCount) { + // Fill this channel with default data. + if (bitdepth == 16 && bpc == 16) { + stbi__uint16 *q = ((stbi__uint16 *) out) + channel; + stbi__uint16 val = channel == 3 ? 65535 : 0; + for (i = 0; i < pixelCount; i++, q += 4) + *q = val; + } else { + stbi_uc *p = out+channel; + stbi_uc val = channel == 3 ? 
255 : 0; + for (i = 0; i < pixelCount; i++, p += 4) + *p = val; + } + } else { + if (ri->bits_per_channel == 16) { // output bpc + stbi__uint16 *q = ((stbi__uint16 *) out) + channel; + for (i = 0; i < pixelCount; i++, q += 4) + *q = (stbi__uint16) stbi__get16be(s); + } else { + stbi_uc *p = out+channel; + if (bitdepth == 16) { // input bpc + for (i = 0; i < pixelCount; i++, p += 4) + *p = (stbi_uc) (stbi__get16be(s) >> 8); + } else { + for (i = 0; i < pixelCount; i++, p += 4) + *p = stbi__get8(s); + } + } + } + } + } + + // remove weird white matte from PSD + if (channelCount >= 4) { + if (ri->bits_per_channel == 16) { + for (i=0; i < w*h; ++i) { + stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i; + if (pixel[3] != 0 && pixel[3] != 65535) { + float a = pixel[3] / 65535.0f; + float ra = 1.0f / a; + float inv_a = 65535.0f * (1 - ra); + pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a); + pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a); + pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a); + } + } + } else { + for (i=0; i < w*h; ++i) { + unsigned char *pixel = out + 4*i; + if (pixel[3] != 0 && pixel[3] != 255) { + float a = pixel[3] / 255.0f; + float ra = 1.0f / a; + float inv_a = 255.0f * (1 - ra); + pixel[0] = (unsigned char) (pixel[0]*ra + inv_a); + pixel[1] = (unsigned char) (pixel[1]*ra + inv_a); + pixel[2] = (unsigned char) (pixel[2]*ra + inv_a); + } + } + } + } + + // convert to desired output format + if (req_comp && req_comp != 4) { + if (ri->bits_per_channel == 16) + out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h); + else + out = stbi__convert_format(out, 4, req_comp, w, h); + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + + if (comp) *comp = 4; + *y = h; + *x = w; + + return out; +} +#endif + +// ************************************************************************************************* +// Softimage PIC loader +// by Tom Seddon +// +// See 
http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format +// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/ + +#ifndef STBI_NO_PIC +static int stbi__pic_is4(stbi__context *s,const char *str) +{ + int i; + for (i=0; i<4; ++i) + if (stbi__get8(s) != (stbi_uc)str[i]) + return 0; + + return 1; +} + +static int stbi__pic_test_core(stbi__context *s) +{ + int i; + + if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) + return 0; + + for(i=0;i<84;++i) + stbi__get8(s); + + if (!stbi__pic_is4(s,"PICT")) + return 0; + + return 1; +} + +typedef struct +{ + stbi_uc size,type,channel; +} stbi__pic_packet; + +static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest) +{ + int mask=0x80, i; + + for (i=0; i<4; ++i, mask>>=1) { + if (channel & mask) { + if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short"); + dest[i]=stbi__get8(s); + } + } + + return dest; +} + +static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src) +{ + int mask=0x80,i; + + for (i=0;i<4; ++i, mask>>=1) + if (channel&mask) + dest[i]=src[i]; +} + +static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result) +{ + int act_comp=0,num_packets=0,y,chained; + stbi__pic_packet packets[10]; + + // this will (should...) cater for even some bizarre stuff like having data + // for the same channel in multiple packets. + do { + stbi__pic_packet *packet; + + if (num_packets==sizeof(packets)/sizeof(packets[0])) + return stbi__errpuc("bad format","too many packets"); + + packet = &packets[num_packets++]; + + chained = stbi__get8(s); + packet->size = stbi__get8(s); + packet->type = stbi__get8(s); + packet->channel = stbi__get8(s); + + act_comp |= packet->channel; + + if (stbi__at_eof(s)) return stbi__errpuc("bad file","file too short (reading packets)"); + if (packet->size != 8) return stbi__errpuc("bad format","packet isn't 8bpp"); + } while (chained); + + *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel? 
+ + for(y=0; ytype) { + default: + return stbi__errpuc("bad format","packet has bad compression type"); + + case 0: {//uncompressed + int x; + + for(x=0;xchannel,dest)) + return 0; + break; + } + + case 1://Pure RLE + { + int left=width, i; + + while (left>0) { + stbi_uc count,value[4]; + + count=stbi__get8(s); + if (stbi__at_eof(s)) return stbi__errpuc("bad file","file too short (pure read count)"); + + if (count > left) + count = (stbi_uc) left; + + if (!stbi__readval(s,packet->channel,value)) return 0; + + for(i=0; ichannel,dest,value); + left -= count; + } + } + break; + + case 2: {//Mixed RLE + int left=width; + while (left>0) { + int count = stbi__get8(s), i; + if (stbi__at_eof(s)) return stbi__errpuc("bad file","file too short (mixed read count)"); + + if (count >= 128) { // Repeated + stbi_uc value[4]; + + if (count==128) + count = stbi__get16be(s); + else + count -= 127; + if (count > left) + return stbi__errpuc("bad file","scanline overrun"); + + if (!stbi__readval(s,packet->channel,value)) + return 0; + + for(i=0;ichannel,dest,value); + } else { // Raw + ++count; + if (count>left) return stbi__errpuc("bad file","scanline overrun"); + + for(i=0;ichannel,dest)) + return 0; + } + left-=count; + } + break; + } + } + } + } + + return result; +} + +static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri) +{ + stbi_uc *result; + int i, x,y, internal_comp; + STBI_NOTUSED(ri); + + if (!comp) comp = &internal_comp; + + for (i=0; i<92; ++i) + stbi__get8(s); + + x = stbi__get16be(s); + y = stbi__get16be(s); + + if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + if (stbi__at_eof(s)) return stbi__errpuc("bad file","file too short (pic header)"); + if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode"); + + stbi__get32be(s); 
//skip `ratio' + stbi__get16be(s); //skip `fields' + stbi__get16be(s); //skip `pad' + + // intermediate buffer is RGBA + result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0); + if (!result) return stbi__errpuc("outofmem", "Out of memory"); + memset(result, 0xff, x*y*4); + + if (!stbi__pic_load_core(s,x,y,comp, result)) { + STBI_FREE(result); + result=0; + } + *px = x; + *py = y; + if (req_comp == 0) req_comp = *comp; + result=stbi__convert_format(result,4,req_comp,x,y); + + return result; +} + +static int stbi__pic_test(stbi__context *s) +{ + int r = stbi__pic_test_core(s); + stbi__rewind(s); + return r; +} +#endif + +// ************************************************************************************************* +// GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb + +#ifndef STBI_NO_GIF +typedef struct +{ + stbi__int16 prefix; + stbi_uc first; + stbi_uc suffix; +} stbi__gif_lzw; + +typedef struct +{ + int w,h; + stbi_uc *out; // output buffer (always 4 components) + stbi_uc *background; // The current "background" as far as a gif is concerned + stbi_uc *history; + int flags, bgindex, ratio, transparent, eflags; + stbi_uc pal[256][4]; + stbi_uc lpal[256][4]; + stbi__gif_lzw codes[8192]; + stbi_uc *color_table; + int parse, step; + int lflags; + int start_x, start_y; + int max_x, max_y; + int cur_x, cur_y; + int line_size; + int delay; +} stbi__gif; + +static int stbi__gif_test_raw(stbi__context *s) +{ + int sz; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0; + sz = stbi__get8(s); + if (sz != '9' && sz != '7') return 0; + if (stbi__get8(s) != 'a') return 0; + return 1; +} + +static int stbi__gif_test(stbi__context *s) +{ + int r = stbi__gif_test_raw(s); + stbi__rewind(s); + return r; +} + +static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp) +{ + int i; + for (i=0; i < num_entries; ++i) { + pal[i][2] = stbi__get8(s); + 
pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + pal[i][3] = transp == i ? 0 : 255; + } +} + +static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info) +{ + stbi_uc version; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') + return stbi__err("not GIF", "Corrupt GIF"); + + version = stbi__get8(s); + if (version != '7' && version != '9') return stbi__err("not GIF", "Corrupt GIF"); + if (stbi__get8(s) != 'a') return stbi__err("not GIF", "Corrupt GIF"); + + stbi__g_failure_reason = ""; + g->w = stbi__get16le(s); + g->h = stbi__get16le(s); + g->flags = stbi__get8(s); + g->bgindex = stbi__get8(s); + g->ratio = stbi__get8(s); + g->transparent = -1; + + if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + + if (comp != 0) *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments + + if (is_info) return 1; + + if (g->flags & 0x80) + stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1); + + return 1; +} + +static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp) +{ + stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif)); + if (!g) return stbi__err("outofmem", "Out of memory"); + if (!stbi__gif_header(s, g, comp, 1)) { + STBI_FREE(g); + stbi__rewind( s ); + return 0; + } + if (x) *x = g->w; + if (y) *y = g->h; + STBI_FREE(g); + return 1; +} + +static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code) +{ + stbi_uc *p, *c; + int idx; + + // recurse to decode the prefixes, since the linked-list is backwards, + // and working backwards through an interleaved image would be nasty + if (g->codes[code].prefix >= 0) + stbi__out_gif_code(g, g->codes[code].prefix); + + if (g->cur_y >= g->max_y) return; + + idx = g->cur_x + g->cur_y; + p = &g->out[idx]; + g->history[idx / 4] = 1; + + c = 
&g->color_table[g->codes[code].suffix * 4]; + if (c[3] > 128) { // don't render transparent pixels; + p[0] = c[2]; + p[1] = c[1]; + p[2] = c[0]; + p[3] = c[3]; + } + g->cur_x += 4; + + if (g->cur_x >= g->max_x) { + g->cur_x = g->start_x; + g->cur_y += g->step; + + while (g->cur_y >= g->max_y && g->parse > 0) { + g->step = (1 << g->parse) * g->line_size; + g->cur_y = g->start_y + (g->step >> 1); + --g->parse; + } + } +} + +static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g) +{ + stbi_uc lzw_cs; + stbi__int32 len, init_code; + stbi__uint32 first; + stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear; + stbi__gif_lzw *p; + + lzw_cs = stbi__get8(s); + if (lzw_cs > 12) return NULL; + clear = 1 << lzw_cs; + first = 1; + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + bits = 0; + valid_bits = 0; + for (init_code = 0; init_code < clear; init_code++) { + g->codes[init_code].prefix = -1; + g->codes[init_code].first = (stbi_uc) init_code; + g->codes[init_code].suffix = (stbi_uc) init_code; + } + + // support no starting clear code + avail = clear+2; + oldcode = -1; + + len = 0; + for(;;) { + if (valid_bits < codesize) { + if (len == 0) { + len = stbi__get8(s); // start new block + if (len == 0) + return g->out; + } + --len; + bits |= (stbi__int32) stbi__get8(s) << valid_bits; + valid_bits += 8; + } else { + stbi__int32 code = bits & codemask; + bits >>= codesize; + valid_bits -= codesize; + // @OPTIMIZE: is there some way we can accelerate the non-clear path? 
+ if (code == clear) { // clear code + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + avail = clear + 2; + oldcode = -1; + first = 0; + } else if (code == clear + 1) { // end of stream code + stbi__skip(s, len); + while ((len = stbi__get8(s)) > 0) + stbi__skip(s,len); + return g->out; + } else if (code <= avail) { + if (first) { + return stbi__errpuc("no clear code", "Corrupt GIF"); + } + + if (oldcode >= 0) { + p = &g->codes[avail++]; + if (avail > 8192) { + return stbi__errpuc("too many codes", "Corrupt GIF"); + } + + p->prefix = (stbi__int16) oldcode; + p->first = g->codes[oldcode].first; + p->suffix = (code == avail) ? p->first : g->codes[code].first; + } else if (code == avail) + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + + stbi__out_gif_code(g, (stbi__uint16) code); + + if ((avail & codemask) == 0 && avail <= 0x0FFF) { + codesize++; + codemask = (1 << codesize) - 1; + } + + oldcode = code; + } else { + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + } + } + } +} + +// this function is designed to support animated gifs, although stb_image doesn't support it +// two back is the image from two frames ago, used for a very specific disposal format +static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back) +{ + int dispose; + int first_frame; + int pi; + int pcount; + STBI_NOTUSED(req_comp); + + // on first frame, any non-written pixels get the background colour (non-transparent) + first_frame = 0; + if (g->out == 0) { + if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header + if (!stbi__mad3sizes_valid(4, g->w, g->h, 0)) + return stbi__errpuc("too large", "GIF image is too large"); + pcount = g->w * g->h; + g->out = (stbi_uc *) stbi__malloc(4 * pcount); + g->background = (stbi_uc *) stbi__malloc(4 * pcount); + g->history = (stbi_uc *) stbi__malloc(pcount); + if (!g->out || !g->background || !g->history) + return 
stbi__errpuc("outofmem", "Out of memory"); + + // image is treated as "transparent" at the start - ie, nothing overwrites the current background; + // background colour is only used for pixels that are not rendered first frame, after that "background" + // color refers to the color that was there the previous frame. + memset(g->out, 0x00, 4 * pcount); + memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent) + memset(g->history, 0x00, pcount); // pixels that were affected previous frame + first_frame = 1; + } else { + // second frame - how do we dispose of the previous one? + dispose = (g->eflags & 0x1C) >> 2; + pcount = g->w * g->h; + + if ((dispose == 3) && (two_back == 0)) { + dispose = 2; // if I don't have an image to revert back to, default to the old background + } + + if (dispose == 3) { // use previous graphic + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi]) { + memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 ); + } + } + } else if (dispose == 2) { + // restore what was changed last frame to background before that frame; + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi]) { + memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 ); + } + } + } else { + // This is a non-disposal case eithe way, so just + // leave the pixels as is, and they will become the new background + // 1: do not dispose + // 0: not specified. 
+ } + + // background is what out is after the undoing of the previou frame; + memcpy( g->background, g->out, 4 * g->w * g->h ); + } + + // clear my history; + memset( g->history, 0x00, g->w * g->h ); // pixels that were affected previous frame + + for (;;) { + int tag = stbi__get8(s); + switch (tag) { + case 0x2C: /* Image Descriptor */ + { + stbi__int32 x, y, w, h; + stbi_uc *o; + + x = stbi__get16le(s); + y = stbi__get16le(s); + w = stbi__get16le(s); + h = stbi__get16le(s); + if (((x + w) > (g->w)) || ((y + h) > (g->h))) + return stbi__errpuc("bad Image Descriptor", "Corrupt GIF"); + + g->line_size = g->w * 4; + g->start_x = x * 4; + g->start_y = y * g->line_size; + g->max_x = g->start_x + w * 4; + g->max_y = g->start_y + h * g->line_size; + g->cur_x = g->start_x; + g->cur_y = g->start_y; + + // if the width of the specified rectangle is 0, that means + // we may not see *any* pixels or the image is malformed; + // to make sure this is caught, move the current y down to + // max_y (which is what out_gif_code checks). + if (w == 0) + g->cur_y = g->max_y; + + g->lflags = stbi__get8(s); + + if (g->lflags & 0x40) { + g->step = 8 * g->line_size; // first interlaced spacing + g->parse = 3; + } else { + g->step = g->line_size; + g->parse = 0; + } + + if (g->lflags & 0x80) { + stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? 
g->transparent : -1); + g->color_table = (stbi_uc *) g->lpal; + } else if (g->flags & 0x80) { + g->color_table = (stbi_uc *) g->pal; + } else + return stbi__errpuc("missing color table", "Corrupt GIF"); + + o = stbi__process_gif_raster(s, g); + if (!o) return NULL; + + // if this was the first frame, + pcount = g->w * g->h; + if (first_frame && (g->bgindex > 0)) { + // if first frame, any pixel not drawn to gets the background color + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi] == 0) { + g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be; + memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 ); + } + } + } + + return o; + } + + case 0x21: // Comment Extension. + { + int len; + int ext = stbi__get8(s); + if (ext == 0xF9) { // Graphic Control Extension. + len = stbi__get8(s); + if (len == 4) { + g->eflags = stbi__get8(s); + g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths. + + // unset old transparent + if (g->transparent >= 0) { + g->pal[g->transparent][3] = 255; + } + if (g->eflags & 0x01) { + g->transparent = stbi__get8(s); + if (g->transparent >= 0) { + g->pal[g->transparent][3] = 0; + } + } else { + // don't need transparent + stbi__skip(s, 1); + g->transparent = -1; + } + } else { + stbi__skip(s, len); + break; + } + } + while ((len = stbi__get8(s)) != 0) { + stbi__skip(s, len); + } + break; + } + + case 0x3B: // gif stream termination code + return (stbi_uc *) s; // using '1' causes warning on some compilers + + default: + return stbi__errpuc("unknown code", "Corrupt GIF"); + } + } +} + +static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays) +{ + STBI_FREE(g->out); + STBI_FREE(g->history); + STBI_FREE(g->background); + + if (out) STBI_FREE(out); + if (delays && *delays) STBI_FREE(*delays); + return stbi__errpuc("outofmem", "Out of memory"); +} + +static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, 
int *y, int *z, int *comp, int req_comp) +{ + if (stbi__gif_test(s)) { + int layers = 0; + stbi_uc *u = 0; + stbi_uc *out = 0; + stbi_uc *two_back = 0; + stbi__gif g; + int stride; + int out_size = 0; + int delays_size = 0; + + STBI_NOTUSED(out_size); + STBI_NOTUSED(delays_size); + + memset(&g, 0, sizeof(g)); + if (delays) { + *delays = 0; + } + + do { + u = stbi__gif_load_next(s, &g, comp, req_comp, two_back); + if (u == (stbi_uc *) s) u = 0; // end of animated gif marker + + if (u) { + *x = g.w; + *y = g.h; + ++layers; + stride = g.w * g.h * 4; + + if (out) { + void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride ); + if (!tmp) + return stbi__load_gif_main_outofmem(&g, out, delays); + else { + out = (stbi_uc*) tmp; + out_size = layers * stride; + } + + if (delays) { + int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers ); + if (!new_delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + *delays = new_delays; + delays_size = layers * sizeof(int); + } + } else { + out = (stbi_uc*)stbi__malloc( layers * stride ); + if (!out) + return stbi__load_gif_main_outofmem(&g, out, delays); + out_size = layers * stride; + if (delays) { + *delays = (int*) stbi__malloc( layers * sizeof(int) ); + if (!*delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + delays_size = layers * sizeof(int); + } + } + memcpy( out + ((layers - 1) * stride), u, stride ); + if (layers >= 2) { + two_back = out - 2 * stride; + } + + if (delays) { + (*delays)[layers - 1U] = g.delay; + } + } + } while (u != 0); + + // free temp buffer; + STBI_FREE(g.out); + STBI_FREE(g.history); + STBI_FREE(g.background); + + // do the final conversion after loading everything; + if (req_comp && req_comp != 4) + out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h); + + *z = layers; + return out; + } else { + return stbi__errpuc("not GIF", "Image was not as a gif type."); + } +} + +static void *stbi__gif_load(stbi__context *s, int 
*x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *u = 0; + stbi__gif g; + memset(&g, 0, sizeof(g)); + STBI_NOTUSED(ri); + + u = stbi__gif_load_next(s, &g, comp, req_comp, 0); + if (u == (stbi_uc *) s) u = 0; // end of animated gif marker + if (u) { + *x = g.w; + *y = g.h; + + // moved conversion to after successful load so that the same + // can be done for multiple frames. + if (req_comp && req_comp != 4) + u = stbi__convert_format(u, 4, req_comp, g.w, g.h); + } else if (g.out) { + // if there was an error and we allocated an image buffer, free it! + STBI_FREE(g.out); + } + + // free buffers needed for multiple frame loading; + STBI_FREE(g.history); + STBI_FREE(g.background); + + return u; +} + +static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp) +{ + return stbi__gif_info_raw(s,x,y,comp); +} +#endif + +// ************************************************************************************************* +// Radiance RGBE HDR loader +// originally by Nicolas Schulz +#ifndef STBI_NO_HDR +static int stbi__hdr_test_core(stbi__context *s, const char *signature) +{ + int i; + for (i=0; signature[i]; ++i) + if (stbi__get8(s) != signature[i]) + return 0; + stbi__rewind(s); + return 1; +} + +static int stbi__hdr_test(stbi__context* s) +{ + int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); + stbi__rewind(s); + if(!r) { + r = stbi__hdr_test_core(s, "#?RGBE\n"); + stbi__rewind(s); + } + return r; +} + +#define STBI__HDR_BUFLEN 1024 +static char *stbi__hdr_gettoken(stbi__context *z, char *buffer) +{ + int len=0; + char c = '\0'; + + c = (char) stbi__get8(z); + + while (!stbi__at_eof(z) && c != '\n') { + buffer[len++] = c; + if (len == STBI__HDR_BUFLEN-1) { + // flush to end of line + while (!stbi__at_eof(z) && stbi__get8(z) != '\n') + ; + break; + } + c = (char) stbi__get8(z); + } + + buffer[len] = 0; + return buffer; +} + +static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp) +{ + if ( input[3] != 0 ) { + float f1; 
+ // Exponent + f1 = (float) ldexp(1.0f, input[3] - (int)(128 + 8)); + if (req_comp <= 2) + output[0] = (input[0] + input[1] + input[2]) * f1 / 3; + else { + output[0] = input[0] * f1; + output[1] = input[1] * f1; + output[2] = input[2] * f1; + } + if (req_comp == 2) output[1] = 1; + if (req_comp == 4) output[3] = 1; + } else { + switch (req_comp) { + case 4: output[3] = 1; /* fallthrough */ + case 3: output[0] = output[1] = output[2] = 0; + break; + case 2: output[1] = 1; /* fallthrough */ + case 1: output[0] = 0; + break; + } + } +} + +static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + char buffer[STBI__HDR_BUFLEN]; + char *token; + int valid = 0; + int width, height; + stbi_uc *scanline; + float *hdr_data; + int len; + unsigned char count, value; + int i, j, k, c1,c2, z; + const char *headerToken; + STBI_NOTUSED(ri); + + // Check identifier + headerToken = stbi__hdr_gettoken(s,buffer); + if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0) + return stbi__errpf("not HDR", "Corrupt HDR image"); + + // Parse header + for(;;) { + token = stbi__hdr_gettoken(s,buffer); + if (token[0] == 0) break; + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1; + } + + if (!valid) return stbi__errpf("unsupported format", "Unsupported HDR format"); + + // Parse width and height + // can't use sscanf() if we're not using stdio! 
+ token = stbi__hdr_gettoken(s,buffer); + if (strncmp(token, "-Y ", 3)) return stbi__errpf("unsupported data layout", "Unsupported HDR format"); + token += 3; + height = (int) strtol(token, &token, 10); + while (*token == ' ') ++token; + if (strncmp(token, "+X ", 3)) return stbi__errpf("unsupported data layout", "Unsupported HDR format"); + token += 3; + width = (int) strtol(token, NULL, 10); + + if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)"); + if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)"); + + *x = width; + *y = height; + + if (comp) *comp = 3; + if (req_comp == 0) req_comp = 3; + + if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0)) + return stbi__errpf("too large", "HDR image is too large"); + + // Read data + hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0); + if (!hdr_data) + return stbi__errpf("outofmem", "Out of memory"); + + // Load image data + // image data is stored as some number of sca + if ( width < 8 || width >= 32768) { + // Read flat data + for (j=0; j < height; ++j) { + for (i=0; i < width; ++i) { + stbi_uc rgbe[4]; + main_decode_loop: + stbi__getn(s, rgbe, 4); + stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp); + } + } + } else { + // Read RLE-encoded data + scanline = NULL; + + for (j = 0; j < height; ++j) { + c1 = stbi__get8(s); + c2 = stbi__get8(s); + len = stbi__get8(s); + if (c1 != 2 || c2 != 2 || (len & 0x80)) { + // not run-length encoded, so we have to actually use THIS data as a decoded + // pixel (note this can't be a valid pixel--one of RGB must be >= 128) + stbi_uc rgbe[4]; + rgbe[0] = (stbi_uc) c1; + rgbe[1] = (stbi_uc) c2; + rgbe[2] = (stbi_uc) len; + rgbe[3] = (stbi_uc) stbi__get8(s); + stbi__hdr_convert(hdr_data, rgbe, req_comp); + i = 1; + j = 0; + STBI_FREE(scanline); + goto main_decode_loop; // yes, this makes no sense + } + len <<= 8; + len |= 
stbi__get8(s); + if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); } + if (scanline == NULL) { + scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0); + if (!scanline) { + STBI_FREE(hdr_data); + return stbi__errpf("outofmem", "Out of memory"); + } + } + + for (k = 0; k < 4; ++k) { + int nleft; + i = 0; + while ((nleft = width - i) > 0) { + count = stbi__get8(s); + if (count > 128) { + // Run + value = stbi__get8(s); + count -= 128; + if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } + for (z = 0; z < count; ++z) + scanline[i++ * 4 + k] = value; + } else { + // Dump + if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } + for (z = 0; z < count; ++z) + scanline[i++ * 4 + k] = stbi__get8(s); + } + } + } + for (i=0; i < width; ++i) + stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp); + } + if (scanline) + STBI_FREE(scanline); + } + + return hdr_data; +} + +static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp) +{ + char buffer[STBI__HDR_BUFLEN]; + char *token; + int valid = 0; + int dummy; + + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + + if (stbi__hdr_test(s) == 0) { + stbi__rewind( s ); + return 0; + } + + for(;;) { + token = stbi__hdr_gettoken(s,buffer); + if (token[0] == 0) break; + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1; + } + + if (!valid) { + stbi__rewind( s ); + return 0; + } + token = stbi__hdr_gettoken(s,buffer); + if (strncmp(token, "-Y ", 3)) { + stbi__rewind( s ); + return 0; + } + token += 3; + *y = (int) strtol(token, &token, 10); + while (*token == ' ') ++token; + if (strncmp(token, "+X ", 3)) { + stbi__rewind( s ); + return 0; + } + token += 3; + *x = (int) strtol(token, NULL, 10); + *comp = 3; + return 1; +} 
+#endif // STBI_NO_HDR + +#ifndef STBI_NO_BMP +static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp) +{ + void *p; + stbi__bmp_data info; + + info.all_a = 255; + p = stbi__bmp_parse_header(s, &info); + if (p == NULL) { + stbi__rewind( s ); + return 0; + } + if (x) *x = s->img_x; + if (y) *y = s->img_y; + if (comp) { + if (info.bpp == 24 && info.ma == 0xff000000) + *comp = 3; + else + *comp = info.ma ? 4 : 3; + } + return 1; +} +#endif + +#ifndef STBI_NO_PSD +static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp) +{ + int channelCount, dummy, depth; + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind( s ); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind( s ); + return 0; + } + *y = stbi__get32be(s); + *x = stbi__get32be(s); + depth = stbi__get16be(s); + if (depth != 8 && depth != 16) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 3) { + stbi__rewind( s ); + return 0; + } + *comp = 4; + return 1; +} + +static int stbi__psd_is16(stbi__context *s) +{ + int channelCount, depth; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind( s ); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind( s ); + return 0; + } + STBI_NOTUSED(stbi__get32be(s)); + STBI_NOTUSED(stbi__get32be(s)); + depth = stbi__get16be(s); + if (depth != 16) { + stbi__rewind( s ); + return 0; + } + return 1; +} +#endif + +#ifndef STBI_NO_PIC +static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp) +{ + int act_comp=0,num_packets=0,chained,dummy; + stbi__pic_packet packets[10]; + + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + + if 
(!stbi__pic_is4(s,"\x53\x80\xF6\x34")) { + stbi__rewind(s); + return 0; + } + + stbi__skip(s, 88); + + *x = stbi__get16be(s); + *y = stbi__get16be(s); + if (stbi__at_eof(s)) { + stbi__rewind( s); + return 0; + } + if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) { + stbi__rewind( s ); + return 0; + } + + stbi__skip(s, 8); + + do { + stbi__pic_packet *packet; + + if (num_packets==sizeof(packets)/sizeof(packets[0])) + return 0; + + packet = &packets[num_packets++]; + chained = stbi__get8(s); + packet->size = stbi__get8(s); + packet->type = stbi__get8(s); + packet->channel = stbi__get8(s); + act_comp |= packet->channel; + + if (stbi__at_eof(s)) { + stbi__rewind( s ); + return 0; + } + if (packet->size != 8) { + stbi__rewind( s ); + return 0; + } + } while (chained); + + *comp = (act_comp & 0x10 ? 4 : 3); + + return 1; +} +#endif + +// ************************************************************************************************* +// Portable Gray Map and Portable Pixel Map loader +// by Ken Miller +// +// PGM: http://netpbm.sourceforge.net/doc/pgm.html +// PPM: http://netpbm.sourceforge.net/doc/ppm.html +// +// Known limitations: +// Does not support comments in the header section +// Does not support ASCII image data (formats P2 and P3) + +#ifndef STBI_NO_PNM + +static int stbi__pnm_test(stbi__context *s) +{ + char p, t; + p = (char) stbi__get8(s); + t = (char) stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { + stbi__rewind( s ); + return 0; + } + return 1; +} + +static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *out; + STBI_NOTUSED(ri); + + ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n); + if (ri->bits_per_channel == 0) + return 0; + + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + *x 
= s->img_x; + *y = s->img_y; + if (comp) *comp = s->img_n; + + if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0)) + return stbi__errpuc("too large", "PNM too large"); + + out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) { + STBI_FREE(out); + return stbi__errpuc("bad PNM", "PNM file truncated"); + } + + if (req_comp && req_comp != s->img_n) { + if (ri->bits_per_channel == 16) { + out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y); + } else { + out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y); + } + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + return out; +} + +static int stbi__pnm_isspace(char c) +{ + return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; +} + +static void stbi__pnm_skip_whitespace(stbi__context *s, char *c) +{ + for (;;) { + while (!stbi__at_eof(s) && stbi__pnm_isspace(*c)) + *c = (char) stbi__get8(s); + + if (stbi__at_eof(s) || *c != '#') + break; + + while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' ) + *c = (char) stbi__get8(s); + } +} + +static int stbi__pnm_isdigit(char c) +{ + return c >= '0' && c <= '9'; +} + +static int stbi__pnm_getinteger(stbi__context *s, char *c) +{ + int value = 0; + + while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) { + value = value*10 + (*c - '0'); + *c = (char) stbi__get8(s); + if((value > 214748364) || (value == 214748364 && *c > '7')) + return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int"); + } + + return value; +} + +static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp) +{ + int maxv, dummy; + char c, p, t; + + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = 
&dummy; + + stbi__rewind(s); + + // Get identifier + p = (char) stbi__get8(s); + t = (char) stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { + stbi__rewind(s); + return 0; + } + + *comp = (t == '6') ? 3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm + + c = (char) stbi__get8(s); + stbi__pnm_skip_whitespace(s, &c); + + *x = stbi__pnm_getinteger(s, &c); // read width + if(*x == 0) + return stbi__err("invalid width", "PPM image header had zero or overflowing width"); + stbi__pnm_skip_whitespace(s, &c); + + *y = stbi__pnm_getinteger(s, &c); // read height + if (*y == 0) + return stbi__err("invalid width", "PPM image header had zero or overflowing width"); + stbi__pnm_skip_whitespace(s, &c); + + maxv = stbi__pnm_getinteger(s, &c); // read max value + if (maxv > 65535) + return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images"); + else if (maxv > 255) + return 16; + else + return 8; +} + +static int stbi__pnm_is16(stbi__context *s) +{ + if (stbi__pnm_info(s, NULL, NULL, NULL) == 16) + return 1; + return 0; +} +#endif + +static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp) +{ + #ifndef STBI_NO_JPEG + if (stbi__jpeg_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PNG + if (stbi__png_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_GIF + if (stbi__gif_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_BMP + if (stbi__bmp_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PSD + if (stbi__psd_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PIC + if (stbi__pic_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PNM + if (stbi__pnm_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_HDR + if (stbi__hdr_info(s, x, y, comp)) return 1; + #endif + + // test tga last because it's a crappy test! 
+ #ifndef STBI_NO_TGA + if (stbi__tga_info(s, x, y, comp)) + return 1; + #endif + return stbi__err("unknown image type", "Image not of any known type, or corrupt"); +} + +static int stbi__is_16_main(stbi__context *s) +{ + #ifndef STBI_NO_PNG + if (stbi__png_is16(s)) return 1; + #endif + + #ifndef STBI_NO_PSD + if (stbi__psd_is16(s)) return 1; + #endif + + #ifndef STBI_NO_PNM + if (stbi__pnm_is16(s)) return 1; + #endif + return 0; +} + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result; + if (!f) return stbi__err("can't fopen", "Unable to open file"); + result = stbi_info_from_file(f, x, y, comp); + fclose(f); + return result; +} + +STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp) +{ + int r; + stbi__context s; + long pos = ftell(f); + stbi__start_file(&s, f); + r = stbi__info_main(&s,x,y,comp); + fseek(f,pos,SEEK_SET); + return r; +} + +STBIDEF int stbi_is_16_bit(char const *filename) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result; + if (!f) return stbi__err("can't fopen", "Unable to open file"); + result = stbi_is_16_bit_from_file(f); + fclose(f); + return result; +} + +STBIDEF int stbi_is_16_bit_from_file(FILE *f) +{ + int r; + stbi__context s; + long pos = ftell(f); + stbi__start_file(&s, f); + r = stbi__is_16_main(&s); + fseek(f,pos,SEEK_SET); + return r; +} +#endif // !STBI_NO_STDIO + +STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__info_main(&s,x,y,comp); +} + +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user); + return stbi__info_main(&s,x,y,comp); +} + +STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + 
return stbi__is_16_main(&s); +} + +STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user); + return stbi__is_16_main(&s); +} + +#endif // STB_IMAGE_IMPLEMENTATION + +/* + revision history: + 2.20 (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs + 2.19 (2018-02-11) fix warning + 2.18 (2018-01-30) fix warnings + 2.17 (2018-01-29) change sbti__shiftsigned to avoid clang -O2 bug + 1-bit BMP + *_is_16_bit api + avoid warnings + 2.16 (2017-07-23) all functions have 16-bit variants; + STBI_NO_STDIO works again; + compilation fixes; + fix rounding in unpremultiply; + optimize vertical flip; + disable raw_len validation; + documentation fixes + 2.15 (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode; + warning fixes; disable run-time SSE detection on gcc; + uniform handling of optional "return" values; + thread-safe initialization of zlib tables + 2.14 (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs + 2.13 (2016-11-29) add 16-bit API, only supported for PNG right now + 2.12 (2016-04-02) fix typo in 2.11 PSD fix that caused crashes + 2.11 (2016-04-02) allocate large structures on the stack + remove white matting for transparent PSD + fix reported channel count for PNG & BMP + re-enable SSE2 in non-gcc 64-bit + support RGB-formatted JPEG + read 16-bit PNGs (only as 8-bit) + 2.10 (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED + 2.09 (2016-01-16) allow comments in PNM files + 16-bit-per-pixel TGA (not bit-per-component) + info() for TGA could break due to .hdr handling + info() for BMP to shares code instead of sloppy parse + can use STBI_REALLOC_SIZED if allocator doesn't support realloc + code cleanup + 2.08 (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA + 2.07 (2015-09-13) fix compiler warnings + partial animated GIF support + limited 16-bpc PSD support + #ifdef unused 
functions + bug with < 92 byte PIC,PNM,HDR,TGA + 2.06 (2015-04-19) fix bug where PSD returns wrong '*comp' value + 2.05 (2015-04-19) fix bug in progressive JPEG handling, fix warning + 2.04 (2015-04-15) try to re-enable SIMD on MinGW 64-bit + 2.03 (2015-04-12) extra corruption checking (mmozeiko) + stbi_set_flip_vertically_on_load (nguillemot) + fix NEON support; fix mingw support + 2.02 (2015-01-19) fix incorrect assert, fix warning + 2.01 (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2 + 2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG + 2.00 (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg) + progressive JPEG (stb) + PGM/PPM support (Ken Miller) + STBI_MALLOC,STBI_REALLOC,STBI_FREE + GIF bugfix -- seemingly never worked + STBI_NO_*, STBI_ONLY_* + 1.48 (2014-12-14) fix incorrectly-named assert() + 1.47 (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb) + optimize PNG (ryg) + fix bug in interlaced PNG with user-specified channel count (stb) + 1.46 (2014-08-26) + fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG + 1.45 (2014-08-16) + fix MSVC-ARM internal compiler error by wrapping malloc + 1.44 (2014-08-07) + various warning fixes from Ronny Chevalier + 1.43 (2014-07-15) + fix MSVC-only compiler problem in code changed in 1.42 + 1.42 (2014-07-09) + don't define _CRT_SECURE_NO_WARNINGS (affects user code) + fixes to stbi__cleanup_jpeg path + added STBI_ASSERT to avoid requiring assert.h + 1.41 (2014-06-25) + fix search&replace from 1.36 that messed up comments/error messages + 1.40 (2014-06-22) + fix gcc struct-initialization warning + 1.39 (2014-06-15) + fix to TGA optimization when req_comp != number of components in TGA; + fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite) + add support for BMP version 5 (more ignored fields) + 1.38 (2014-06-06) + suppress MSVC warnings on integer casts truncating values + fix accidental rename of 
'skip' field of I/O + 1.37 (2014-06-04) + remove duplicate typedef + 1.36 (2014-06-03) + convert to header file single-file library + if de-iphone isn't set, load iphone images color-swapped instead of returning NULL + 1.35 (2014-05-27) + various warnings + fix broken STBI_SIMD path + fix bug where stbi_load_from_file no longer left file pointer in correct place + fix broken non-easy path for 32-bit BMP (possibly never used) + TGA optimization by Arseny Kapoulkine + 1.34 (unknown) + use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case + 1.33 (2011-07-14) + make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements + 1.32 (2011-07-13) + support for "info" function for all supported filetypes (SpartanJ) + 1.31 (2011-06-20) + a few more leak fixes, bug in PNG handling (SpartanJ) + 1.30 (2011-06-11) + added ability to load files via callbacks to accomidate custom input streams (Ben Wenger) + removed deprecated format-specific test/load functions + removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway + error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha) + fix inefficiency in decoding 32-bit BMP (David Woo) + 1.29 (2010-08-16) + various warning fixes from Aurelien Pocheville + 1.28 (2010-08-01) + fix bug in GIF palette transparency (SpartanJ) + 1.27 (2010-08-01) + cast-to-stbi_uc to fix warnings + 1.26 (2010-07-24) + fix bug in file buffering for PNG reported by SpartanJ + 1.25 (2010-07-17) + refix trans_data warning (Won Chun) + 1.24 (2010-07-12) + perf improvements reading from files on platforms with lock-heavy fgetc() + minor perf improvements for jpeg + deprecated type-specific functions so we'll get feedback if they're needed + attempt to fix trans_data warning (Won Chun) + 1.23 fixed bug in iPhone support + 1.22 (2010-07-10) + removed image *writing* support + stbi_info support from Jetro Lauha + GIF support from 
Jean-Marc Lienher + iPhone PNG-extensions from James Brown + warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err. Janez (U+017D)emva) + 1.21 fix use of 'stbi_uc' in header (reported by jon blow) + 1.20 added support for Softimage PIC, by Tom Seddon + 1.19 bug in interlaced PNG corruption check (found by ryg) + 1.18 (2008-08-02) + fix a threading bug (local mutable static) + 1.17 support interlaced PNG + 1.16 major bugfix - stbi__convert_format converted one too many pixels + 1.15 initialize some fields for thread safety + 1.14 fix threadsafe conversion bug + header-file-only version (#define STBI_HEADER_FILE_ONLY before including) + 1.13 threadsafe + 1.12 const qualifiers in the API + 1.11 Support installable IDCT, colorspace conversion routines + 1.10 Fixes for 64-bit (don't use "unsigned long") + optimized upsampling by Fabian "ryg" Giesen + 1.09 Fix format-conversion for PSD code (bad global variables!) + 1.08 Thatcher Ulrich's PSD code integrated by Nicolas Schulz + 1.07 attempt to fix C++ warning/errors again + 1.06 attempt to fix C++ warning/errors again + 1.05 fix TGA loading to return correct *comp and use good luminance calc + 1.04 default float alpha is 1, not 255; use 'void *' for stbi_image_free + 1.03 bugfixes to STBI_NO_STDIO, STBI_NO_HDR + 1.02 support for (subset of) HDR files, float interface for preferred access to them + 1.01 fix bug: possible bug in handling right-side up bmps... 
not sure + fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all + 1.00 interface to zlib that skips zlib header + 0.99 correct handling of alpha in palette + 0.98 TGA loader by lonesock; dynamically add loaders (untested) + 0.97 jpeg errors on too large a file; also catch another malloc failure + 0.96 fix detection of invalid v value - particleman@mollyrocket forum + 0.95 during header scan, seek to markers in case of padding + 0.94 STBI_NO_STDIO to disable stdio usage; rename all #defines the same + 0.93 handle jpegtran output; verbose errors + 0.92 read 4,8,16,24,32-bit BMP files of several formats + 0.91 output 24-bit Windows 3.0 BMP files + 0.90 fix a few more warnings; bump version number to approach 1.0 + 0.61 bugfixes due to Marc LeBlanc, Christopher Lloyd + 0.60 fix compiling as c++ + 0.59 fix warnings: merge Dave Moore's -Wall fixes + 0.58 fix bug: zlib uncompressed mode len/nlen was wrong endian + 0.57 fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available + 0.56 fix bug: zlib uncompressed mode len vs. nlen + 0.55 fix bug: restart_interval not initialized to 0 + 0.54 allow NULL for 'int *comp' + 0.53 fix bug in png 3->4; speedup png decoding + 0.52 png handles req_comp=3,4 directly; minor cleanup; jpeg comments + 0.51 obey req_comp requests, 1-component jpegs return as 1-component, + on 'test' only check type, not whether we support this variant + 0.50 (2006-11-19) + first released version +*/ + + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. 
+------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. 
We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +------------------------------------------------------------------------------ +*/ diff --git a/Source/stb_image_resize.cpp b/Source/stb_image_resize.cpp new file mode 100644 index 0000000..c5371c2 --- /dev/null +++ b/Source/stb_image_resize.cpp @@ -0,0 +1,2 @@ +#define STB_IMAGE_RESIZE_IMPLEMENTATION +#include "stb_image_resize.h" \ No newline at end of file diff --git a/Source/stb_image_resize.h b/Source/stb_image_resize.h new file mode 100644 index 0000000..ef9e6fe --- /dev/null +++ b/Source/stb_image_resize.h @@ -0,0 +1,2634 @@ +/* stb_image_resize - v0.97 - public domain image resizing + by Jorge L Rodriguez (@VinoBS) - 2014 + http://github.com/nothings/stb + + Written with emphasis on usability, portability, and efficiency. (No + SIMD or threads, so it be easily outperformed by libs that use those.) + Only scaling and translation is supported, no rotations or shears. + Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation. + + COMPILING & LINKING + In one C/C++ file that #includes this file, do this: + #define STB_IMAGE_RESIZE_IMPLEMENTATION + before the #include. That will create the implementation in that file. + + QUICKSTART + stbir_resize_uint8( input_pixels , in_w , in_h , 0, + output_pixels, out_w, out_h, 0, num_channels) + stbir_resize_float(...) 
+ stbir_resize_uint8_srgb( input_pixels , in_w , in_h , 0, + output_pixels, out_w, out_h, 0, + num_channels , alpha_chan , 0) + stbir_resize_uint8_srgb_edgemode( + input_pixels , in_w , in_h , 0, + output_pixels, out_w, out_h, 0, + num_channels , alpha_chan , 0, STBIR_EDGE_CLAMP) + // WRAP/REFLECT/ZERO + + FULL API + See the "header file" section of the source for API documentation. + + ADDITIONAL DOCUMENTATION + + SRGB & FLOATING POINT REPRESENTATION + The sRGB functions presume IEEE floating point. If you do not have + IEEE floating point, define STBIR_NON_IEEE_FLOAT. This will use + a slower implementation. + + MEMORY ALLOCATION + The resize functions here perform a single memory allocation using + malloc. To control the memory allocation, before the #include that + triggers the implementation, do: + + #define STBIR_MALLOC(size,context) ... + #define STBIR_FREE(ptr,context) ... + + Each resize function makes exactly one call to malloc/free, so to use + temp memory, store the temp memory in the context and return that. + + ASSERT + Define STBIR_ASSERT(boolval) to override assert() and not use assert.h + + OPTIMIZATION + Define STBIR_SATURATE_INT to compute clamp values in-range using + integer operations instead of float operations. This may be faster + on some platforms. + + DEFAULT FILTERS + For functions which don't provide explicit control over what filters + to use, you can change the compile-time defaults with + + #define STBIR_DEFAULT_FILTER_UPSAMPLE STBIR_FILTER_something + #define STBIR_DEFAULT_FILTER_DOWNSAMPLE STBIR_FILTER_something + + See stbir_filter in the header-file section for the list of filters. + + NEW FILTERS + A number of 1D filter kernels are used. For a list of + supported filters see the stbir_filter enum. To add a new filter, + write a filter function and add it to stbir__filter_info_table. 
+ + PROGRESS + For interactive use with slow resize operations, you can install + a progress-report callback: + + #define STBIR_PROGRESS_REPORT(val) some_func(val) + + The parameter val is a float which goes from 0 to 1 as progress is made. + + For example: + + static void my_progress_report(float progress); + #define STBIR_PROGRESS_REPORT(val) my_progress_report(val) + + #define STB_IMAGE_RESIZE_IMPLEMENTATION + #include "stb_image_resize.h" + + static void my_progress_report(float progress) + { + printf("Progress: %f%%\n", progress*100); + } + + MAX CHANNELS + If your image has more than 64 channels, define STBIR_MAX_CHANNELS + to the max you'll have. + + ALPHA CHANNEL + Most of the resizing functions provide the ability to control how + the alpha channel of an image is processed. The important things + to know about this: + + 1. The best mathematically-behaved version of alpha to use is + called "premultiplied alpha", in which the other color channels + have had the alpha value multiplied in. If you use premultiplied + alpha, linear filtering (such as image resampling done by this + library, or performed in texture units on GPUs) does the "right + thing". While premultiplied alpha is standard in the movie CGI + industry, it is still uncommon in the videogame/real-time world. + + If you linearly filter non-premultiplied alpha, strange effects + occur. (For example, the 50/50 average of 99% transparent bright green + and 1% transparent black produces 50% transparent dark green when + non-premultiplied, whereas premultiplied it produces 50% + transparent near-black. The former introduces green energy + that doesn't exist in the source image.) + + 2. Artists should not edit premultiplied-alpha images; artists + want non-premultiplied alpha images. Thus, art tools generally output + non-premultiplied alpha images. + + 3. You will get best results in most cases by converting images + to premultiplied alpha before processing them mathematically. + + 4. 
If you pass the flag STBIR_FLAG_ALPHA_PREMULTIPLIED, the + resizer does not do anything special for the alpha channel; + it is resampled identically to other channels. This produces + the correct results for premultiplied-alpha images, but produces + less-than-ideal results for non-premultiplied-alpha images. + + 5. If you do not pass the flag STBIR_FLAG_ALPHA_PREMULTIPLIED, + then the resizer weights the contribution of input pixels + based on their alpha values, or, equivalently, it multiplies + the alpha value into the color channels, resamples, then divides + by the resultant alpha value. Input pixels which have alpha=0 do + not contribute at all to output pixels unless _all_ of the input + pixels affecting that output pixel have alpha=0, in which case + the result for that pixel is the same as it would be without + STBIR_FLAG_ALPHA_PREMULTIPLIED. However, this is only true for + input images in integer formats. For input images in float format, + input pixels with alpha=0 have no effect, and output pixels + which have alpha=0 will be 0 in all channels. (For float images, + you can manually achieve the same result by adding a tiny epsilon + value to the alpha channel of every image, and then subtracting + or clamping it at the end.) + + 6. You can suppress the behavior described in #5 and make + all-0-alpha pixels have 0 in all channels by #defining + STBIR_NO_ALPHA_EPSILON. + + 7. You can separately control whether the alpha channel is + interpreted as linear or affected by the colorspace. By default + it is linear; you almost never want to apply the colorspace. + (For example, graphics hardware does not apply sRGB conversion + to the alpha channel.) 
+ + CONTRIBUTORS + Jorge L Rodriguez: Implementation + Sean Barrett: API design, optimizations + Aras Pranckevicius: bugfix + Nathan Reed: warning fixes + + REVISIONS + 0.97 (2020-02-02) fixed warning + 0.96 (2019-03-04) fixed warnings + 0.95 (2017-07-23) fixed warnings + 0.94 (2017-03-18) fixed warnings + 0.93 (2017-03-03) fixed bug with certain combinations of heights + 0.92 (2017-01-02) fix integer overflow on large (>2GB) images + 0.91 (2016-04-02) fix warnings; fix handling of subpixel regions + 0.90 (2014-09-17) first released version + + LICENSE + See end of file for license information. + + TODO + Don't decode all of the image data when only processing a partial tile + Don't use full-width decode buffers when only processing a partial tile + When processing wide images, break processing into tiles so data fits in L1 cache + Installable filters? + Resize that respects alpha test coverage + (Reference code: FloatImage::alphaTestCoverage and FloatImage::scaleAlphaToCoverage: + https://code.google.com/p/nvidia-texture-tools/source/browse/trunk/src/nvimage/FloatImage.cpp ) +*/ + +#ifndef STBIR_INCLUDE_STB_IMAGE_RESIZE_H +#define STBIR_INCLUDE_STB_IMAGE_RESIZE_H + +#ifdef _MSC_VER +typedef unsigned char stbir_uint8; +typedef unsigned short stbir_uint16; +typedef unsigned int stbir_uint32; +#else +#include +typedef uint8_t stbir_uint8; +typedef uint16_t stbir_uint16; +typedef uint32_t stbir_uint32; +#endif + +#ifndef STBIRDEF +#ifdef STB_IMAGE_RESIZE_STATIC +#define STBIRDEF static +#else +#ifdef __cplusplus +#define STBIRDEF extern "C" +#else +#define STBIRDEF extern +#endif +#endif +#endif + +////////////////////////////////////////////////////////////////////////////// +// +// Easy-to-use API: +// +// * "input pixels" points to an array of image data with 'num_channels' channels (e.g. 
RGB=3, RGBA=4) +// * input_w is input image width (x-axis), input_h is input image height (y-axis) +// * stride is the offset between successive rows of image data in memory, in bytes. you can +// specify 0 to mean packed continuously in memory +// * alpha channel is treated identically to other channels. +// * colorspace is linear or sRGB as specified by function name +// * returned result is 1 for success or 0 in case of an error. +// #define STBIR_ASSERT() to trigger an assert on parameter validation errors. +// * Memory required grows approximately linearly with input and output size, but with +// discontinuities at input_w == output_w and input_h == output_h. +// * These functions use a "default" resampling filter defined at compile time. To change the filter, +// you can change the compile-time defaults by #defining STBIR_DEFAULT_FILTER_UPSAMPLE +// and STBIR_DEFAULT_FILTER_DOWNSAMPLE, or you can use the medium-complexity API. + +STBIRDEF int stbir_resize_uint8( const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + int num_channels); + +STBIRDEF int stbir_resize_float( const float *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + float *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + int num_channels); + + +// The following functions interpret image data as gamma-corrected sRGB. +// Specify STBIR_ALPHA_CHANNEL_NONE if you have no alpha channel, +// or otherwise provide the index of the alpha channel. Flags value +// of 0 will probably do the right thing if you're not sure what +// the flags mean. + +#define STBIR_ALPHA_CHANNEL_NONE -1 + +// Set this flag if your texture has premultiplied alpha. Otherwise, stbir will +// use alpha-weighted resampling (effectively premultiplying, resampling, +// then unpremultiplying). 
+#define STBIR_FLAG_ALPHA_PREMULTIPLIED (1 << 0) +// The specified alpha channel should be handled as gamma-corrected value even +// when doing sRGB operations. +#define STBIR_FLAG_ALPHA_USES_COLORSPACE (1 << 1) + +STBIRDEF int stbir_resize_uint8_srgb(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + int num_channels, int alpha_channel, int flags); + + +typedef enum +{ + STBIR_EDGE_CLAMP = 1, + STBIR_EDGE_REFLECT = 2, + STBIR_EDGE_WRAP = 3, + STBIR_EDGE_ZERO = 4, +} stbir_edge; + +// This function adds the ability to specify how requests to sample off the edge of the image are handled. +STBIRDEF int stbir_resize_uint8_srgb_edgemode(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + int num_channels, int alpha_channel, int flags, + stbir_edge edge_wrap_mode); + +////////////////////////////////////////////////////////////////////////////// +// +// Medium-complexity API +// +// This extends the easy-to-use API as follows: +// +// * Alpha-channel can be processed separately +// * If alpha_channel is not STBIR_ALPHA_CHANNEL_NONE +// * Alpha channel will not be gamma corrected (unless flags&STBIR_FLAG_GAMMA_CORRECT) +// * Filters will be weighted by alpha channel (unless flags&STBIR_FLAG_ALPHA_PREMULTIPLIED) +// * Filter can be selected explicitly +// * uint16 image type +// * sRGB colorspace available for all types +// * context parameter for passing to STBIR_MALLOC + +typedef enum +{ + STBIR_FILTER_DEFAULT = 0, // use same filter type that easy-to-use API chooses + STBIR_FILTER_BOX = 1, // A trapezoid w/1-pixel wide ramps, same result as box for integer scale ratios + STBIR_FILTER_TRIANGLE = 2, // On upsampling, produces same results as bilinear texture filtering + STBIR_FILTER_CUBICBSPLINE = 3, // The cubic 
b-spline (aka Mitchell-Netravali with B=1,C=0), gaussian-esque + STBIR_FILTER_CATMULLROM = 4, // An interpolating cubic spline + STBIR_FILTER_MITCHELL = 5, // Mitchell-Netravali filter with B=1/3, C=1/3
STBIR_TYPE_UINT32, + STBIR_TYPE_FLOAT , + + STBIR_MAX_TYPES +} stbir_datatype; + +STBIRDEF int stbir_resize( const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_datatype datatype, + int num_channels, int alpha_channel, int flags, + stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, + stbir_filter filter_horizontal, stbir_filter filter_vertical, + stbir_colorspace space, void *alloc_context); + +STBIRDEF int stbir_resize_subpixel(const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_datatype datatype, + int num_channels, int alpha_channel, int flags, + stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, + stbir_filter filter_horizontal, stbir_filter filter_vertical, + stbir_colorspace space, void *alloc_context, + float x_scale, float y_scale, + float x_offset, float y_offset); + +STBIRDEF int stbir_resize_region( const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_datatype datatype, + int num_channels, int alpha_channel, int flags, + stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, + stbir_filter filter_horizontal, stbir_filter filter_vertical, + stbir_colorspace space, void *alloc_context, + float s0, float t0, float s1, float t1); +// (s0, t0) & (s1, t1) are the top-left and bottom right corner (uv addressing style: [0, 1]x[0, 1]) of a region of the input image to use. 
+ +// +// +//// end header file ///////////////////////////////////////////////////// +#endif // STBIR_INCLUDE_STB_IMAGE_RESIZE_H + + + + + +#ifdef STB_IMAGE_RESIZE_IMPLEMENTATION + +#ifndef STBIR_ASSERT +#include +#define STBIR_ASSERT(x) assert(x) +#endif + +// For memset +#include + +#include + +#ifndef STBIR_MALLOC +#include +// use comma operator to evaluate c, to avoid "unused parameter" warnings +#define STBIR_MALLOC(size,c) ((void)(c), malloc(size)) +#define STBIR_FREE(ptr,c) ((void)(c), free(ptr)) +#endif + +#ifndef _MSC_VER +#ifdef __cplusplus +#define stbir__inline inline +#else +#define stbir__inline +#endif +#else +#define stbir__inline __forceinline +#endif + + +// should produce compiler error if size is wrong +typedef unsigned char stbir__validate_uint32[sizeof(stbir_uint32) == 4 ? 1 : -1]; + +#ifdef _MSC_VER +#define STBIR__NOTUSED(v) (void)(v) +#else +#define STBIR__NOTUSED(v) (void)sizeof(v) +#endif + +#define STBIR__ARRAY_SIZE(a) (sizeof((a))/sizeof((a)[0])) + +#ifndef STBIR_DEFAULT_FILTER_UPSAMPLE +#define STBIR_DEFAULT_FILTER_UPSAMPLE STBIR_FILTER_CATMULLROM +#endif + +#ifndef STBIR_DEFAULT_FILTER_DOWNSAMPLE +#define STBIR_DEFAULT_FILTER_DOWNSAMPLE STBIR_FILTER_MITCHELL +#endif + +#ifndef STBIR_PROGRESS_REPORT +#define STBIR_PROGRESS_REPORT(float_0_to_1) +#endif + +#ifndef STBIR_MAX_CHANNELS +#define STBIR_MAX_CHANNELS 64 +#endif + +#if STBIR_MAX_CHANNELS > 65536 +#error "Too many channels; STBIR_MAX_CHANNELS must be no more than 65536." +// because we store the indices in 16-bit variables +#endif + +// This value is added to alpha just before premultiplication to avoid +// zeroing out color values. It is equivalent to 2^-80. If you don't want +// that behavior (it may interfere if you have floating point images with +// very small alpha values) then you can define STBIR_NO_ALPHA_EPSILON to +// disable it. 
+#ifndef STBIR_ALPHA_EPSILON +#define STBIR_ALPHA_EPSILON ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20)) +#endif + + + +#ifdef _MSC_VER +#define STBIR__UNUSED_PARAM(v) (void)(v) +#else +#define STBIR__UNUSED_PARAM(v) (void)sizeof(v) +#endif + +// must match stbir_datatype +static unsigned char stbir__type_size[] = { + 1, // STBIR_TYPE_UINT8 + 2, // STBIR_TYPE_UINT16 + 4, // STBIR_TYPE_UINT32 + 4, // STBIR_TYPE_FLOAT +}; + +// Kernel function centered at 0 +typedef float (stbir__kernel_fn)(float x, float scale); +typedef float (stbir__support_fn)(float scale); + +typedef struct +{ + stbir__kernel_fn* kernel; + stbir__support_fn* support; +} stbir__filter_info; + +// When upsampling, the contributors are which source pixels contribute. +// When downsampling, the contributors are which destination pixels are contributed to. +typedef struct +{ + int n0; // First contributing pixel + int n1; // Last contributing pixel +} stbir__contributors; + +typedef struct +{ + const void* input_data; + int input_w; + int input_h; + int input_stride_bytes; + + void* output_data; + int output_w; + int output_h; + int output_stride_bytes; + + float s0, t0, s1, t1; + + float horizontal_shift; // Units: output pixels + float vertical_shift; // Units: output pixels + float horizontal_scale; + float vertical_scale; + + int channels; + int alpha_channel; + stbir_uint32 flags; + stbir_datatype type; + stbir_filter horizontal_filter; + stbir_filter vertical_filter; + stbir_edge edge_horizontal; + stbir_edge edge_vertical; + stbir_colorspace colorspace; + + stbir__contributors* horizontal_contributors; + float* horizontal_coefficients; + + stbir__contributors* vertical_contributors; + float* vertical_coefficients; + + int decode_buffer_pixels; + float* decode_buffer; + + float* horizontal_buffer; + + // cache these because ceil/floor are inexplicably showing up in profile + int horizontal_coefficient_width; + int vertical_coefficient_width; + int horizontal_filter_pixel_width; + 
int vertical_filter_pixel_width; + int horizontal_filter_pixel_margin; + int vertical_filter_pixel_margin; + int horizontal_num_contributors; + int vertical_num_contributors; + + int ring_buffer_length_bytes; // The length of an individual entry in the ring buffer. The total number of ring buffers is stbir__get_filter_pixel_width(filter) + int ring_buffer_num_entries; // Total number of entries in the ring buffer. + int ring_buffer_first_scanline; + int ring_buffer_last_scanline; + int ring_buffer_begin_index; // first_scanline is at this index in the ring buffer + float* ring_buffer; + + float* encode_buffer; // A temporary buffer to store floats so we don't lose precision while we do multiply-adds. + + int horizontal_contributors_size; + int horizontal_coefficients_size; + int vertical_contributors_size; + int vertical_coefficients_size; + int decode_buffer_size; + int horizontal_buffer_size; + int ring_buffer_size; + int encode_buffer_size; +} stbir__info; + + +static const float stbir__max_uint8_as_float = 255.0f; +static const float stbir__max_uint16_as_float = 65535.0f; +static const double stbir__max_uint32_as_float = 4294967295.0; + + +static stbir__inline int stbir__min(int a, int b) +{ + return a < b ? 
a : b; +} + +static stbir__inline float stbir__saturate(float x) +{ + if (x < 0) + return 0; + + if (x > 1) + return 1; + + return x; +} + +#ifdef STBIR_SATURATE_INT +static stbir__inline stbir_uint8 stbir__saturate8(int x) +{ + if ((unsigned int) x <= 255) + return x; + + if (x < 0) + return 0; + + return 255; +} + +static stbir__inline stbir_uint16 stbir__saturate16(int x) +{ + if ((unsigned int) x <= 65535) + return x; + + if (x < 0) + return 0; + + return 65535; +} +#endif + +static float stbir__srgb_uchar_to_linear_float[256] = { + 0.000000f, 0.000304f, 0.000607f, 0.000911f, 0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f, + 0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f, 0.006049f, 0.006512f, 0.006995f, 0.007499f, + 0.008023f, 0.008568f, 0.009134f, 0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f, 0.014444f, + 0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f, 0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f, + 0.025187f, 0.026241f, 0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f, 0.035601f, 0.036889f, + 0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f, 0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f, + 0.054480f, 0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f, 0.068478f, 0.070360f, 0.072272f, + 0.074214f, 0.076185f, 0.078187f, 0.080220f, 0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f, + 0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f, 0.114435f, 0.116971f, 0.119538f, 0.122139f, + 0.124772f, 0.127438f, 0.130136f, 0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f, 0.152926f, + 0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f, 0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f, + 0.191202f, 0.194618f, 0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f, 
0.223228f, 0.226966f, + 0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f, 0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f, + 0.274677f, 0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f, 0.309469f, 0.313989f, 0.318547f, + 0.323143f, 0.327778f, 0.332452f, 0.337164f, 0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f, + 0.376262f, 0.381326f, 0.386430f, 0.391573f, 0.396755f, 0.401978f, 0.407240f, 0.412543f, 0.417885f, 0.423268f, 0.428691f, + 0.434154f, 0.439657f, 0.445201f, 0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473532f, 0.479320f, 0.485150f, 0.491021f, + 0.496933f, 0.502887f, 0.508881f, 0.514918f, 0.520996f, 0.527115f, 0.533276f, 0.539480f, 0.545725f, 0.552011f, 0.558340f, + 0.564712f, 0.571125f, 0.577581f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f, 0.623960f, 0.630757f, + 0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f, 0.672443f, 0.679543f, 0.686685f, 0.693872f, 0.701102f, 0.708376f, + 0.715694f, 0.723055f, 0.730461f, 0.737911f, 0.745404f, 0.752942f, 0.760525f, 0.768151f, 0.775822f, 0.783538f, 0.791298f, + 0.799103f, 0.806952f, 0.814847f, 0.822786f, 0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f, + 0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f, 0.947307f, 0.955974f, 0.964686f, 0.973445f, + 0.982251f, 0.991102f, 1.0f +}; + +static float stbir__srgb_to_linear(float f) +{ + if (f <= 0.04045f) + return f / 12.92f; + else + return (float)pow((f + 0.055f) / 1.055f, 2.4f); +} + +static float stbir__linear_to_srgb(float f) +{ + if (f <= 0.0031308f) + return f * 12.92f; + else + return 1.055f * (float)pow(f, 1 / 2.4f) - 0.055f; +} + +#ifndef STBIR_NON_IEEE_FLOAT +// From https://gist.github.com/rygorous/2203834 + +typedef union +{ + stbir_uint32 u; + float f; +} stbir__FP32; + +static const stbir_uint32 fp32_to_srgb8_tab4[104] = { + 0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 
0x009a000d, 0x00a1000d, + 0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a, + 0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033, + 0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067, + 0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5, + 0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2, + 0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143, + 0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af, + 0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240, + 0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300, + 0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401, + 0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559, + 0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723, +}; + +static stbir_uint8 stbir__linear_to_srgb_uchar(float in) +{ + static const stbir__FP32 almostone = { 0x3f7fffff }; // 1-eps + static const stbir__FP32 minval = { (127-13) << 23 }; + stbir_uint32 tab,bias,scale,t; + stbir__FP32 f; + + // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively. + // The tests are carefully written so that NaNs map to 0, same as in the reference + // implementation. 
+ if (!(in > minval.f)) // written this way to catch NaNs + in = minval.f; + if (in > almostone.f) + in = almostone.f; + + // Do the table lookup and unpack bias, scale + f.f = in; + tab = fp32_to_srgb8_tab4[(f.u - minval.u) >> 20]; + bias = (tab >> 16) << 9; + scale = tab & 0xffff; + + // Grab next-highest mantissa bits and perform linear interpolation + t = (f.u >> 12) & 0xff; + return (unsigned char) ((bias + scale*t) >> 16); +} + +#else +// sRGB transition values, scaled by 1<<28 +static int stbir__srgb_offset_to_linear_scaled[256] = +{ + 0, 40738, 122216, 203693, 285170, 366648, 448125, 529603, + 611080, 692557, 774035, 855852, 942009, 1033024, 1128971, 1229926, + 1335959, 1447142, 1563542, 1685229, 1812268, 1944725, 2082664, 2226148, + 2375238, 2529996, 2690481, 2856753, 3028870, 3206888, 3390865, 3580856, + 3776916, 3979100, 4187460, 4402049, 4622919, 4850123, 5083710, 5323731, + 5570236, 5823273, 6082892, 6349140, 6622065, 6901714, 7188133, 7481369, + 7781466, 8088471, 8402427, 8723380, 9051372, 9386448, 9728650, 10078021, + 10434603, 10798439, 11169569, 11548036, 11933879, 12327139, 12727857, 13136073, + 13551826, 13975156, 14406100, 14844697, 15290987, 15745007, 16206795, 16676389, + 17153826, 17639142, 18132374, 18633560, 19142734, 19659934, 20185196, 20718552, + 21260042, 21809696, 22367554, 22933648, 23508010, 24090680, 24681686, 25281066, + 25888850, 26505076, 27129772, 27762974, 28404716, 29055026, 29713942, 30381490, + 31057708, 31742624, 32436272, 33138682, 33849884, 34569912, 35298800, 36036568, + 36783260, 37538896, 38303512, 39077136, 39859796, 40651528, 41452360, 42262316, + 43081432, 43909732, 44747252, 45594016, 46450052, 47315392, 48190064, 49074096, + 49967516, 50870356, 51782636, 52704392, 53635648, 54576432, 55526772, 56486700, + 57456236, 58435408, 59424248, 60422780, 61431036, 62449032, 63476804, 64514376, + 65561776, 66619028, 67686160, 68763192, 69850160, 70947088, 72053992, 73170912, + 74297864, 75434880, 76581976, 77739184, 
78906536, 80084040, 81271736, 82469648, + 83677792, 84896192, 86124888, 87363888, 88613232, 89872928, 91143016, 92423512, + 93714432, 95015816, 96327688, 97650056, 98982952, 100326408, 101680440, 103045072, + 104420320, 105806224, 107202800, 108610064, 110028048, 111456776, 112896264, 114346544, + 115807632, 117279552, 118762328, 120255976, 121760536, 123276016, 124802440, 126339832, + 127888216, 129447616, 131018048, 132599544, 134192112, 135795792, 137410592, 139036528, + 140673648, 142321952, 143981456, 145652208, 147334208, 149027488, 150732064, 152447968, + 154175200, 155913792, 157663776, 159425168, 161197984, 162982240, 164777968, 166585184, + 168403904, 170234160, 172075968, 173929344, 175794320, 177670896, 179559120, 181458992, + 183370528, 185293776, 187228736, 189175424, 191133888, 193104112, 195086128, 197079968, + 199085648, 201103184, 203132592, 205173888, 207227120, 209292272, 211369392, 213458480, + 215559568, 217672656, 219797792, 221934976, 224084240, 226245600, 228419056, 230604656, + 232802400, 235012320, 237234432, 239468736, 241715280, 243974080, 246245120, 248528464, + 250824112, 253132064, 255452368, 257785040, 260130080, 262487520, 264857376, 267239664, +}; + +static stbir_uint8 stbir__linear_to_srgb_uchar(float f) +{ + int x = (int) (f * (1 << 28)); // has headroom so you don't need to clamp + int v = 0; + int i; + + // Refine the guess with a short binary search. 
+ i = v + 128; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i; + i = v + 64; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i; + i = v + 32; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i; + i = v + 16; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i; + i = v + 8; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i; + i = v + 4; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i; + i = v + 2; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i; + i = v + 1; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i; + + return (stbir_uint8) v; +} +#endif + +static float stbir__filter_trapezoid(float x, float scale) +{ + float halfscale = scale / 2; + float t = 0.5f + halfscale; + STBIR_ASSERT(scale <= 1); + + x = (float)fabs(x); + + if (x >= t) + return 0; + else + { + float r = 0.5f - halfscale; + if (x <= r) + return 1; + else + return (t - x) / scale; + } +} + +static float stbir__support_trapezoid(float scale) +{ + STBIR_ASSERT(scale <= 1); + return 0.5f + scale / 2; +} + +static float stbir__filter_triangle(float x, float s) +{ + STBIR__UNUSED_PARAM(s); + + x = (float)fabs(x); + + if (x <= 1.0f) + return 1 - x; + else + return 0; +} + +static float stbir__filter_cubic(float x, float s) +{ + STBIR__UNUSED_PARAM(s); + + x = (float)fabs(x); + + if (x < 1.0f) + return (4 + x*x*(3*x - 6))/6; + else if (x < 2.0f) + return (8 + x*(-12 + x*(6 - x)))/6; + + return (0.0f); +} + +static float stbir__filter_catmullrom(float x, float s) +{ + STBIR__UNUSED_PARAM(s); + + x = (float)fabs(x); + + if (x < 1.0f) + return 1 - x*x*(2.5f - 1.5f*x); + else if (x < 2.0f) + return 2 - x*(4 + x*(0.5f*x - 2.5f)); + + return (0.0f); +} + +static float stbir__filter_mitchell(float x, float s) +{ + STBIR__UNUSED_PARAM(s); + + x = (float)fabs(x); + + if (x < 1.0f) + return (16 + x*x*(21 * x - 36))/18; + else if (x < 2.0f) + return (32 + x*(-60 + x*(36 - 7*x)))/18; + + return (0.0f); +} + +static float stbir__support_zero(float s) +{ + 
STBIR__UNUSED_PARAM(s); + return 0; +} + +static float stbir__support_one(float s) +{ + STBIR__UNUSED_PARAM(s); + return 1; +} + +static float stbir__support_two(float s) +{ + STBIR__UNUSED_PARAM(s); + return 2; +} + +static stbir__filter_info stbir__filter_info_table[] = { + { NULL, stbir__support_zero }, + { stbir__filter_trapezoid, stbir__support_trapezoid }, + { stbir__filter_triangle, stbir__support_one }, + { stbir__filter_cubic, stbir__support_two }, + { stbir__filter_catmullrom, stbir__support_two }, + { stbir__filter_mitchell, stbir__support_two }, +}; + +stbir__inline static int stbir__use_upsampling(float ratio) +{ + return ratio > 1; +} + +stbir__inline static int stbir__use_width_upsampling(stbir__info* stbir_info) +{ + return stbir__use_upsampling(stbir_info->horizontal_scale); +} + +stbir__inline static int stbir__use_height_upsampling(stbir__info* stbir_info) +{ + return stbir__use_upsampling(stbir_info->vertical_scale); +} + +// This is the maximum number of input samples that can affect an output sample +// with the given filter +static int stbir__get_filter_pixel_width(stbir_filter filter, float scale) +{ + STBIR_ASSERT(filter != 0); + STBIR_ASSERT(filter < STBIR__ARRAY_SIZE(stbir__filter_info_table)); + + if (stbir__use_upsampling(scale)) + return (int)ceil(stbir__filter_info_table[filter].support(1/scale) * 2); + else + return (int)ceil(stbir__filter_info_table[filter].support(scale) * 2 / scale); +} + +// This is how much to expand buffers to account for filters seeking outside +// the image boundaries. 
+// How many pixels the filter can reach outside the image on one side:
+// simply half of the total filter width.
+static int stbir__get_filter_pixel_margin(stbir_filter filter, float scale)
+{
+    return stbir__get_filter_pixel_width(filter, scale) / 2;
+}
+
+// Number of coefficients stored per contributor group for this filter/scale.
+// The support radius is evaluated in the space we filter in: the inverse
+// scale when upsampling, the scale ratio itself when downsampling.
+static int stbir__get_coefficient_width(stbir_filter filter, float scale)
+{
+    float support_input = stbir__use_upsampling(scale) ? 1 / scale : scale;
+    return (int)ceil(stbir__filter_info_table[filter].support(support_input) * 2);
+}
+
+// Number of contributor entries needed: one per output pixel when
+// upsampling, one per margin-padded input pixel when downsampling.
+static int stbir__get_contributors(float scale, stbir_filter filter, int input_size, int output_size)
+{
+    if (stbir__use_upsampling(scale))
+        return output_size;
+    return input_size + 2 * stbir__get_filter_pixel_margin(filter, scale);
+}
+
+// Total number of floats in the horizontal coefficient array.
+static int stbir__get_total_horizontal_coefficients(stbir__info* info)
+{
+    int per_contributor = stbir__get_coefficient_width(info->horizontal_filter, info->horizontal_scale);
+    return info->horizontal_num_contributors * per_contributor;
+}
+
+// Total number of floats in the vertical coefficient array.
+static int stbir__get_total_vertical_coefficients(stbir__info* info)
+{
+    int per_contributor = stbir__get_coefficient_width(info->vertical_filter, info->vertical_scale);
+    return info->vertical_num_contributors * per_contributor;
+}
+
+// Address of the n'th contributor record.
+static stbir__contributors* stbir__get_contributor(stbir__contributors* contributors, int n)
+{
+    return contributors + n;
+}
+
+// For perf reasons this code is duplicated in stbir__resample_horizontal_upsample/downsample,
+// if you change it here change it there too.
+// Returns the address of coefficient c in the n'th coefficient group.
+// Groups are stored contiguously, stbir__get_coefficient_width() floats each.
+// For perf reasons this code is duplicated in stbir__resample_horizontal_upsample/downsample,
+// if you change it here change it there too.
+static float* stbir__get_coefficient(float* coefficients, stbir_filter filter, float scale, int n, int c)
+{
+    int width = stbir__get_coefficient_width(filter, scale);
+    return &coefficients[width*n + c];
+}
+
+// Map an out-of-range sample index n into [0, max) according to the edge
+// mode.  Only called (via stbir__edge_wrap below) when n is actually
+// outside [0, max), so the in-range returns are marked NOTREACHED.
+static int stbir__edge_wrap_slow(stbir_edge edge, int n, int max)
+{
+    switch (edge)
+    {
+    case STBIR_EDGE_ZERO:
+        return 0; // we'll decode the wrong pixel here, and then overwrite with 0s later
+
+    case STBIR_EDGE_CLAMP:
+        if (n < 0)
+            return 0;
+
+        if (n >= max)
+            return max - 1;
+
+        return n; // NOTREACHED
+
+    case STBIR_EDGE_REFLECT:
+    {
+        // NOTE(review): for n < 0 this returns -n without checking -n < max
+        // (the "n < max" test is always true for negative n when max > 0),
+        // so n <= -max would yield an out-of-range index.  Presumably the
+        // filter pixel margin guarantees |n| < max -- confirm before relying
+        // on this for larger negative n.
+        if (n < 0)
+        {
+            if (n < max)
+                return -n;
+            else
+                return max - 1;
+        }
+
+        if (n >= max)
+        {
+            int max2 = max * 2;
+            if (n >= max2)
+                return 0;
+            else
+                return max2 - n - 1;
+        }
+
+        return n; // NOTREACHED
+    }
+
+    case STBIR_EDGE_WRAP:
+        // True mathematical modulo: negative n is folded back into [0, max)
+        // rather than using C's implementation sign for a negative %.
+        if (n >= 0)
+            return (n % max);
+        else
+        {
+            int m = (-n) % max;
+
+            if (m != 0)
+                m = max - m;
+
+            return (m);
+        }
+        // NOTREACHED
+
+    default:
+        STBIR_ASSERT(!"Unimplemented edge type");
+        return 0;
+    }
+}
+
+// Fast path wrapper: in-range indices pass straight through; only
+// out-of-range indices pay for the switch in stbir__edge_wrap_slow().
+stbir__inline static int stbir__edge_wrap(stbir_edge edge, int n, int max)
+{
+    // avoid per-pixel switch
+    if (n >= 0 && n < max)
+        return n;
+    return stbir__edge_wrap_slow(edge, n, max);
+}
+
+// What input pixels contribute to this output pixel?
+static void stbir__calculate_sample_range_upsample(int n, float out_filter_radius, float scale_ratio, float out_shift, int* in_first_pixel, int* in_last_pixel, float* in_center_of_out) +{ + float out_pixel_center = (float)n + 0.5f; + float out_pixel_influence_lowerbound = out_pixel_center - out_filter_radius; + float out_pixel_influence_upperbound = out_pixel_center + out_filter_radius; + + float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) / scale_ratio; + float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) / scale_ratio; + + *in_center_of_out = (out_pixel_center + out_shift) / scale_ratio; + *in_first_pixel = (int)(floor(in_pixel_influence_lowerbound + 0.5)); + *in_last_pixel = (int)(floor(in_pixel_influence_upperbound - 0.5)); +} + +// What output pixels does this input pixel contribute to? +static void stbir__calculate_sample_range_downsample(int n, float in_pixels_radius, float scale_ratio, float out_shift, int* out_first_pixel, int* out_last_pixel, float* out_center_of_in) +{ + float in_pixel_center = (float)n + 0.5f; + float in_pixel_influence_lowerbound = in_pixel_center - in_pixels_radius; + float in_pixel_influence_upperbound = in_pixel_center + in_pixels_radius; + + float out_pixel_influence_lowerbound = in_pixel_influence_lowerbound * scale_ratio - out_shift; + float out_pixel_influence_upperbound = in_pixel_influence_upperbound * scale_ratio - out_shift; + + *out_center_of_in = in_pixel_center * scale_ratio - out_shift; + *out_first_pixel = (int)(floor(out_pixel_influence_lowerbound + 0.5)); + *out_last_pixel = (int)(floor(out_pixel_influence_upperbound - 0.5)); +} + +static void stbir__calculate_coefficients_upsample(stbir_filter filter, float scale, int in_first_pixel, int in_last_pixel, float in_center_of_out, stbir__contributors* contributor, float* coefficient_group) +{ + int i; + float total_filter = 0; + float filter_scale; + + STBIR_ASSERT(in_last_pixel - in_first_pixel <= 
(int)ceil(stbir__filter_info_table[filter].support(1/scale) * 2)); // Taken directly from stbir__get_coefficient_width() which we can't call because we don't know if we're horizontal or vertical. + + contributor->n0 = in_first_pixel; + contributor->n1 = in_last_pixel; + + STBIR_ASSERT(contributor->n1 >= contributor->n0); + + for (i = 0; i <= in_last_pixel - in_first_pixel; i++) + { + float in_pixel_center = (float)(i + in_first_pixel) + 0.5f; + coefficient_group[i] = stbir__filter_info_table[filter].kernel(in_center_of_out - in_pixel_center, 1 / scale); + + // If the coefficient is zero, skip it. (Don't do the <0 check here, we want the influence of those outside pixels.) + if (i == 0 && !coefficient_group[i]) + { + contributor->n0 = ++in_first_pixel; + i--; + continue; + } + + total_filter += coefficient_group[i]; + } + + // NOTE(fg): Not actually true in general, nor is there any reason to expect it should be. + // It would be true in exact math but is at best approximately true in floating-point math, + // and it would not make sense to try and put actual bounds on this here because it depends + // on the image aspect ratio which can get pretty extreme. + //STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(in_last_pixel + 1) + 0.5f - in_center_of_out, 1/scale) == 0); + + STBIR_ASSERT(total_filter > 0.9); + STBIR_ASSERT(total_filter < 1.1f); // Make sure it's not way off. + + // Make sure the sum of all coefficients is 1. + filter_scale = 1 / total_filter; + + for (i = 0; i <= in_last_pixel - in_first_pixel; i++) + coefficient_group[i] *= filter_scale; + + for (i = in_last_pixel - in_first_pixel; i >= 0; i--) + { + if (coefficient_group[i]) + break; + + // This line has no weight. We can skip it. 
+ contributor->n1 = contributor->n0 + i - 1; + } +} + +static void stbir__calculate_coefficients_downsample(stbir_filter filter, float scale_ratio, int out_first_pixel, int out_last_pixel, float out_center_of_in, stbir__contributors* contributor, float* coefficient_group) +{ + int i; + + STBIR_ASSERT(out_last_pixel - out_first_pixel <= (int)ceil(stbir__filter_info_table[filter].support(scale_ratio) * 2)); // Taken directly from stbir__get_coefficient_width() which we can't call because we don't know if we're horizontal or vertical. + + contributor->n0 = out_first_pixel; + contributor->n1 = out_last_pixel; + + STBIR_ASSERT(contributor->n1 >= contributor->n0); + + for (i = 0; i <= out_last_pixel - out_first_pixel; i++) + { + float out_pixel_center = (float)(i + out_first_pixel) + 0.5f; + float x = out_pixel_center - out_center_of_in; + coefficient_group[i] = stbir__filter_info_table[filter].kernel(x, scale_ratio) * scale_ratio; + } + + // NOTE(fg): Not actually true in general, nor is there any reason to expect it should be. + // It would be true in exact math but is at best approximately true in floating-point math, + // and it would not make sense to try and put actual bounds on this here because it depends + // on the image aspect ratio which can get pretty extreme. + //STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(out_last_pixel + 1) + 0.5f - out_center_of_in, scale_ratio) == 0); + + for (i = out_last_pixel - out_first_pixel; i >= 0; i--) + { + if (coefficient_group[i]) + break; + + // This line has no weight. We can skip it. 
+ contributor->n1 = contributor->n0 + i - 1; + } +} + +static void stbir__normalize_downsample_coefficients(stbir__contributors* contributors, float* coefficients, stbir_filter filter, float scale_ratio, int input_size, int output_size) +{ + int num_contributors = stbir__get_contributors(scale_ratio, filter, input_size, output_size); + int num_coefficients = stbir__get_coefficient_width(filter, scale_ratio); + int i, j; + int skip; + + for (i = 0; i < output_size; i++) + { + float scale; + float total = 0; + + for (j = 0; j < num_contributors; j++) + { + if (i >= contributors[j].n0 && i <= contributors[j].n1) + { + float coefficient = *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i - contributors[j].n0); + total += coefficient; + } + else if (i < contributors[j].n0) + break; + } + + STBIR_ASSERT(total > 0.9f); + STBIR_ASSERT(total < 1.1f); + + scale = 1 / total; + + for (j = 0; j < num_contributors; j++) + { + if (i >= contributors[j].n0 && i <= contributors[j].n1) + *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i - contributors[j].n0) *= scale; + else if (i < contributors[j].n0) + break; + } + } + + // Optimize: Skip zero coefficients and contributions outside of image bounds. + // Do this after normalizing because normalization depends on the n0/n1 values. 
+ for (j = 0; j < num_contributors; j++) + { + int range, max, width; + + skip = 0; + while (*stbir__get_coefficient(coefficients, filter, scale_ratio, j, skip) == 0) + skip++; + + contributors[j].n0 += skip; + + while (contributors[j].n0 < 0) + { + contributors[j].n0++; + skip++; + } + + range = contributors[j].n1 - contributors[j].n0 + 1; + max = stbir__min(num_coefficients, range); + + width = stbir__get_coefficient_width(filter, scale_ratio); + for (i = 0; i < max; i++) + { + if (i + skip >= width) + break; + + *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i) = *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i + skip); + } + + continue; + } + + // Using min to avoid writing into invalid pixels. + for (i = 0; i < num_contributors; i++) + contributors[i].n1 = stbir__min(contributors[i].n1, output_size - 1); +} + +// Each scan line uses the same kernel values so we should calculate the kernel +// values once and then we can use them for every scan line. +static void stbir__calculate_filters(stbir__contributors* contributors, float* coefficients, stbir_filter filter, float scale_ratio, float shift, int input_size, int output_size) +{ + int n; + int total_contributors = stbir__get_contributors(scale_ratio, filter, input_size, output_size); + + if (stbir__use_upsampling(scale_ratio)) + { + float out_pixels_radius = stbir__filter_info_table[filter].support(1 / scale_ratio) * scale_ratio; + + // Looping through out pixels + for (n = 0; n < total_contributors; n++) + { + float in_center_of_out; // Center of the current out pixel in the in pixel space + int in_first_pixel, in_last_pixel; + + stbir__calculate_sample_range_upsample(n, out_pixels_radius, scale_ratio, shift, &in_first_pixel, &in_last_pixel, &in_center_of_out); + + stbir__calculate_coefficients_upsample(filter, scale_ratio, in_first_pixel, in_last_pixel, in_center_of_out, stbir__get_contributor(contributors, n), stbir__get_coefficient(coefficients, filter, scale_ratio, n, 0)); + 
} + } + else + { + float in_pixels_radius = stbir__filter_info_table[filter].support(scale_ratio) / scale_ratio; + + // Looping through in pixels + for (n = 0; n < total_contributors; n++) + { + float out_center_of_in; // Center of the current out pixel in the in pixel space + int out_first_pixel, out_last_pixel; + int n_adjusted = n - stbir__get_filter_pixel_margin(filter, scale_ratio); + + stbir__calculate_sample_range_downsample(n_adjusted, in_pixels_radius, scale_ratio, shift, &out_first_pixel, &out_last_pixel, &out_center_of_in); + + stbir__calculate_coefficients_downsample(filter, scale_ratio, out_first_pixel, out_last_pixel, out_center_of_in, stbir__get_contributor(contributors, n), stbir__get_coefficient(coefficients, filter, scale_ratio, n, 0)); + } + + stbir__normalize_downsample_coefficients(contributors, coefficients, filter, scale_ratio, input_size, output_size); + } +} + +static float* stbir__get_decode_buffer(stbir__info* stbir_info) +{ + // The 0 index of the decode buffer starts after the margin. This makes + // it okay to use negative indexes on the decode buffer. 
+ return &stbir_info->decode_buffer[stbir_info->horizontal_filter_pixel_margin * stbir_info->channels]; +} + +#define STBIR__DECODE(type, colorspace) ((int)(type) * (STBIR_MAX_COLORSPACES) + (int)(colorspace)) + +static void stbir__decode_scanline(stbir__info* stbir_info, int n) +{ + int c; + int channels = stbir_info->channels; + int alpha_channel = stbir_info->alpha_channel; + int type = stbir_info->type; + int colorspace = stbir_info->colorspace; + int input_w = stbir_info->input_w; + size_t input_stride_bytes = stbir_info->input_stride_bytes; + float* decode_buffer = stbir__get_decode_buffer(stbir_info); + stbir_edge edge_horizontal = stbir_info->edge_horizontal; + stbir_edge edge_vertical = stbir_info->edge_vertical; + size_t in_buffer_row_offset = stbir__edge_wrap(edge_vertical, n, stbir_info->input_h) * input_stride_bytes; + const void* input_data = (char *) stbir_info->input_data + in_buffer_row_offset; + int max_x = input_w + stbir_info->horizontal_filter_pixel_margin; + int decode = STBIR__DECODE(type, colorspace); + + int x = -stbir_info->horizontal_filter_pixel_margin; + + // special handling for STBIR_EDGE_ZERO because it needs to return an item that doesn't appear in the input, + // and we want to avoid paying overhead on every pixel if not STBIR_EDGE_ZERO + if (edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->input_h)) + { + for (; x < max_x; x++) + for (c = 0; c < channels; c++) + decode_buffer[x*channels + c] = 0; + return; + } + + switch (decode) + { + case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_LINEAR): + for (; x < max_x; x++) + { + int decode_pixel_index = x * channels; + int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels; + for (c = 0; c < channels; c++) + decode_buffer[decode_pixel_index + c] = ((float)((const unsigned char*)input_data)[input_pixel_index + c]) / stbir__max_uint8_as_float; + } + break; + + case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_SRGB): + for (; x < max_x; x++) 
+ { + int decode_pixel_index = x * channels; + int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels; + for (c = 0; c < channels; c++) + decode_buffer[decode_pixel_index + c] = stbir__srgb_uchar_to_linear_float[((const unsigned char*)input_data)[input_pixel_index + c]]; + + if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE)) + decode_buffer[decode_pixel_index + alpha_channel] = ((float)((const unsigned char*)input_data)[input_pixel_index + alpha_channel]) / stbir__max_uint8_as_float; + } + break; + + case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_LINEAR): + for (; x < max_x; x++) + { + int decode_pixel_index = x * channels; + int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels; + for (c = 0; c < channels; c++) + decode_buffer[decode_pixel_index + c] = ((float)((const unsigned short*)input_data)[input_pixel_index + c]) / stbir__max_uint16_as_float; + } + break; + + case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_SRGB): + for (; x < max_x; x++) + { + int decode_pixel_index = x * channels; + int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels; + for (c = 0; c < channels; c++) + decode_buffer[decode_pixel_index + c] = stbir__srgb_to_linear(((float)((const unsigned short*)input_data)[input_pixel_index + c]) / stbir__max_uint16_as_float); + + if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE)) + decode_buffer[decode_pixel_index + alpha_channel] = ((float)((const unsigned short*)input_data)[input_pixel_index + alpha_channel]) / stbir__max_uint16_as_float; + } + break; + + case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_LINEAR): + for (; x < max_x; x++) + { + int decode_pixel_index = x * channels; + int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels; + for (c = 0; c < channels; c++) + decode_buffer[decode_pixel_index + c] = (float)(((double)((const unsigned int*)input_data)[input_pixel_index + c]) / 
stbir__max_uint32_as_float); + } + break; + + case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_SRGB): + for (; x < max_x; x++) + { + int decode_pixel_index = x * channels; + int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels; + for (c = 0; c < channels; c++) + decode_buffer[decode_pixel_index + c] = stbir__srgb_to_linear((float)(((double)((const unsigned int*)input_data)[input_pixel_index + c]) / stbir__max_uint32_as_float)); + + if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE)) + decode_buffer[decode_pixel_index + alpha_channel] = (float)(((double)((const unsigned int*)input_data)[input_pixel_index + alpha_channel]) / stbir__max_uint32_as_float); + } + break; + + case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_LINEAR): + for (; x < max_x; x++) + { + int decode_pixel_index = x * channels; + int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels; + for (c = 0; c < channels; c++) + decode_buffer[decode_pixel_index + c] = ((const float*)input_data)[input_pixel_index + c]; + } + break; + + case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_SRGB): + for (; x < max_x; x++) + { + int decode_pixel_index = x * channels; + int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels; + for (c = 0; c < channels; c++) + decode_buffer[decode_pixel_index + c] = stbir__srgb_to_linear(((const float*)input_data)[input_pixel_index + c]); + + if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE)) + decode_buffer[decode_pixel_index + alpha_channel] = ((const float*)input_data)[input_pixel_index + alpha_channel]; + } + + break; + + default: + STBIR_ASSERT(!"Unknown type/colorspace/channels combination."); + break; + } + + if (!(stbir_info->flags & STBIR_FLAG_ALPHA_PREMULTIPLIED)) + { + for (x = -stbir_info->horizontal_filter_pixel_margin; x < max_x; x++) + { + int decode_pixel_index = x * channels; + + // If the alpha value is 0 it will clobber the color values. Make sure it's not. 
+ float alpha = decode_buffer[decode_pixel_index + alpha_channel]; +#ifndef STBIR_NO_ALPHA_EPSILON + if (stbir_info->type != STBIR_TYPE_FLOAT) { + alpha += STBIR_ALPHA_EPSILON; + decode_buffer[decode_pixel_index + alpha_channel] = alpha; + } +#endif + for (c = 0; c < channels; c++) + { + if (c == alpha_channel) + continue; + + decode_buffer[decode_pixel_index + c] *= alpha; + } + } + } + + if (edge_horizontal == STBIR_EDGE_ZERO) + { + for (x = -stbir_info->horizontal_filter_pixel_margin; x < 0; x++) + { + for (c = 0; c < channels; c++) + decode_buffer[x*channels + c] = 0; + } + for (x = input_w; x < max_x; x++) + { + for (c = 0; c < channels; c++) + decode_buffer[x*channels + c] = 0; + } + } +} + +static float* stbir__get_ring_buffer_entry(float* ring_buffer, int index, int ring_buffer_length) +{ + return &ring_buffer[index * ring_buffer_length]; +} + +static float* stbir__add_empty_ring_buffer_entry(stbir__info* stbir_info, int n) +{ + int ring_buffer_index; + float* ring_buffer; + + stbir_info->ring_buffer_last_scanline = n; + + if (stbir_info->ring_buffer_begin_index < 0) + { + ring_buffer_index = stbir_info->ring_buffer_begin_index = 0; + stbir_info->ring_buffer_first_scanline = n; + } + else + { + ring_buffer_index = (stbir_info->ring_buffer_begin_index + (stbir_info->ring_buffer_last_scanline - stbir_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries; + STBIR_ASSERT(ring_buffer_index != stbir_info->ring_buffer_begin_index); + } + + ring_buffer = stbir__get_ring_buffer_entry(stbir_info->ring_buffer, ring_buffer_index, stbir_info->ring_buffer_length_bytes / sizeof(float)); + memset(ring_buffer, 0, stbir_info->ring_buffer_length_bytes); + + return ring_buffer; +} + + +static void stbir__resample_horizontal_upsample(stbir__info* stbir_info, float* output_buffer) +{ + int x, k; + int output_w = stbir_info->output_w; + int channels = stbir_info->channels; + float* decode_buffer = stbir__get_decode_buffer(stbir_info); + stbir__contributors* 
horizontal_contributors = stbir_info->horizontal_contributors;
    float* horizontal_coefficients = stbir_info->horizontal_coefficients;
    int coefficient_width = stbir_info->horizontal_coefficient_width;

    for (x = 0; x < output_w; x++)
    {
        // [n0, n1] is the inclusive contributor range for output pixel x.
        int n0 = horizontal_contributors[x].n0;
        int n1 = horizontal_contributors[x].n1;

        int out_pixel_index = x * channels;
        int coefficient_group = coefficient_width * x;
        int coefficient_counter = 0;

        STBIR_ASSERT(n1 >= n0);
        STBIR_ASSERT(n0 >= -stbir_info->horizontal_filter_pixel_margin);
        STBIR_ASSERT(n1 >= -stbir_info->horizontal_filter_pixel_margin);
        STBIR_ASSERT(n0 < stbir_info->input_w + stbir_info->horizontal_filter_pixel_margin);
        STBIR_ASSERT(n1 < stbir_info->input_w + stbir_info->horizontal_filter_pixel_margin);

        // Per-channel-count specializations of the same gather loop (1-4 common cases
        // are unrolled; default handles arbitrary channel counts).
        switch (channels) {
            case 1:
                for (k = n0; k <= n1; k++)
                {
                    int in_pixel_index = k * 1;
                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
                    STBIR_ASSERT(coefficient != 0);
                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                }
                break;
            case 2:
                for (k = n0; k <= n1; k++)
                {
                    int in_pixel_index = k * 2;
                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
                    STBIR_ASSERT(coefficient != 0);
                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
                }
                break;
            case 3:
                for (k = n0; k <= n1; k++)
                {
                    int in_pixel_index = k * 3;
                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
                    STBIR_ASSERT(coefficient != 0);
                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
                }
                break;
            case 4:
                for (k = n0; k <= n1; k++)
                {
                    int in_pixel_index = k * 4;
                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
                    STBIR_ASSERT(coefficient != 0);
                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
                    output_buffer[out_pixel_index + 3] += decode_buffer[in_pixel_index + 3] * coefficient;
                }
                break;
            default:
                for (k = n0; k <= n1; k++)
                {
                    int in_pixel_index = k * channels;
                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
                    int c;
                    STBIR_ASSERT(coefficient != 0);
                    for (c = 0; c < channels; c++)
                        output_buffer[out_pixel_index + c] += decode_buffer[in_pixel_index + c] * coefficient;
                }
                break;
        }
    }
}

// Horizontal resample for the downsampling case: iterate input pixels
// (including the filter margin) and scatter each one into the output pixels
// it contributes to.
static void stbir__resample_horizontal_downsample(stbir__info* stbir_info, float* output_buffer)
{
    int x, k;
    int input_w = stbir_info->input_w;
    int channels = stbir_info->channels;
    float* decode_buffer = stbir__get_decode_buffer(stbir_info);
    stbir__contributors* horizontal_contributors = stbir_info->horizontal_contributors;
    float* horizontal_coefficients = stbir_info->horizontal_coefficients;
    int coefficient_width = stbir_info->horizontal_coefficient_width;
    int filter_pixel_margin = stbir_info->horizontal_filter_pixel_margin;
    int max_x = input_w + filter_pixel_margin * 2;

    STBIR_ASSERT(!stbir__use_width_upsampling(stbir_info));

    switch (channels) {
        case 1:
            for (x = 0; x < max_x; x++)
            {
                int n0 = horizontal_contributors[x].n0;
                int n1 = horizontal_contributors[x].n1;

                int in_x = x - filter_pixel_margin;
                int in_pixel_index = in_x * 1;
                int max_n = n1;
                int coefficient_group = coefficient_width * x;

                for (k = n0; k <= max_n; k++)
                {
                    int out_pixel_index = k * 1;
                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                }
            }
            break;

        case 2:
            for (x = 0; x < max_x; x++)
            {
                int n0 = horizontal_contributors[x].n0;
                int n1 = horizontal_contributors[x].n1;

                int in_x = x - filter_pixel_margin;
                int in_pixel_index = in_x * 2;
                int max_n = n1;
                int coefficient_group = coefficient_width * x;

                for (k = n0; k <= max_n; k++)
                {
                    int out_pixel_index = k * 2;
                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
                }
            }
            break;

        case 3:
            for (x = 0; x < max_x; x++)
            {
                int n0 = horizontal_contributors[x].n0;
                int n1 = horizontal_contributors[x].n1;

                int in_x = x - filter_pixel_margin;
                int in_pixel_index = in_x * 3;
                int max_n = n1;
                int coefficient_group = coefficient_width * x;

                for (k = n0; k <= max_n; k++)
                {
                    int out_pixel_index = k * 3;
                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
                }
            }
            break;

        case 4:
            for (x = 0; x < max_x; x++)
            {
                int n0 = horizontal_contributors[x].n0;
                int n1 = horizontal_contributors[x].n1;

                int in_x = x - filter_pixel_margin;
                int in_pixel_index = in_x * 4;
                int max_n = n1;
                int coefficient_group = coefficient_width * x;

                for (k = n0; k <= max_n; k++)
                {
                    int out_pixel_index = k * 4;
                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
                    output_buffer[out_pixel_index + 3] += decode_buffer[in_pixel_index + 3] * coefficient;
                }
            }
            break;

        default:
            // Arbitrary channel counts: same scatter loop, inner per-channel iteration.
            for (x = 0; x < max_x; x++)
            {
                int n0 = horizontal_contributors[x].n0;
                int n1 = horizontal_contributors[x].n1;

                int in_x = x - filter_pixel_margin;
                int in_pixel_index = in_x * channels;
                int max_n = n1;
                int coefficient_group = coefficient_width * x;

                for (k = n0; k <= max_n; k++)
                {
                    int c;
                    int out_pixel_index = k * channels;
                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
                    for (c = 0; c < channels; c++)
                        output_buffer[out_pixel_index + c] += decode_buffer[in_pixel_index + c] * coefficient;
                }
            }
            break;
    }
}

// Height-upsampling path: decode scanline n and resample it horizontally
// straight into a fresh ring-buffer slot.
static void stbir__decode_and_resample_upsample(stbir__info* stbir_info, int n)
{
    // Decode the nth scanline from the source image into the decode buffer.
    stbir__decode_scanline(stbir_info, n);

    // Now resample it into the ring buffer.
    if (stbir__use_width_upsampling(stbir_info))
        stbir__resample_horizontal_upsample(stbir_info, stbir__add_empty_ring_buffer_entry(stbir_info, n));
    else
        stbir__resample_horizontal_downsample(stbir_info, stbir__add_empty_ring_buffer_entry(stbir_info, n));

    // Now it's sitting in the ring buffer ready to be used as source for the vertical sampling.
}

// Height-downsampling path: decode scanline n and resample it horizontally
// into the intermediate horizontal buffer (cleared first, since the resamplers
// accumulate with +=).
static void stbir__decode_and_resample_downsample(stbir__info* stbir_info, int n)
{
    // Decode the nth scanline from the source image into the decode buffer.
    stbir__decode_scanline(stbir_info, n);

    memset(stbir_info->horizontal_buffer, 0, stbir_info->output_w * stbir_info->channels * sizeof(float));

    // Now resample it into the horizontal buffer.
+ if (stbir__use_width_upsampling(stbir_info)) + stbir__resample_horizontal_upsample(stbir_info, stbir_info->horizontal_buffer); + else + stbir__resample_horizontal_downsample(stbir_info, stbir_info->horizontal_buffer); + + // Now it's sitting in the horizontal buffer ready to be distributed into the ring buffers. +} + +// Get the specified scan line from the ring buffer. +static float* stbir__get_ring_buffer_scanline(int get_scanline, float* ring_buffer, int begin_index, int first_scanline, int ring_buffer_num_entries, int ring_buffer_length) +{ + int ring_buffer_index = (begin_index + (get_scanline - first_scanline)) % ring_buffer_num_entries; + return stbir__get_ring_buffer_entry(ring_buffer, ring_buffer_index, ring_buffer_length); +} + + +static void stbir__encode_scanline(stbir__info* stbir_info, int num_pixels, void *output_buffer, float *encode_buffer, int channels, int alpha_channel, int decode) +{ + int x; + int n; + int num_nonalpha; + stbir_uint16 nonalpha[STBIR_MAX_CHANNELS]; + + if (!(stbir_info->flags&STBIR_FLAG_ALPHA_PREMULTIPLIED)) + { + for (x=0; x < num_pixels; ++x) + { + int pixel_index = x*channels; + + float alpha = encode_buffer[pixel_index + alpha_channel]; + float reciprocal_alpha = alpha ? 1.0f / alpha : 0; + + // unrolling this produced a 1% slowdown upscaling a large RGBA linear-space image on my machine - stb + for (n = 0; n < channels; n++) + if (n != alpha_channel) + encode_buffer[pixel_index + n] *= reciprocal_alpha; + + // We added in a small epsilon to prevent the color channel from being deleted with zero alpha. + // Because we only add it for integer types, it will automatically be discarded on integer + // conversion, so we don't need to subtract it back out (which would be problematic for + // numeric precision reasons). + } + } + + // build a table of all channels that need colorspace correction, so + // we don't perform colorspace correction on channels that don't need it. 
    for (x = 0, num_nonalpha = 0; x < channels; ++x)
    {
        // Alpha is included here only if the caller asked for sRGB alpha.
        if (x != alpha_channel || (stbir_info->flags & STBIR_FLAG_ALPHA_USES_COLORSPACE))
        {
            nonalpha[num_nonalpha++] = (stbir_uint16)x;
        }
    }

    #define STBIR__ROUND_INT(f)    ((int)          ((f)+0.5))
    #define STBIR__ROUND_UINT(f)   ((stbir_uint32) ((f)+0.5))

    #ifdef STBIR__SATURATE_INT
    #define STBIR__ENCODE_LINEAR8(f)   stbir__saturate8 (STBIR__ROUND_INT((f) * stbir__max_uint8_as_float ))
    #define STBIR__ENCODE_LINEAR16(f)  stbir__saturate16(STBIR__ROUND_INT((f) * stbir__max_uint16_as_float))
    #else
    #define STBIR__ENCODE_LINEAR8(f)   (unsigned char ) STBIR__ROUND_INT(stbir__saturate(f) * stbir__max_uint8_as_float )
    #define STBIR__ENCODE_LINEAR16(f)  (unsigned short) STBIR__ROUND_INT(stbir__saturate(f) * stbir__max_uint16_as_float)
    #endif

    // One case per (type, colorspace) pair, mirroring the decode switch.
    switch (decode)
    {
        case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_LINEAR):
            for (x=0; x < num_pixels; ++x)
            {
                int pixel_index = x*channels;

                for (n = 0; n < channels; n++)
                {
                    int index = pixel_index + n;
                    ((unsigned char*)output_buffer)[index] = STBIR__ENCODE_LINEAR8(encode_buffer[index]);
                }
            }
            break;

        case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_SRGB):
            for (x=0; x < num_pixels; ++x)
            {
                int pixel_index = x*channels;

                for (n = 0; n < num_nonalpha; n++)
                {
                    int index = pixel_index + nonalpha[n];
                    ((unsigned char*)output_buffer)[index] = stbir__linear_to_srgb_uchar(encode_buffer[index]);
                }

                if (!(stbir_info->flags & STBIR_FLAG_ALPHA_USES_COLORSPACE))
                    ((unsigned char *)output_buffer)[pixel_index + alpha_channel] = STBIR__ENCODE_LINEAR8(encode_buffer[pixel_index+alpha_channel]);
            }
            break;

        case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_LINEAR):
            for (x=0; x < num_pixels; ++x)
            {
                int pixel_index = x*channels;

                for (n = 0; n < channels; n++)
                {
                    int index = pixel_index + n;
                    ((unsigned short*)output_buffer)[index] = STBIR__ENCODE_LINEAR16(encode_buffer[index]);
                }
            }
            break;

        case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_SRGB):
            for (x=0; x < num_pixels; ++x)
            {
                int pixel_index = x*channels;

                for (n = 0; n < num_nonalpha; n++)
                {
                    int index = pixel_index + nonalpha[n];
                    ((unsigned short*)output_buffer)[index] = (unsigned short)STBIR__ROUND_INT(stbir__linear_to_srgb(stbir__saturate(encode_buffer[index])) * stbir__max_uint16_as_float);
                }

                if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
                    ((unsigned short*)output_buffer)[pixel_index + alpha_channel] = STBIR__ENCODE_LINEAR16(encode_buffer[pixel_index + alpha_channel]);
            }

            break;

        case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_LINEAR):
            for (x=0; x < num_pixels; ++x)
            {
                int pixel_index = x*channels;

                // Double intermediate: float cannot represent the full uint32 range.
                for (n = 0; n < channels; n++)
                {
                    int index = pixel_index + n;
                    ((unsigned int*)output_buffer)[index] = (unsigned int)STBIR__ROUND_UINT(((double)stbir__saturate(encode_buffer[index])) * stbir__max_uint32_as_float);
                }
            }
            break;

        case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_SRGB):
            for (x=0; x < num_pixels; ++x)
            {
                int pixel_index = x*channels;

                for (n = 0; n < num_nonalpha; n++)
                {
                    int index = pixel_index + nonalpha[n];
                    ((unsigned int*)output_buffer)[index] = (unsigned int)STBIR__ROUND_UINT(((double)stbir__linear_to_srgb(stbir__saturate(encode_buffer[index]))) * stbir__max_uint32_as_float);
                }

                if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
                    ((unsigned int*)output_buffer)[pixel_index + alpha_channel] = (unsigned int)STBIR__ROUND_INT(((double)stbir__saturate(encode_buffer[pixel_index + alpha_channel])) * stbir__max_uint32_as_float);
            }
            break;

        case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_LINEAR):
            for (x=0; x < num_pixels; ++x)
            {
                int pixel_index = x*channels;

                for (n = 0; n < channels; n++)
                {
                    int index = pixel_index + n;
                    ((float*)output_buffer)[index] = encode_buffer[index];
                }
            }
            break;

        case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_SRGB):
            for (x=0;
x < num_pixels; ++x)
            {
                int pixel_index = x*channels;

                for (n = 0; n < num_nonalpha; n++)
                {
                    int index = pixel_index + nonalpha[n];
                    ((float*)output_buffer)[index] = stbir__linear_to_srgb(encode_buffer[index]);
                }

                if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
                    ((float*)output_buffer)[pixel_index + alpha_channel] = encode_buffer[pixel_index + alpha_channel];
            }
            break;

        default:
            STBIR_ASSERT(!"Unknown type/colorspace/channels combination.");
            break;
    }
}

// Height-upsampling vertical pass: blend the contributing ring-buffer
// scanlines for output row n into encode_buffer, then encode that row
// directly into the output image.
static void stbir__resample_vertical_upsample(stbir__info* stbir_info, int n)
{
    int x, k;
    int output_w = stbir_info->output_w;
    stbir__contributors* vertical_contributors = stbir_info->vertical_contributors;
    float* vertical_coefficients = stbir_info->vertical_coefficients;
    int channels = stbir_info->channels;
    int alpha_channel = stbir_info->alpha_channel;
    int type = stbir_info->type;
    int colorspace = stbir_info->colorspace;
    int ring_buffer_entries = stbir_info->ring_buffer_num_entries;
    void* output_data = stbir_info->output_data;
    float* encode_buffer = stbir_info->encode_buffer;
    int decode = STBIR__DECODE(type, colorspace);
    int coefficient_width = stbir_info->vertical_coefficient_width;
    int coefficient_counter;
    int contributor = n;

    float* ring_buffer = stbir_info->ring_buffer;
    int ring_buffer_begin_index = stbir_info->ring_buffer_begin_index;
    int ring_buffer_first_scanline = stbir_info->ring_buffer_first_scanline;
    int ring_buffer_length = stbir_info->ring_buffer_length_bytes/sizeof(float);

    int n0,n1, output_row_start;
    int coefficient_group = coefficient_width * contributor;

    n0 = vertical_contributors[contributor].n0;
    n1 = vertical_contributors[contributor].n1;

    output_row_start = n * stbir_info->output_stride_bytes;

    STBIR_ASSERT(stbir__use_height_upsampling(stbir_info));

    // encode_buffer accumulates with +=, so it must start zeroed.
    memset(encode_buffer, 0, output_w * sizeof(float) * channels);

    // I tried reblocking this for better cache usage of encode_buffer
    // (using x_outer, k, x_inner), but it lost speed. -- stb

    coefficient_counter = 0;
    // Unrolled per-channel-count variants of the same accumulate loop.
    switch (channels) {
        case 1:
            for (k = n0; k <= n1; k++)
            {
                int coefficient_index = coefficient_counter++;
                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
                for (x = 0; x < output_w; ++x)
                {
                    int in_pixel_index = x * 1;
                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
                }
            }
            break;
        case 2:
            for (k = n0; k <= n1; k++)
            {
                int coefficient_index = coefficient_counter++;
                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
                for (x = 0; x < output_w; ++x)
                {
                    int in_pixel_index = x * 2;
                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
                    encode_buffer[in_pixel_index + 1] += ring_buffer_entry[in_pixel_index + 1] * coefficient;
                }
            }
            break;
        case 3:
            for (k = n0; k <= n1; k++)
            {
                int coefficient_index = coefficient_counter++;
                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
                for (x = 0; x < output_w; ++x)
                {
                    int in_pixel_index = x * 3;
                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
                    encode_buffer[in_pixel_index + 1] += ring_buffer_entry[in_pixel_index + 1] * coefficient;
                    encode_buffer[in_pixel_index + 2] += ring_buffer_entry[in_pixel_index + 2] * coefficient;
                }
            }
            break;
        case 4:
            for (k = n0; k <= n1; k++)
            {
                int coefficient_index = coefficient_counter++;
                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
                for (x = 0; x < output_w; ++x)
                {
                    int in_pixel_index = x * 4;
                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
                    encode_buffer[in_pixel_index + 1] += ring_buffer_entry[in_pixel_index + 1] * coefficient;
                    encode_buffer[in_pixel_index + 2] += ring_buffer_entry[in_pixel_index + 2] * coefficient;
                    encode_buffer[in_pixel_index + 3] += ring_buffer_entry[in_pixel_index + 3] * coefficient;
                }
            }
            break;
        default:
            for (k = n0; k <= n1; k++)
            {
                int coefficient_index = coefficient_counter++;
                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
                for (x = 0; x < output_w; ++x)
                {
                    int in_pixel_index = x * channels;
                    int c;
                    for (c = 0; c < channels; c++)
                        encode_buffer[in_pixel_index + c] += ring_buffer_entry[in_pixel_index + c] * coefficient;
                }
            }
            break;
    }
    stbir__encode_scanline(stbir_info, output_w, (char *) output_data + output_row_start, encode_buffer, channels, alpha_channel, decode);
}

// Height-downsampling vertical pass: scatter the horizontally-resampled input
// scanline n into every ring-buffer output row it contributes to.
static void stbir__resample_vertical_downsample(stbir__info* stbir_info, int n)
{
    int x, k;
    int output_w = stbir_info->output_w;
    stbir__contributors* vertical_contributors = stbir_info->vertical_contributors;
    float* vertical_coefficients = stbir_info->vertical_coefficients;
    int channels = stbir_info->channels;
    int ring_buffer_entries = stbir_info->ring_buffer_num_entries;
    float* horizontal_buffer = stbir_info->horizontal_buffer;
    int coefficient_width = stbir_info->vertical_coefficient_width;
    int
contributor = n + stbir_info->vertical_filter_pixel_margin;

    float* ring_buffer = stbir_info->ring_buffer;
    int ring_buffer_begin_index = stbir_info->ring_buffer_begin_index;
    int ring_buffer_first_scanline = stbir_info->ring_buffer_first_scanline;
    int ring_buffer_length = stbir_info->ring_buffer_length_bytes/sizeof(float);
    int n0,n1;

    // [n0, n1] is the inclusive range of output rows this input scanline feeds.
    n0 = vertical_contributors[contributor].n0;
    n1 = vertical_contributors[contributor].n1;

    STBIR_ASSERT(!stbir__use_height_upsampling(stbir_info));

    for (k = n0; k <= n1; k++)
    {
        int coefficient_index = k - n0;
        int coefficient_group = coefficient_width * contributor;
        float coefficient = vertical_coefficients[coefficient_group + coefficient_index];

        float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);

        // Unrolled per-channel-count variants of the same accumulate loop.
        switch (channels) {
            case 1:
                for (x = 0; x < output_w; x++)
                {
                    int in_pixel_index = x * 1;
                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
                }
                break;
            case 2:
                for (x = 0; x < output_w; x++)
                {
                    int in_pixel_index = x * 2;
                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
                    ring_buffer_entry[in_pixel_index + 1] += horizontal_buffer[in_pixel_index + 1] * coefficient;
                }
                break;
            case 3:
                for (x = 0; x < output_w; x++)
                {
                    int in_pixel_index = x * 3;
                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
                    ring_buffer_entry[in_pixel_index + 1] += horizontal_buffer[in_pixel_index + 1] * coefficient;
                    ring_buffer_entry[in_pixel_index + 2] += horizontal_buffer[in_pixel_index + 2] * coefficient;
                }
                break;
            case 4:
                for (x = 0; x < output_w; x++)
                {
                    int in_pixel_index = x * 4;
                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
                    ring_buffer_entry[in_pixel_index + 1] += horizontal_buffer[in_pixel_index + 1] * coefficient;
                    ring_buffer_entry[in_pixel_index + 2] += horizontal_buffer[in_pixel_index + 2] * coefficient;
                    ring_buffer_entry[in_pixel_index + 3] += horizontal_buffer[in_pixel_index + 3] * coefficient;
                }
                break;
            default:
                for (x = 0; x < output_w; x++)
                {
                    int in_pixel_index = x * channels;

                    int c;
                    for (c = 0; c < channels; c++)
                        ring_buffer_entry[in_pixel_index + c] += horizontal_buffer[in_pixel_index + c] * coefficient;
                }
                break;
        }
    }
}

// Main scanline loop for height upsampling: for each output row, keep only
// the input scanlines it needs resident in the ring buffer, then run the
// vertical resample for that row.
static void stbir__buffer_loop_upsample(stbir__info* stbir_info)
{
    int y;
    float scale_ratio = stbir_info->vertical_scale;
    float out_scanlines_radius = stbir__filter_info_table[stbir_info->vertical_filter].support(1/scale_ratio) * scale_ratio;

    STBIR_ASSERT(stbir__use_height_upsampling(stbir_info));

    for (y = 0; y < stbir_info->output_h; y++)
    {
        float in_center_of_out = 0; // Center of the current out scanline in the in scanline space
        int in_first_scanline = 0, in_last_scanline = 0;

        stbir__calculate_sample_range_upsample(y, out_scanlines_radius, scale_ratio, stbir_info->vertical_shift, &in_first_scanline, &in_last_scanline, &in_center_of_out);

        STBIR_ASSERT(in_last_scanline - in_first_scanline + 1 <= stbir_info->ring_buffer_num_entries);

        if (stbir_info->ring_buffer_begin_index >= 0)
        {
            // Get rid of whatever we don't need anymore.
            while (in_first_scanline > stbir_info->ring_buffer_first_scanline)
            {
                if (stbir_info->ring_buffer_first_scanline == stbir_info->ring_buffer_last_scanline)
                {
                    // We just popped the last scanline off the ring buffer.
                    // Reset it to the empty state.
                    stbir_info->ring_buffer_begin_index = -1;
                    stbir_info->ring_buffer_first_scanline = 0;
                    stbir_info->ring_buffer_last_scanline = 0;
                    break;
                }
                else
                {
                    // Advance the ring head past the no-longer-needed scanline.
                    stbir_info->ring_buffer_first_scanline++;
                    stbir_info->ring_buffer_begin_index = (stbir_info->ring_buffer_begin_index + 1) % stbir_info->ring_buffer_num_entries;
                }
            }
        }

        // Load in new ones.
        if (stbir_info->ring_buffer_begin_index < 0)
            stbir__decode_and_resample_upsample(stbir_info, in_first_scanline);

        while (in_last_scanline > stbir_info->ring_buffer_last_scanline)
            stbir__decode_and_resample_upsample(stbir_info, stbir_info->ring_buffer_last_scanline + 1);

        // Now all buffers should be ready to write a row of vertical sampling.
        stbir__resample_vertical_upsample(stbir_info, y);

        STBIR_PROGRESS_REPORT((float)y / stbir_info->output_h);
    }
}

// Flushes finished ring-buffer rows (those before first_necessary_scanline)
// to the output image, encoding each one, and advances/empties the ring.
static void stbir__empty_ring_buffer(stbir__info* stbir_info, int first_necessary_scanline)
{
    int output_stride_bytes = stbir_info->output_stride_bytes;
    int channels = stbir_info->channels;
    int alpha_channel = stbir_info->alpha_channel;
    int type = stbir_info->type;
    int colorspace = stbir_info->colorspace;
    int output_w = stbir_info->output_w;
    void* output_data = stbir_info->output_data;
    int decode = STBIR__DECODE(type, colorspace);

    float* ring_buffer = stbir_info->ring_buffer;
    int ring_buffer_length = stbir_info->ring_buffer_length_bytes/sizeof(float);

    if (stbir_info->ring_buffer_begin_index >= 0)
    {
        // Get rid of whatever we don't need anymore.
        while (first_necessary_scanline > stbir_info->ring_buffer_first_scanline)
        {
            // Rows outside [0, output_h) come from the filter margin and are not written.
            if (stbir_info->ring_buffer_first_scanline >= 0 && stbir_info->ring_buffer_first_scanline < stbir_info->output_h)
            {
                int output_row_start = stbir_info->ring_buffer_first_scanline * output_stride_bytes;
                float* ring_buffer_entry = stbir__get_ring_buffer_entry(ring_buffer, stbir_info->ring_buffer_begin_index, ring_buffer_length);
                stbir__encode_scanline(stbir_info, output_w, (char *) output_data + output_row_start, ring_buffer_entry, channels, alpha_channel, decode);
                STBIR_PROGRESS_REPORT((float)stbir_info->ring_buffer_first_scanline / stbir_info->output_h);
            }

            if (stbir_info->ring_buffer_first_scanline == stbir_info->ring_buffer_last_scanline)
            {
                // We just popped the last scanline off the ring buffer.
                // Reset it to the empty state.
                stbir_info->ring_buffer_begin_index = -1;
                stbir_info->ring_buffer_first_scanline = 0;
                stbir_info->ring_buffer_last_scanline = 0;
                break;
            }
            else
            {
                stbir_info->ring_buffer_first_scanline++;
                stbir_info->ring_buffer_begin_index = (stbir_info->ring_buffer_begin_index + 1) % stbir_info->ring_buffer_num_entries;
            }
        }
    }
}

// Main scanline loop for height downsampling: walk input scanlines (with the
// vertical filter margin), scattering each into the ring-buffer output rows,
// flushing rows that can no longer receive contributions.
static void stbir__buffer_loop_downsample(stbir__info* stbir_info)
{
    int y;
    float scale_ratio = stbir_info->vertical_scale;
    int output_h = stbir_info->output_h;
    float in_pixels_radius = stbir__filter_info_table[stbir_info->vertical_filter].support(scale_ratio) / scale_ratio;
    int pixel_margin = stbir_info->vertical_filter_pixel_margin;
    int max_y = stbir_info->input_h + pixel_margin;

    STBIR_ASSERT(!stbir__use_height_upsampling(stbir_info));

    for (y = -pixel_margin; y < max_y; y++)
    {
        float out_center_of_in; // Center of the current out scanline in the in scanline space
        int out_first_scanline, out_last_scanline;

        stbir__calculate_sample_range_downsample(y, in_pixels_radius, scale_ratio, stbir_info->vertical_shift, &out_first_scanline, &out_last_scanline, &out_center_of_in);

        STBIR_ASSERT(out_last_scanline - out_first_scanline + 1 <= stbir_info->ring_buffer_num_entries);

        // Margin scanlines may map entirely outside the output; skip them.
        if (out_last_scanline < 0 || out_first_scanline >= output_h)
            continue;

        stbir__empty_ring_buffer(stbir_info, out_first_scanline);

        stbir__decode_and_resample_downsample(stbir_info, y);

        // Load in new ones.
        if (stbir_info->ring_buffer_begin_index < 0)
            stbir__add_empty_ring_buffer_entry(stbir_info, out_first_scanline);

        while (out_last_scanline > stbir_info->ring_buffer_last_scanline)
            stbir__add_empty_ring_buffer_entry(stbir_info, stbir_info->ring_buffer_last_scanline + 1);

        // Now the horizontal buffer is ready to write to all ring buffer rows.
        stbir__resample_vertical_downsample(stbir_info, y);
    }

    // Flush every remaining buffered output row.
    stbir__empty_ring_buffer(stbir_info, stbir_info->output_h);
}

// Records the input/output dimensions and channel count on the info struct.
static void stbir__setup(stbir__info *info, int input_w, int input_h, int output_w, int output_h, int channels)
{
    info->input_w = input_w;
    info->input_h = input_h;
    info->output_w = output_w;
    info->output_h = output_h;
    info->channels = channels;
}

// Derives the scale/shift of the output relative to the input from either an
// explicit transform (scale_x, scale_y, shift_x, shift_y) or the source
// sub-rectangle (s0,t0)-(s1,t1).
static void stbir__calculate_transform(stbir__info *info, float s0, float t0, float s1, float t1, float *transform)
{
    info->s0 = s0;
    info->t0 = t0;
    info->s1 = s1;
    info->t1 = t1;

    if (transform)
    {
        info->horizontal_scale = transform[0];
        info->vertical_scale   = transform[1];
        info->horizontal_shift = transform[2];
        info->vertical_shift   = transform[3];
    }
    else
    {
        info->horizontal_scale = ((float)info->output_w / info->input_w) / (s1 - s0);
        info->vertical_scale = ((float)info->output_h / info->input_h) / (t1 - t0);

        info->horizontal_shift = s0 * info->output_w / (s1 - s0);
        info->vertical_shift = t0 * info->output_h / (t1 - t0);
    }
}

// Fills in the filters, substituting the per-direction default (filter 0)
// based on whether that axis upsamples or downsamples.
static void stbir__choose_filter(stbir__info *info, stbir_filter h_filter, stbir_filter v_filter)
{
    if (h_filter == 0)
        h_filter = stbir__use_upsampling(info->horizontal_scale) ? STBIR_DEFAULT_FILTER_UPSAMPLE : STBIR_DEFAULT_FILTER_DOWNSAMPLE;
    if (v_filter == 0)
        v_filter = stbir__use_upsampling(info->vertical_scale) ? STBIR_DEFAULT_FILTER_UPSAMPLE : STBIR_DEFAULT_FILTER_DOWNSAMPLE;
    info->horizontal_filter = h_filter;
    info->vertical_filter = v_filter;
}

// Computes the total scratch-memory requirement and caches the individual
// buffer sizes/counts on the info struct.
static stbir_uint32 stbir__calculate_memory(stbir__info *info)
{
    int pixel_margin = stbir__get_filter_pixel_margin(info->horizontal_filter, info->horizontal_scale);
    int filter_height = stbir__get_filter_pixel_width(info->vertical_filter, info->vertical_scale);

    info->horizontal_num_contributors = stbir__get_contributors(info->horizontal_scale, info->horizontal_filter, info->input_w, info->output_w);
    info->vertical_num_contributors   = stbir__get_contributors(info->vertical_scale  , info->vertical_filter  , info->input_h, info->output_h);

    // One extra entry because floating point precision problems sometimes cause an extra to be necessary.
    info->ring_buffer_num_entries = filter_height + 1;

    info->horizontal_contributors_size = info->horizontal_num_contributors * sizeof(stbir__contributors);
    info->horizontal_coefficients_size = stbir__get_total_horizontal_coefficients(info) * sizeof(float);
    info->vertical_contributors_size = info->vertical_num_contributors * sizeof(stbir__contributors);
    info->vertical_coefficients_size = stbir__get_total_vertical_coefficients(info) * sizeof(float);
    info->decode_buffer_size = (info->input_w + pixel_margin * 2) * info->channels * sizeof(float);
    info->horizontal_buffer_size = info->output_w * info->channels * sizeof(float);
    info->ring_buffer_size = info->output_w * info->channels * info->ring_buffer_num_entries * sizeof(float);
    info->encode_buffer_size = info->output_w * info->channels * sizeof(float);

    STBIR_ASSERT(info->horizontal_filter != 0);
    STBIR_ASSERT(info->horizontal_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table)); // this now happens too late
    STBIR_ASSERT(info->vertical_filter != 0);

STBIR_ASSERT(info->vertical_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table)); // this now happens too late + + if (stbir__use_height_upsampling(info)) + // The horizontal buffer is for when we're downsampling the height and we + // can't output the result of sampling the decode buffer directly into the + // ring buffers. + info->horizontal_buffer_size = 0; + else + // The encode buffer is to retain precision in the height upsampling method + // and isn't used when height downsampling. + info->encode_buffer_size = 0; + + return info->horizontal_contributors_size + info->horizontal_coefficients_size + + info->vertical_contributors_size + info->vertical_coefficients_size + + info->decode_buffer_size + info->horizontal_buffer_size + + info->ring_buffer_size + info->encode_buffer_size; +} + +static int stbir__resize_allocated(stbir__info *info, + const void* input_data, int input_stride_in_bytes, + void* output_data, int output_stride_in_bytes, + int alpha_channel, stbir_uint32 flags, stbir_datatype type, + stbir_edge edge_horizontal, stbir_edge edge_vertical, stbir_colorspace colorspace, + void* tempmem, size_t tempmem_size_in_bytes) +{ + size_t memory_required = stbir__calculate_memory(info); + + int width_stride_input = input_stride_in_bytes ? input_stride_in_bytes : info->channels * info->input_w * stbir__type_size[type]; + int width_stride_output = output_stride_in_bytes ? 
output_stride_in_bytes : info->channels * info->output_w * stbir__type_size[type]; + +#ifdef STBIR_DEBUG_OVERWRITE_TEST +#define OVERWRITE_ARRAY_SIZE 8 + unsigned char overwrite_output_before_pre[OVERWRITE_ARRAY_SIZE]; + unsigned char overwrite_tempmem_before_pre[OVERWRITE_ARRAY_SIZE]; + unsigned char overwrite_output_after_pre[OVERWRITE_ARRAY_SIZE]; + unsigned char overwrite_tempmem_after_pre[OVERWRITE_ARRAY_SIZE]; + + size_t begin_forbidden = width_stride_output * (info->output_h - 1) + info->output_w * info->channels * stbir__type_size[type]; + memcpy(overwrite_output_before_pre, &((unsigned char*)output_data)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE); + memcpy(overwrite_output_after_pre, &((unsigned char*)output_data)[begin_forbidden], OVERWRITE_ARRAY_SIZE); + memcpy(overwrite_tempmem_before_pre, &((unsigned char*)tempmem)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE); + memcpy(overwrite_tempmem_after_pre, &((unsigned char*)tempmem)[tempmem_size_in_bytes], OVERWRITE_ARRAY_SIZE); +#endif + + STBIR_ASSERT(info->channels >= 0); + STBIR_ASSERT(info->channels <= STBIR_MAX_CHANNELS); + + if (info->channels < 0 || info->channels > STBIR_MAX_CHANNELS) + return 0; + + STBIR_ASSERT(info->horizontal_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table)); + STBIR_ASSERT(info->vertical_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table)); + + if (info->horizontal_filter >= STBIR__ARRAY_SIZE(stbir__filter_info_table)) + return 0; + if (info->vertical_filter >= STBIR__ARRAY_SIZE(stbir__filter_info_table)) + return 0; + + if (alpha_channel < 0) + flags |= STBIR_FLAG_ALPHA_USES_COLORSPACE | STBIR_FLAG_ALPHA_PREMULTIPLIED; + + if (!(flags&STBIR_FLAG_ALPHA_USES_COLORSPACE) || !(flags&STBIR_FLAG_ALPHA_PREMULTIPLIED)) { + STBIR_ASSERT(alpha_channel >= 0 && alpha_channel < info->channels); + } + + if (alpha_channel >= info->channels) + return 0; + + STBIR_ASSERT(tempmem); + + if (!tempmem) + return 0; + + STBIR_ASSERT(tempmem_size_in_bytes >= memory_required); + + if 
(tempmem_size_in_bytes < memory_required) + return 0; + + memset(tempmem, 0, tempmem_size_in_bytes); + + info->input_data = input_data; + info->input_stride_bytes = width_stride_input; + + info->output_data = output_data; + info->output_stride_bytes = width_stride_output; + + info->alpha_channel = alpha_channel; + info->flags = flags; + info->type = type; + info->edge_horizontal = edge_horizontal; + info->edge_vertical = edge_vertical; + info->colorspace = colorspace; + + info->horizontal_coefficient_width = stbir__get_coefficient_width (info->horizontal_filter, info->horizontal_scale); + info->vertical_coefficient_width = stbir__get_coefficient_width (info->vertical_filter , info->vertical_scale ); + info->horizontal_filter_pixel_width = stbir__get_filter_pixel_width (info->horizontal_filter, info->horizontal_scale); + info->vertical_filter_pixel_width = stbir__get_filter_pixel_width (info->vertical_filter , info->vertical_scale ); + info->horizontal_filter_pixel_margin = stbir__get_filter_pixel_margin(info->horizontal_filter, info->horizontal_scale); + info->vertical_filter_pixel_margin = stbir__get_filter_pixel_margin(info->vertical_filter , info->vertical_scale ); + + info->ring_buffer_length_bytes = info->output_w * info->channels * sizeof(float); + info->decode_buffer_pixels = info->input_w + info->horizontal_filter_pixel_margin * 2; + +#define STBIR__NEXT_MEMPTR(current, newtype) (newtype*)(((unsigned char*)current) + current##_size) + + info->horizontal_contributors = (stbir__contributors *) tempmem; + info->horizontal_coefficients = STBIR__NEXT_MEMPTR(info->horizontal_contributors, float); + info->vertical_contributors = STBIR__NEXT_MEMPTR(info->horizontal_coefficients, stbir__contributors); + info->vertical_coefficients = STBIR__NEXT_MEMPTR(info->vertical_contributors, float); + info->decode_buffer = STBIR__NEXT_MEMPTR(info->vertical_coefficients, float); + + if (stbir__use_height_upsampling(info)) + { + info->horizontal_buffer = NULL; + info->ring_buffer 
= STBIR__NEXT_MEMPTR(info->decode_buffer, float); + info->encode_buffer = STBIR__NEXT_MEMPTR(info->ring_buffer, float); + + STBIR_ASSERT((size_t)STBIR__NEXT_MEMPTR(info->encode_buffer, unsigned char) == (size_t)tempmem + tempmem_size_in_bytes); + } + else + { + info->horizontal_buffer = STBIR__NEXT_MEMPTR(info->decode_buffer, float); + info->ring_buffer = STBIR__NEXT_MEMPTR(info->horizontal_buffer, float); + info->encode_buffer = NULL; + + STBIR_ASSERT((size_t)STBIR__NEXT_MEMPTR(info->ring_buffer, unsigned char) == (size_t)tempmem + tempmem_size_in_bytes); + } + +#undef STBIR__NEXT_MEMPTR + + // This signals that the ring buffer is empty + info->ring_buffer_begin_index = -1; + + stbir__calculate_filters(info->horizontal_contributors, info->horizontal_coefficients, info->horizontal_filter, info->horizontal_scale, info->horizontal_shift, info->input_w, info->output_w); + stbir__calculate_filters(info->vertical_contributors, info->vertical_coefficients, info->vertical_filter, info->vertical_scale, info->vertical_shift, info->input_h, info->output_h); + + STBIR_PROGRESS_REPORT(0); + + if (stbir__use_height_upsampling(info)) + stbir__buffer_loop_upsample(info); + else + stbir__buffer_loop_downsample(info); + + STBIR_PROGRESS_REPORT(1); + +#ifdef STBIR_DEBUG_OVERWRITE_TEST + STBIR_ASSERT(memcmp(overwrite_output_before_pre, &((unsigned char*)output_data)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE) == 0); + STBIR_ASSERT(memcmp(overwrite_output_after_pre, &((unsigned char*)output_data)[begin_forbidden], OVERWRITE_ARRAY_SIZE) == 0); + STBIR_ASSERT(memcmp(overwrite_tempmem_before_pre, &((unsigned char*)tempmem)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE) == 0); + STBIR_ASSERT(memcmp(overwrite_tempmem_after_pre, &((unsigned char*)tempmem)[tempmem_size_in_bytes], OVERWRITE_ARRAY_SIZE) == 0); +#endif + + return 1; +} + + +static int stbir__resize_arbitrary( + void *alloc_context, + const void* input_data, int input_w, int input_h, int input_stride_in_bytes, + void* 
output_data, int output_w, int output_h, int output_stride_in_bytes, + float s0, float t0, float s1, float t1, float *transform, + int channels, int alpha_channel, stbir_uint32 flags, stbir_datatype type, + stbir_filter h_filter, stbir_filter v_filter, + stbir_edge edge_horizontal, stbir_edge edge_vertical, stbir_colorspace colorspace) +{ + stbir__info info; + int result; + size_t memory_required; + void* extra_memory; + + stbir__setup(&info, input_w, input_h, output_w, output_h, channels); + stbir__calculate_transform(&info, s0,t0,s1,t1,transform); + stbir__choose_filter(&info, h_filter, v_filter); + memory_required = stbir__calculate_memory(&info); + extra_memory = STBIR_MALLOC(memory_required, alloc_context); + + if (!extra_memory) + return 0; + + result = stbir__resize_allocated(&info, input_data, input_stride_in_bytes, + output_data, output_stride_in_bytes, + alpha_channel, flags, type, + edge_horizontal, edge_vertical, + colorspace, extra_memory, memory_required); + + STBIR_FREE(extra_memory, alloc_context); + + return result; +} + +STBIRDEF int stbir_resize_uint8( const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + int num_channels) +{ + return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + 0,0,1,1,NULL,num_channels,-1,0, STBIR_TYPE_UINT8, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT, + STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, STBIR_COLORSPACE_LINEAR); +} + +STBIRDEF int stbir_resize_float( const float *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + float *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + int num_channels) +{ + return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + 
0,0,1,1,NULL,num_channels,-1,0, STBIR_TYPE_FLOAT, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT, + STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, STBIR_COLORSPACE_LINEAR); +} + +STBIRDEF int stbir_resize_uint8_srgb(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + int num_channels, int alpha_channel, int flags) +{ + return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + 0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT8, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT, + STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, STBIR_COLORSPACE_SRGB); +} + +STBIRDEF int stbir_resize_uint8_srgb_edgemode(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + int num_channels, int alpha_channel, int flags, + stbir_edge edge_wrap_mode) +{ + return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + 0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT8, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT, + edge_wrap_mode, edge_wrap_mode, STBIR_COLORSPACE_SRGB); +} + +STBIRDEF int stbir_resize_uint8_generic( const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + int num_channels, int alpha_channel, int flags, + stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, + void *alloc_context) +{ + return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + 0,0,1,1,NULL,num_channels,alpha_channel,flags, 
STBIR_TYPE_UINT8, filter, filter, + edge_wrap_mode, edge_wrap_mode, space); +} + +STBIRDEF int stbir_resize_uint16_generic(const stbir_uint16 *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + stbir_uint16 *output_pixels , int output_w, int output_h, int output_stride_in_bytes, + int num_channels, int alpha_channel, int flags, + stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, + void *alloc_context) +{ + return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + 0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT16, filter, filter, + edge_wrap_mode, edge_wrap_mode, space); +} + + +STBIRDEF int stbir_resize_float_generic( const float *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + float *output_pixels , int output_w, int output_h, int output_stride_in_bytes, + int num_channels, int alpha_channel, int flags, + stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, + void *alloc_context) +{ + return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + 0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_FLOAT, filter, filter, + edge_wrap_mode, edge_wrap_mode, space); +} + + +STBIRDEF int stbir_resize( const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_datatype datatype, + int num_channels, int alpha_channel, int flags, + stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, + stbir_filter filter_horizontal, stbir_filter filter_vertical, + stbir_colorspace space, void *alloc_context) +{ + return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, 
output_stride_in_bytes, + 0,0,1,1,NULL,num_channels,alpha_channel,flags, datatype, filter_horizontal, filter_vertical, + edge_mode_horizontal, edge_mode_vertical, space); +} + + +STBIRDEF int stbir_resize_subpixel(const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_datatype datatype, + int num_channels, int alpha_channel, int flags, + stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, + stbir_filter filter_horizontal, stbir_filter filter_vertical, + stbir_colorspace space, void *alloc_context, + float x_scale, float y_scale, + float x_offset, float y_offset) +{ + float transform[4]; + transform[0] = x_scale; + transform[1] = y_scale; + transform[2] = x_offset; + transform[3] = y_offset; + return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + 0,0,1,1,transform,num_channels,alpha_channel,flags, datatype, filter_horizontal, filter_vertical, + edge_mode_horizontal, edge_mode_vertical, space); +} + +STBIRDEF int stbir_resize_region( const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes, + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_datatype datatype, + int num_channels, int alpha_channel, int flags, + stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, + stbir_filter filter_horizontal, stbir_filter filter_vertical, + stbir_colorspace space, void *alloc_context, + float s0, float t0, float s1, float t1) +{ + return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + s0,t0,s1,t1,NULL,num_channels,alpha_channel,flags, datatype, filter_horizontal, filter_vertical, + edge_mode_horizontal, edge_mode_vertical, space); +} + +#endif // 
STB_IMAGE_RESIZE_IMPLEMENTATION + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. +------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. 
We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +------------------------------------------------------------------------------ +*/ diff --git a/Source/vk2dfd.cpp b/Source/vk2dfd.cpp new file mode 100644 index 0000000..46c6532 --- /dev/null +++ b/Source/vk2dfd.cpp @@ -0,0 +1,33 @@ +/* -*- tab-width: 4; -*- */ +/* vi: set sw=2 ts=4 expandtab: */ + +/* Copyright 2019-2020 Mark Callow + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * @file + * @~English + * @brief Create a DFD for a VkFormat. + */ + +#include "dfd.h" + +/** + * @~English + * @brief Create a DFD matching a VkFormat. + * + * @param[in] format VkFormat for which to create a DFD. + * + * @return pointer to the created DFD or 0 if format not supported or + * unrecognized. Caller is responsible for freeing the created + * DFD. + */ +uint32_t * vk2dfd(enum VkFormat format) + { + switch (format) { +#include "vk2dfd.inl" + default: return 0; + } + } + diff --git a/Source/vk2dfd.inl b/Source/vk2dfd.inl new file mode 100644 index 0000000..7ceb2fd --- /dev/null +++ b/Source/vk2dfd.inl @@ -0,0 +1,294 @@ +/* Copyright 2019-2020 The Khronos Group Inc. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/***************************** Do not edit. ***************************** + Automatically generated by makevk2dfd.pl. 
+ *************************************************************************/ + +/* Vulkan combined depth & stencil formats are not included here + * because they do not exist outside a Vulkan device. + */ +case VK_FORMAT_R4G4_UNORM_PACK8: { + int channels[] = {1,0}; int bits[] = {4,4}; + return createDFDPacked(0, 2, bits, channels, s_UNORM); +} +case VK_FORMAT_R4G4B4A4_UNORM_PACK16: { + int channels[] = {3,2,1,0}; int bits[] = {4,4,4,4}; + return createDFDPacked(0, 4, bits, channels, s_UNORM); +} +case VK_FORMAT_B4G4R4A4_UNORM_PACK16: { + int channels[] = {3,0,1,2}; int bits[] = {4,4,4,4}; + return createDFDPacked(0, 4, bits, channels, s_UNORM); +} +case VK_FORMAT_R5G6B5_UNORM_PACK16: { + int channels[] = {2,1,0}; int bits[] = {5,6,5}; + return createDFDPacked(0, 3, bits, channels, s_UNORM); +} +case VK_FORMAT_B5G6R5_UNORM_PACK16: { + int channels[] = {0,1,2}; int bits[] = {5,6,5}; + return createDFDPacked(0, 3, bits, channels, s_UNORM); +} +case VK_FORMAT_R5G5B5A1_UNORM_PACK16: { + int channels[] = {3,2,1,0}; int bits[] = {1,5,5,5}; + return createDFDPacked(0, 4, bits, channels, s_UNORM); +} +case VK_FORMAT_B5G5R5A1_UNORM_PACK16: { + int channels[] = {3,0,1,2}; int bits[] = {1,5,5,5}; + return createDFDPacked(0, 4, bits, channels, s_UNORM); +} +case VK_FORMAT_A1R5G5B5_UNORM_PACK16: { + int channels[] = {2,1,0,3}; int bits[] = {5,5,5,1}; + return createDFDPacked(0, 4, bits, channels, s_UNORM); +} +case VK_FORMAT_R8_UNORM: return createDFDUnpacked(0, 1, 1, 0, s_UNORM); +case VK_FORMAT_R8_SNORM: return createDFDUnpacked(0, 1, 1, 0, s_SNORM); +case VK_FORMAT_R8_USCALED: return createDFDUnpacked(0, 1, 1, 0, s_USCALED); +case VK_FORMAT_R8_SSCALED: return createDFDUnpacked(0, 1, 1, 0, s_SSCALED); +case VK_FORMAT_R8_UINT: return createDFDUnpacked(0, 1, 1, 0, s_UINT); +case VK_FORMAT_R8_SINT: return createDFDUnpacked(0, 1, 1, 0, s_SINT); +case VK_FORMAT_R8_SRGB: return createDFDUnpacked(0, 1, 1, 0, s_SRGB); +case VK_FORMAT_R8G8_UNORM: return createDFDUnpacked(0, 2, 1, 0, 
s_UNORM); +case VK_FORMAT_R8G8_SNORM: return createDFDUnpacked(0, 2, 1, 0, s_SNORM); +case VK_FORMAT_R8G8_USCALED: return createDFDUnpacked(0, 2, 1, 0, s_USCALED); +case VK_FORMAT_R8G8_SSCALED: return createDFDUnpacked(0, 2, 1, 0, s_SSCALED); +case VK_FORMAT_R8G8_UINT: return createDFDUnpacked(0, 2, 1, 0, s_UINT); +case VK_FORMAT_R8G8_SINT: return createDFDUnpacked(0, 2, 1, 0, s_SINT); +case VK_FORMAT_R8G8_SRGB: return createDFDUnpacked(0, 2, 1, 0, s_SRGB); +case VK_FORMAT_R8G8B8_UNORM: return createDFDUnpacked(0, 3, 1, 0, s_UNORM); +case VK_FORMAT_R8G8B8_SNORM: return createDFDUnpacked(0, 3, 1, 0, s_SNORM); +case VK_FORMAT_R8G8B8_USCALED: return createDFDUnpacked(0, 3, 1, 0, s_USCALED); +case VK_FORMAT_R8G8B8_SSCALED: return createDFDUnpacked(0, 3, 1, 0, s_SSCALED); +case VK_FORMAT_R8G8B8_UINT: return createDFDUnpacked(0, 3, 1, 0, s_UINT); +case VK_FORMAT_R8G8B8_SINT: return createDFDUnpacked(0, 3, 1, 0, s_SINT); +case VK_FORMAT_R8G8B8_SRGB: return createDFDUnpacked(0, 3, 1, 0, s_SRGB); +case VK_FORMAT_B8G8R8_UNORM: return createDFDUnpacked(0, 3, 1, 1, s_UNORM); +case VK_FORMAT_B8G8R8_SNORM: return createDFDUnpacked(0, 3, 1, 1, s_SNORM); +case VK_FORMAT_B8G8R8_USCALED: return createDFDUnpacked(0, 3, 1, 1, s_USCALED); +case VK_FORMAT_B8G8R8_SSCALED: return createDFDUnpacked(0, 3, 1, 1, s_SSCALED); +case VK_FORMAT_B8G8R8_UINT: return createDFDUnpacked(0, 3, 1, 1, s_UINT); +case VK_FORMAT_B8G8R8_SINT: return createDFDUnpacked(0, 3, 1, 1, s_SINT); +case VK_FORMAT_B8G8R8_SRGB: return createDFDUnpacked(0, 3, 1, 1, s_SRGB); +case VK_FORMAT_R8G8B8A8_UNORM: return createDFDUnpacked(0, 4, 1, 0, s_UNORM); +case VK_FORMAT_R8G8B8A8_SNORM: return createDFDUnpacked(0, 4, 1, 0, s_SNORM); +case VK_FORMAT_R8G8B8A8_USCALED: return createDFDUnpacked(0, 4, 1, 0, s_USCALED); +case VK_FORMAT_R8G8B8A8_SSCALED: return createDFDUnpacked(0, 4, 1, 0, s_SSCALED); +case VK_FORMAT_R8G8B8A8_UINT: return createDFDUnpacked(0, 4, 1, 0, s_UINT); +case VK_FORMAT_R8G8B8A8_SINT: return 
createDFDUnpacked(0, 4, 1, 0, s_SINT); +case VK_FORMAT_R8G8B8A8_SRGB: return createDFDUnpacked(0, 4, 1, 0, s_SRGB); +case VK_FORMAT_B8G8R8A8_UNORM: return createDFDUnpacked(0, 4, 1, 1, s_UNORM); +case VK_FORMAT_B8G8R8A8_SNORM: return createDFDUnpacked(0, 4, 1, 1, s_SNORM); +case VK_FORMAT_B8G8R8A8_USCALED: return createDFDUnpacked(0, 4, 1, 1, s_USCALED); +case VK_FORMAT_B8G8R8A8_SSCALED: return createDFDUnpacked(0, 4, 1, 1, s_SSCALED); +case VK_FORMAT_B8G8R8A8_UINT: return createDFDUnpacked(0, 4, 1, 1, s_UINT); +case VK_FORMAT_B8G8R8A8_SINT: return createDFDUnpacked(0, 4, 1, 1, s_SINT); +case VK_FORMAT_B8G8R8A8_SRGB: return createDFDUnpacked(0, 4, 1, 1, s_SRGB); +case VK_FORMAT_A8B8G8R8_UNORM_PACK32: { + int channels[] = {0,1,2,3}; int bits[] = {8,8,8,8}; + return createDFDPacked(0, 4, bits, channels, s_UNORM); +} +case VK_FORMAT_A8B8G8R8_SNORM_PACK32: { + int channels[] = {0,1,2,3}; int bits[] = {8,8,8,8}; + return createDFDPacked(0, 4, bits, channels, s_SNORM); +} +case VK_FORMAT_A8B8G8R8_USCALED_PACK32: { + int channels[] = {0,1,2,3}; int bits[] = {8,8,8,8}; + return createDFDPacked(0, 4, bits, channels, s_USCALED); +} +case VK_FORMAT_A8B8G8R8_SSCALED_PACK32: { + int channels[] = {0,1,2,3}; int bits[] = {8,8,8,8}; + return createDFDPacked(0, 4, bits, channels, s_SSCALED); +} +case VK_FORMAT_A8B8G8R8_UINT_PACK32: { + int channels[] = {0,1,2,3}; int bits[] = {8,8,8,8}; + return createDFDPacked(0, 4, bits, channels, s_UINT); +} +case VK_FORMAT_A8B8G8R8_SINT_PACK32: { + int channels[] = {0,1,2,3}; int bits[] = {8,8,8,8}; + return createDFDPacked(0, 4, bits, channels, s_SINT); +} +case VK_FORMAT_A8B8G8R8_SRGB_PACK32: { + int channels[] = {0,1,2,3}; int bits[] = {8,8,8,8}; + return createDFDPacked(0, 4, bits, channels, s_SRGB); +} +case VK_FORMAT_A2R10G10B10_UNORM_PACK32: { + int channels[] = {2,1,0,3}; int bits[] = {10,10,10,2}; + return createDFDPacked(0, 4, bits, channels, s_UNORM); +} +case VK_FORMAT_A2R10G10B10_SNORM_PACK32: { + int channels[] = {2,1,0,3}; int 
bits[] = {10,10,10,2}; + return createDFDPacked(0, 4, bits, channels, s_SNORM); +} +case VK_FORMAT_A2R10G10B10_USCALED_PACK32: { + int channels[] = {2,1,0,3}; int bits[] = {10,10,10,2}; + return createDFDPacked(0, 4, bits, channels, s_USCALED); +} +case VK_FORMAT_A2R10G10B10_SSCALED_PACK32: { + int channels[] = {2,1,0,3}; int bits[] = {10,10,10,2}; + return createDFDPacked(0, 4, bits, channels, s_SSCALED); +} +case VK_FORMAT_A2R10G10B10_UINT_PACK32: { + int channels[] = {2,1,0,3}; int bits[] = {10,10,10,2}; + return createDFDPacked(0, 4, bits, channels, s_UINT); +} +case VK_FORMAT_A2R10G10B10_SINT_PACK32: { + int channels[] = {2,1,0,3}; int bits[] = {10,10,10,2}; + return createDFDPacked(0, 4, bits, channels, s_SINT); +} +case VK_FORMAT_A2B10G10R10_UNORM_PACK32: { + int channels[] = {0,1,2,3}; int bits[] = {10,10,10,2}; + return createDFDPacked(0, 4, bits, channels, s_UNORM); +} +case VK_FORMAT_A2B10G10R10_SNORM_PACK32: { + int channels[] = {0,1,2,3}; int bits[] = {10,10,10,2}; + return createDFDPacked(0, 4, bits, channels, s_SNORM); +} +case VK_FORMAT_A2B10G10R10_USCALED_PACK32: { + int channels[] = {0,1,2,3}; int bits[] = {10,10,10,2}; + return createDFDPacked(0, 4, bits, channels, s_USCALED); +} +case VK_FORMAT_A2B10G10R10_SSCALED_PACK32: { + int channels[] = {0,1,2,3}; int bits[] = {10,10,10,2}; + return createDFDPacked(0, 4, bits, channels, s_SSCALED); +} +case VK_FORMAT_A2B10G10R10_UINT_PACK32: { + int channels[] = {0,1,2,3}; int bits[] = {10,10,10,2}; + return createDFDPacked(0, 4, bits, channels, s_UINT); +} +case VK_FORMAT_A2B10G10R10_SINT_PACK32: { + int channels[] = {0,1,2,3}; int bits[] = {10,10,10,2}; + return createDFDPacked(0, 4, bits, channels, s_SINT); +} +case VK_FORMAT_R16_UNORM: return createDFDUnpacked(0, 1, 2, 0, s_UNORM); +case VK_FORMAT_R16_SNORM: return createDFDUnpacked(0, 1, 2, 0, s_SNORM); +case VK_FORMAT_R16_USCALED: return createDFDUnpacked(0, 1, 2, 0, s_USCALED); +case VK_FORMAT_R16_SSCALED: return createDFDUnpacked(0, 1, 2, 0, 
s_SSCALED); +case VK_FORMAT_R16_UINT: return createDFDUnpacked(0, 1, 2, 0, s_UINT); +case VK_FORMAT_R16_SINT: return createDFDUnpacked(0, 1, 2, 0, s_SINT); +case VK_FORMAT_R16_SFLOAT: return createDFDUnpacked(0, 1, 2, 0, s_SFLOAT); +case VK_FORMAT_R16G16_UNORM: return createDFDUnpacked(0, 2, 2, 0, s_UNORM); +case VK_FORMAT_R16G16_SNORM: return createDFDUnpacked(0, 2, 2, 0, s_SNORM); +case VK_FORMAT_R16G16_USCALED: return createDFDUnpacked(0, 2, 2, 0, s_USCALED); +case VK_FORMAT_R16G16_SSCALED: return createDFDUnpacked(0, 2, 2, 0, s_SSCALED); +case VK_FORMAT_R16G16_UINT: return createDFDUnpacked(0, 2, 2, 0, s_UINT); +case VK_FORMAT_R16G16_SINT: return createDFDUnpacked(0, 2, 2, 0, s_SINT); +case VK_FORMAT_R16G16_SFLOAT: return createDFDUnpacked(0, 2, 2, 0, s_SFLOAT); +case VK_FORMAT_R16G16B16_UNORM: return createDFDUnpacked(0, 3, 2, 0, s_UNORM); +case VK_FORMAT_R16G16B16_SNORM: return createDFDUnpacked(0, 3, 2, 0, s_SNORM); +case VK_FORMAT_R16G16B16_USCALED: return createDFDUnpacked(0, 3, 2, 0, s_USCALED); +case VK_FORMAT_R16G16B16_SSCALED: return createDFDUnpacked(0, 3, 2, 0, s_SSCALED); +case VK_FORMAT_R16G16B16_UINT: return createDFDUnpacked(0, 3, 2, 0, s_UINT); +case VK_FORMAT_R16G16B16_SINT: return createDFDUnpacked(0, 3, 2, 0, s_SINT); +case VK_FORMAT_R16G16B16_SFLOAT: return createDFDUnpacked(0, 3, 2, 0, s_SFLOAT); +case VK_FORMAT_R16G16B16A16_UNORM: return createDFDUnpacked(0, 4, 2, 0, s_UNORM); +case VK_FORMAT_R16G16B16A16_SNORM: return createDFDUnpacked(0, 4, 2, 0, s_SNORM); +case VK_FORMAT_R16G16B16A16_USCALED: return createDFDUnpacked(0, 4, 2, 0, s_USCALED); +case VK_FORMAT_R16G16B16A16_SSCALED: return createDFDUnpacked(0, 4, 2, 0, s_SSCALED); +case VK_FORMAT_R16G16B16A16_UINT: return createDFDUnpacked(0, 4, 2, 0, s_UINT); +case VK_FORMAT_R16G16B16A16_SINT: return createDFDUnpacked(0, 4, 2, 0, s_SINT); +case VK_FORMAT_R16G16B16A16_SFLOAT: return createDFDUnpacked(0, 4, 2, 0, s_SFLOAT); +case VK_FORMAT_R32_UINT: return createDFDUnpacked(0, 1, 4, 0, 
s_UINT); +case VK_FORMAT_R32_SINT: return createDFDUnpacked(0, 1, 4, 0, s_SINT); +case VK_FORMAT_R32_SFLOAT: return createDFDUnpacked(0, 1, 4, 0, s_SFLOAT); +case VK_FORMAT_R32G32_UINT: return createDFDUnpacked(0, 2, 4, 0, s_UINT); +case VK_FORMAT_R32G32_SINT: return createDFDUnpacked(0, 2, 4, 0, s_SINT); +case VK_FORMAT_R32G32_SFLOAT: return createDFDUnpacked(0, 2, 4, 0, s_SFLOAT); +case VK_FORMAT_R32G32B32_UINT: return createDFDUnpacked(0, 3, 4, 0, s_UINT); +case VK_FORMAT_R32G32B32_SINT: return createDFDUnpacked(0, 3, 4, 0, s_SINT); +case VK_FORMAT_R32G32B32_SFLOAT: return createDFDUnpacked(0, 3, 4, 0, s_SFLOAT); +case VK_FORMAT_R32G32B32A32_UINT: return createDFDUnpacked(0, 4, 4, 0, s_UINT); +case VK_FORMAT_R32G32B32A32_SINT: return createDFDUnpacked(0, 4, 4, 0, s_SINT); +case VK_FORMAT_R32G32B32A32_SFLOAT: return createDFDUnpacked(0, 4, 4, 0, s_SFLOAT); +case VK_FORMAT_R64_UINT: return createDFDUnpacked(0, 1, 8, 0, s_UINT); +case VK_FORMAT_R64_SINT: return createDFDUnpacked(0, 1, 8, 0, s_SINT); +case VK_FORMAT_R64_SFLOAT: return createDFDUnpacked(0, 1, 8, 0, s_SFLOAT); +case VK_FORMAT_R64G64_UINT: return createDFDUnpacked(0, 2, 8, 0, s_UINT); +case VK_FORMAT_R64G64_SINT: return createDFDUnpacked(0, 2, 8, 0, s_SINT); +case VK_FORMAT_R64G64_SFLOAT: return createDFDUnpacked(0, 2, 8, 0, s_SFLOAT); +case VK_FORMAT_R64G64B64_UINT: return createDFDUnpacked(0, 3, 8, 0, s_UINT); +case VK_FORMAT_R64G64B64_SINT: return createDFDUnpacked(0, 3, 8, 0, s_SINT); +case VK_FORMAT_R64G64B64_SFLOAT: return createDFDUnpacked(0, 3, 8, 0, s_SFLOAT); +case VK_FORMAT_R64G64B64A64_UINT: return createDFDUnpacked(0, 4, 8, 0, s_UINT); +case VK_FORMAT_R64G64B64A64_SINT: return createDFDUnpacked(0, 4, 8, 0, s_SINT); +case VK_FORMAT_R64G64B64A64_SFLOAT: return createDFDUnpacked(0, 4, 8, 0, s_SFLOAT); +case VK_FORMAT_B10G11R11_UFLOAT_PACK32: { + int channels[] = {0,1,2}; int bits[] = {11,11,10}; + return createDFDPacked(0, 3, bits, channels, s_UFLOAT); +} +case 
VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: { + int bits[] = {0}; int channels[] = {0}; + return createDFDPacked(0, 6, bits, channels, s_UFLOAT); +} +case VK_FORMAT_D16_UNORM: return createDFDDepthStencil(16,0,2); +case VK_FORMAT_X8_D24_UNORM_PACK32: return createDFDDepthStencil(24,0,4); +case VK_FORMAT_D32_SFLOAT: return createDFDDepthStencil(32,0,4); +case VK_FORMAT_S8_UINT: return createDFDDepthStencil(0,8,1); +case VK_FORMAT_BC1_RGB_UNORM_BLOCK: return createDFDCompressed(c_BC1_RGB, 4, 4, 1, s_UNORM); +case VK_FORMAT_BC1_RGB_SRGB_BLOCK: return createDFDCompressed(c_BC1_RGB, 4, 4, 1, s_SRGB); +case VK_FORMAT_BC1_RGBA_UNORM_BLOCK: return createDFDCompressed(c_BC1_RGBA, 4, 4, 1, s_UNORM); +case VK_FORMAT_BC1_RGBA_SRGB_BLOCK: return createDFDCompressed(c_BC1_RGBA, 4, 4, 1, s_SRGB); +case VK_FORMAT_BC2_UNORM_BLOCK: return createDFDCompressed(c_BC2, 4, 4, 1, s_UNORM); +case VK_FORMAT_BC2_SRGB_BLOCK: return createDFDCompressed(c_BC2, 4, 4, 1, s_SRGB); +case VK_FORMAT_BC3_UNORM_BLOCK: return createDFDCompressed(c_BC3, 4, 4, 1, s_UNORM); +case VK_FORMAT_BC3_SRGB_BLOCK: return createDFDCompressed(c_BC3, 4, 4, 1, s_SRGB); +case VK_FORMAT_BC4_UNORM_BLOCK: return createDFDCompressed(c_BC4, 4, 4, 1, s_UNORM); +case VK_FORMAT_BC4_SNORM_BLOCK: return createDFDCompressed(c_BC4, 4, 4, 1, s_SNORM); +case VK_FORMAT_BC5_UNORM_BLOCK: return createDFDCompressed(c_BC5, 4, 4, 1, s_UNORM); +case VK_FORMAT_BC5_SNORM_BLOCK: return createDFDCompressed(c_BC5, 4, 4, 1, s_SNORM); +case VK_FORMAT_BC6H_UFLOAT_BLOCK: return createDFDCompressed(c_BC6H, 4, 4, 1, s_UFLOAT); +case VK_FORMAT_BC6H_SFLOAT_BLOCK: return createDFDCompressed(c_BC6H, 4, 4, 1, s_SFLOAT); +case VK_FORMAT_BC7_UNORM_BLOCK: return createDFDCompressed(c_BC7, 4, 4, 1, s_UNORM); +case VK_FORMAT_BC7_SRGB_BLOCK: return createDFDCompressed(c_BC7, 4, 4, 1, s_SRGB); +case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK: return createDFDCompressed(c_ETC2_R8G8B8, 4, 4, 1, s_UNORM); +case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK: return 
createDFDCompressed(c_ETC2_R8G8B8, 4, 4, 1, s_SRGB); +case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK: return createDFDCompressed(c_ETC2_R8G8B8A1, 4, 4, 1, s_UNORM); +case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK: return createDFDCompressed(c_ETC2_R8G8B8A1, 4, 4, 1, s_SRGB); +case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK: return createDFDCompressed(c_ETC2_R8G8B8A8, 4, 4, 1, s_UNORM); +case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK: return createDFDCompressed(c_ETC2_R8G8B8A8, 4, 4, 1, s_SRGB); +case VK_FORMAT_EAC_R11_UNORM_BLOCK: return createDFDCompressed(c_EAC_R11, 4, 4, 1, s_UNORM); +case VK_FORMAT_EAC_R11_SNORM_BLOCK: return createDFDCompressed(c_EAC_R11, 4, 4, 1, s_SNORM); +case VK_FORMAT_EAC_R11G11_UNORM_BLOCK: return createDFDCompressed(c_EAC_R11G11, 4, 4, 1, s_UNORM); +case VK_FORMAT_EAC_R11G11_SNORM_BLOCK: return createDFDCompressed(c_EAC_R11G11, 4, 4, 1, s_SNORM); +case VK_FORMAT_ASTC_4x4_UNORM_BLOCK: return createDFDCompressed(c_ASTC, 4, 4, 1, s_UNORM); +case VK_FORMAT_ASTC_4x4_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 4, 4, 1, s_SRGB); +case VK_FORMAT_ASTC_5x4_UNORM_BLOCK: return createDFDCompressed(c_ASTC, 5, 4, 1, s_UNORM); +case VK_FORMAT_ASTC_5x4_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 5, 4, 1, s_SRGB); +case VK_FORMAT_ASTC_5x5_UNORM_BLOCK: return createDFDCompressed(c_ASTC, 5, 5, 1, s_UNORM); +case VK_FORMAT_ASTC_5x5_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 5, 5, 1, s_SRGB); +case VK_FORMAT_ASTC_6x5_UNORM_BLOCK: return createDFDCompressed(c_ASTC, 6, 5, 1, s_UNORM); +case VK_FORMAT_ASTC_6x5_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 6, 5, 1, s_SRGB); +case VK_FORMAT_ASTC_6x6_UNORM_BLOCK: return createDFDCompressed(c_ASTC, 6, 6, 1, s_UNORM); +case VK_FORMAT_ASTC_6x6_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 6, 6, 1, s_SRGB); +case VK_FORMAT_ASTC_8x5_UNORM_BLOCK: return createDFDCompressed(c_ASTC, 8, 5, 1, s_UNORM); +case VK_FORMAT_ASTC_8x5_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 8, 5, 1, s_SRGB); +case VK_FORMAT_ASTC_8x6_UNORM_BLOCK: return 
createDFDCompressed(c_ASTC, 8, 6, 1, s_UNORM); +case VK_FORMAT_ASTC_8x6_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 8, 6, 1, s_SRGB); +case VK_FORMAT_ASTC_8x8_UNORM_BLOCK: return createDFDCompressed(c_ASTC, 8, 8, 1, s_UNORM); +case VK_FORMAT_ASTC_8x8_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 8, 8, 1, s_SRGB); +case VK_FORMAT_ASTC_10x5_UNORM_BLOCK: return createDFDCompressed(c_ASTC, 10, 5, 1, s_UNORM); +case VK_FORMAT_ASTC_10x5_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 10, 5, 1, s_SRGB); +case VK_FORMAT_ASTC_10x6_UNORM_BLOCK: return createDFDCompressed(c_ASTC, 10, 6, 1, s_UNORM); +case VK_FORMAT_ASTC_10x6_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 10, 6, 1, s_SRGB); +case VK_FORMAT_ASTC_10x8_UNORM_BLOCK: return createDFDCompressed(c_ASTC, 10, 8, 1, s_UNORM); +case VK_FORMAT_ASTC_10x8_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 10, 8, 1, s_SRGB); +case VK_FORMAT_ASTC_10x10_UNORM_BLOCK: return createDFDCompressed(c_ASTC, 10, 10, 1, s_UNORM); +case VK_FORMAT_ASTC_10x10_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 10, 10, 1, s_SRGB); +case VK_FORMAT_ASTC_12x10_UNORM_BLOCK: return createDFDCompressed(c_ASTC, 12, 10, 1, s_UNORM); +case VK_FORMAT_ASTC_12x10_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 12, 10, 1, s_SRGB); +case VK_FORMAT_ASTC_12x12_UNORM_BLOCK: return createDFDCompressed(c_ASTC, 12, 12, 1, s_UNORM); +case VK_FORMAT_ASTC_12x12_SRGB_BLOCK: return createDFDCompressed(c_ASTC, 12, 12, 1, s_SRGB); +case VK_FORMAT_PVRTC1_2BPP_UNORM_BLOCK_IMG: return createDFDCompressed(c_PVRTC, 8, 4, 1, s_UNORM); +case VK_FORMAT_PVRTC1_4BPP_UNORM_BLOCK_IMG: return createDFDCompressed(c_PVRTC, 4, 4, 1, s_UNORM); +case VK_FORMAT_PVRTC2_2BPP_UNORM_BLOCK_IMG: return createDFDCompressed(c_PVRTC2, 8, 4, 1, s_UNORM); +case VK_FORMAT_PVRTC2_4BPP_UNORM_BLOCK_IMG: return createDFDCompressed(c_PVRTC2, 4, 4, 1, s_UNORM); +case VK_FORMAT_PVRTC1_2BPP_SRGB_BLOCK_IMG: return createDFDCompressed(c_PVRTC, 8, 4, 1, s_SRGB); +case VK_FORMAT_PVRTC1_4BPP_SRGB_BLOCK_IMG: return 
createDFDCompressed(c_PVRTC, 4, 4, 1, s_SRGB); +case VK_FORMAT_PVRTC2_2BPP_SRGB_BLOCK_IMG: return createDFDCompressed(c_PVRTC2, 8, 4, 1, s_SRGB); +case VK_FORMAT_PVRTC2_4BPP_SRGB_BLOCK_IMG: return createDFDCompressed(c_PVRTC2, 4, 4, 1, s_SRGB); +case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT: { + int channels[] = {2,1,0,3}; int bits[] = {4,4,4,4}; + return createDFDPacked(0, 4, bits, channels, s_UNORM); +} +case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT: { + int channels[] = {0,1,2,3}; int bits[] = {4,4,4,4}; + return createDFDPacked(0, 4, bits, channels, s_UNORM); +} diff --git a/build/.keep b/build/.keep new file mode 100644 index 0000000..e69de29 diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..b295e4a --- /dev/null +++ b/meson.build @@ -0,0 +1,3 @@ +project('TextureTaffy', 'cpp') + +subdir('Source') \ No newline at end of file