diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml
index 8eedc0ab..8edfedba 100644
--- a/.github/workflows/ccp-workflow.yml
+++ b/.github/workflows/ccp-workflow.yml
@@ -1,32 +1,144 @@
 # taken from https://github.com/onqtam/doctest/blob/master/.github/workflows/main.yml
 
 name: C/C++ CI
-on: push
+on:
+  push:
+  pull_request:
+    types: [opened, reopened, synchronize]  # synchronize = new commits pushed to an open PR
+  workflow_dispatch:  # manually triggered run
+    inputs:
+      job_to_run:
+        description: 'Select the job to run manually'
+        type: 'choice'
+        options:
+          - 'build_arm32'  # currently the only selectable job
+        default: 'build_arm32'
+      log_level:  # NOTE(review): no job visible in this diff reads this input - confirm it is needed
+        description: 'Log level'
+        required: false
+        default: 'info'
+
 
 jobs:
   build:
-    name: main build for Unix-like
-    runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false  # let the other matrix entries finish when one of them fails
+      matrix:
+        include: [
+          { system: MacOS,          runner: macos-latest },
+          { system: Ubuntu-22,      runner: ubuntu-22.04 },
+          { system: Ubuntu-latest,  runner: ubuntu-latest },
+        ]
+    name: ${{ matrix.system }} Build
+    runs-on: ${{ matrix.runner }}
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.job_to_run != 'build_arm32' }}  # skipped when a manual run selects build_arm32
+    steps:
+    - uses: actions/checkout@v5
+    - name: cmake
+      run: cmake -DOJPH_BUILD_STREAM_EXPAND=ON ..  # also builds the ojph_stream_expand executable
+      working-directory: build  # NOTE(review): no step creates build/ - presumably it is tracked in the repository
+    - name: build
+      run: make
+      working-directory: build
+
+  build_mac:  # macOS build covering arm64 and x86_64 in one configuration
+    strategy:
+      fail-fast: false
       matrix:
-        os: [macos-latest, ubuntu-20.04, ubuntu-latest]
+        include: [
+          { system: MacOS Dual, runner: macos-latest },
+        ]
+    name: ${{ matrix.system }} Build
+    runs-on: ${{ matrix.runner }}
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.job_to_run != 'build_arm32' }}  # skipped when a manual run selects build_arm32
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v5
     - name: cmake
-      run: cmake ..
+      run: cmake -DOJPH_BUILD_STREAM_EXPAND=ON -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" -DOJPH_ENABLE_TIFF_SUPPORT=OFF ..  # multi-architecture build, TIFF support disabled
       working-directory: build
     - name: build
       run: make
       working-directory: build
 
+  build_windows:  # x64 Visual Studio build, no tests
+    strategy:
+      fail-fast: false
+      matrix:
+        include: [
+          { system: Windows,  runner: windows-latest },
+        ]
+    name: ${{ matrix.system }} Build
+    runs-on: ${{ matrix.runner }}
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.job_to_run != 'build_arm32' }}  # skipped when a manual run selects build_arm32
+    steps:
+    - uses: actions/checkout@v5
+    - name: cmake
+      run: cmake -G "Visual Studio 17 2022" -A x64 -DOJPH_ENABLE_TIFF_SUPPORT=OFF -DOJPH_BUILD_STREAM_EXPAND=ON ..  # TIFF support off, ojph_stream_expand on
+      working-directory: build
+    - name: build
+      run: cmake --build . --config Release  # Visual Studio generators pick the configuration at build time
+      working-directory: build
+
+  build_msys2:  # Windows build inside the MSYS2 UCRT64 environment
+    strategy:
+      fail-fast: false
+      matrix:
+        include: [
+          { system: Windows-MSYS2,  runner: windows-latest },
+        ]
+    name: ${{ matrix.system }} Build
+    runs-on: ${{ matrix.runner }}
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.job_to_run != 'build_arm32' }}  # skipped when a manual run selects build_arm32
+    defaults:
+      run:
+        shell: msys2 {0}  # every `run:` step below executes through the msys2 shell
+    steps:
+    - uses: actions/checkout@v5
+    - uses: msys2/setup-msys2@v2
+      with:
+        msystem: UCRT64
+        update: false
+        pacboy: cc:p cmake:p libtiff:p  # libtiff is installed because TIFF support is left at its default (ON) here
+    - name: cmake
+      run: cmake -DOJPH_BUILD_STREAM_EXPAND=ON ..  # also builds the ojph_stream_expand executable
+      working-directory: build
+    - name: build
+      run: cmake --build . --config Release
+      working-directory: build
+
+  build_windows_on_arm:  # ARM64 Visual Studio build on the windows-11-arm runner, no tests
+    strategy:
+      fail-fast: false
+      matrix:
+        include: [
+          { system: WindowsOnARM,  runner: windows-11-arm },
+        ]
+    name: ${{ matrix.system }} Build
+    runs-on: ${{ matrix.runner }}
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.job_to_run != 'build_arm32' }}  # skipped when a manual run selects build_arm32
+    steps:
+    - uses: actions/checkout@v5
+    - name: cmake
+      run: cmake -G "Visual Studio 17 2022" -A ARM64 -DOJPH_ENABLE_TIFF_SUPPORT=OFF -DOJPH_BUILD_STREAM_EXPAND=ON ..  # same flags as build_windows, but -A ARM64
+      working-directory: build
+    - name: build
+      run: cmake --build . --config Release
+      working-directory: build
+
   test:
-    name: tests on Linux and MacOS
-    runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
-        os: [macos-latest, ubuntu-latest]
+        include: [
+          { system: MacOS-Intel,    runner: macos-15-intel },
+          { system: MacOS-latest,   runner: macos-latest },
+          { system: Ubuntu-latest,  runner: ubuntu-latest },
+        ]
+    name: ${{ matrix.system }} Test
+    runs-on: ${{ matrix.runner }}
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.job_to_run != 'build_arm32' }}
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v5
     - name: cmake
       run: cmake -DOJPH_BUILD_TESTS=yes ..
       working-directory: build
@@ -36,17 +148,21 @@ jobs:
     - name: test
       run: ctest --output-on-failure
       working-directory: build
-      
+
   test_windows:
-    name: tests on Windows
-    runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
-        os: [windows-latest]
+        include: [
+          { system: Windows,  runner: windows-latest },
+        ]
+    name: ${{ matrix.system }} Test
+    runs-on: ${{ matrix.runner }}
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.job_to_run != 'build_arm32' }}
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v5
     - name: cmake
-      run: cmake -G "Visual Studio 17 2022" -A x64 -DOJPH_ENABLE_TIFF_SUPPORT=off -DOJPH_BUILD_TESTS=yes ..
+      run: cmake -G "Visual Studio 17 2022" -A x64 -DOJPH_ENABLE_TIFF_SUPPORT=OFF -DOJPH_BUILD_TESTS=ON ..
       working-directory: build
     - name: build
       run: cmake --build . --config Release
@@ -55,241 +171,74 @@ jobs:
       run: ctest --output-on-failure -C Release
       working-directory: build
 
-#jobs:
-#  ci:
-#    name: ${{ matrix.name }}
-#    runs-on: ${{ matrix.os }}
-#
-#    strategy:
-#      fail-fast: false
-#      matrix:
-#        # Github Actions requires a single row to be added to the build matrix.
-#        # See https://help.github.com/en/articles/workflow-syntax-for-github-actions.
-#        name: [
-#          ubuntu-18.04-gcc-4.8,
-#          ubuntu-18.04-gcc-4.9,
-#          ubuntu-18.04-gcc-5,
-#          ubuntu-18.04-gcc-6,
-#          ubuntu-18.04-gcc-7,
-#          ubuntu-18.04-gcc-8,
-#          ubuntu-18.04-gcc-9,
-#          ubuntu-18.04-clang-3.5,
-#          ubuntu-18.04-clang-3.6,
-#          ubuntu-18.04-clang-3.7,
-#          ubuntu-18.04-clang-3.8,
-#          ubuntu-18.04-clang-3.9,
-#          ubuntu-18.04-clang-4.0,
-#          ubuntu-18.04-clang-5.0,
-#          ubuntu-18.04-clang-6.0,
-#          ubuntu-18.04-clang-7,
-#          ubuntu-18.04-clang-8,
-#          ubuntu-18.04-clang-9,
-#          macOS-10.14-xcode-9.4.1,
-#          macOS-10.14-xcode-10.0,
-#          macOS-10.14-xcode-10.1,
-#          macOS-10.14-xcode-10.2,
-#          macOS-10.14-xcode-10.2.1,
-#          macOS-10.14-xcode-10.3,
-#          macOS-10.14-gcc-7,
-#          macOS-10.14-gcc-8,
-#          macOS-10.14-gcc-9,
-#        ]
-#
-#        include:
-#          - name: ubuntu-18.04-gcc-4.8
-#            os: ubuntu-18.04
-#            compiler: gcc
-#            version: "4.8"
-#
-#          - name: ubuntu-18.04-gcc-4.9
-#            os: ubuntu-18.04
-#            compiler: gcc
-#            version: "4.9"
-#
-#          - name: ubuntu-18.04-gcc-5
-#            os: ubuntu-18.04
-#            compiler: gcc
-#            version: "5"
-#
-#          - name: ubuntu-18.04-gcc-6
-#            os: ubuntu-18.04
-#            compiler: gcc
-#            version: "6"
-#
-#          - name: ubuntu-18.04-gcc-7
-#            os: ubuntu-18.04
-#            compiler: gcc
-#            version: "7"
-#
-#          - name: ubuntu-18.04-gcc-8
-#            os: ubuntu-18.04
-#            compiler: gcc
-#            version: "8"
-#
-#          - name: ubuntu-18.04-gcc-9
-#            os: ubuntu-18.04
-#            compiler: gcc
-#            version: "9"
-#
-#          - name: ubuntu-18.04-clang-3.5
-#            os: ubuntu-18.04
-#            compiler: clang
-#            version: "3.5"
-#
-#          - name: ubuntu-18.04-clang-3.6
-#            os: ubuntu-18.04
-#            compiler: clang
-#            version: "3.6"
-#
-#          - name: ubuntu-18.04-clang-3.7
-#            os: ubuntu-18.04
-#            compiler: clang
-#            version: "3.7"
-#
-#          - name: ubuntu-18.04-clang-3.8
-#            os: ubuntu-18.04
-#            compiler: clang
-#            version: "3.8"
-#
-#          - name: ubuntu-18.04-clang-3.9
-#            os: ubuntu-18.04
-#            compiler: clang
-#            version: "3.9"
-#
-#          - name: ubuntu-18.04-clang-4.0
-#            os: ubuntu-18.04
-#            compiler: clang
-#            version: "4.0"
-#
-#          - name: ubuntu-18.04-clang-5.0
-#            os: ubuntu-18.04
-#            compiler: clang
-#            version: "5.0"
-#
-#          - name: ubuntu-18.04-clang-6.0
-#            os: ubuntu-18.04
-#            compiler: clang
-#            version: "6.0"
-#
-#          - name: ubuntu-18.04-clang-7
-#            os: ubuntu-18.04
-#            compiler: clang
-#            version: "7"
-#
-#          - name: ubuntu-18.04-clang-8
-#            os: ubuntu-18.04
-#            compiler: clang
-#            version: "8"
-#
-#          - name: ubuntu-18.04-clang-9
-#            os: ubuntu-18.04
-#            compiler: clang
-#            version: "9"
-#
-#          - name: macOS-10.14-xcode-9.4.1
-#            os: macOS-10.14
-#            compiler: xcode
-#            version: "9.4.1"
-#
-#          - name: macOS-10.14-xcode-10.0
-#            os: macOS-10.14
-#            compiler: xcode
-#            version: "10"
-#
-#          - name: macOS-10.14-xcode-10.1
-#            os: macOS-10.14
-#            compiler: xcode
-#            version: "10.1"
-#
-#          - name: macOS-10.14-Xcode-10.2
-#            os: macOS-10.14
-#            compiler: xcode
-#            version: "10.2"
-#
-#          - name: macOS-10.14-xcode-10.2.1
-#            os: macOS-10.14
-#            compiler: xcode
-#            version: "10.2.1"
-#
-#          - name: macOS-10.14-xcode-10.3
-#            os: macOS-10.14
-#            compiler: xcode
-#            version: "10.3"
-#
-#          - name: macOS-10.14-gcc-7
-#            os: macOS-10.14
-#            compiler: gcc
-#            version: "7"
-#
-#          - name: macOS-10.14-gcc-8
-#            os: macOS-10.14
-#            compiler: gcc
-#            version: "8"
-#
-#          - name: macOS-10.14-gcc-9
-#            os: macOS-10.14
-#            compiler: gcc
-#            version: "9"
-#
-#    steps:
-#    - uses: actions/checkout@v2
-#    - name: cmake
-#      run: cmake ..
-#      working-directory: build
-#    - name: build
-#      run: make
-#      working-directory: build
-
+  test_msys2:  # configure, build and run ctest inside the MSYS2 UCRT64 environment
+    strategy:
+      fail-fast: false
+      matrix:
+        include: [
+          { system: Windows-MSYS2,  runner: windows-latest },
+        ]
+    name: ${{ matrix.system }} Test
+    runs-on: ${{ matrix.runner }}
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.job_to_run != 'build_arm32' }}  # skipped when a manual run selects build_arm32
+    defaults:
+      run:
+        shell: msys2 {0}  # every `run:` step below executes through the msys2 shell
+    steps:
+    - uses: actions/checkout@v5
+    - uses: msys2/setup-msys2@v2
+      with:
+        msystem: UCRT64
+        update: false
+        pacboy: cc:p cmake:p python:p  # no libtiff here: TIFF support is switched off in the cmake step
+    - name: cmake
+      run: cmake -DOJPH_ENABLE_TIFF_SUPPORT=OFF -DOJPH_BUILD_TESTS=ON -DPython3_EXECUTABLE=${MINGW_PREFIX}/bin/python.exe ..  # ${MINGW_PREFIX} is expanded by the shell; presumably the python installed by pacboy above
+      working-directory: build
+    - name: build
+      run: cmake --build . --config Release
+      working-directory: build
+    - name: test
+      run: ctest --output-on-failure -C Release  # print the output of failing tests only
+      working-directory: build
 
+  test_windows_on_arm:  # configure, build and run ctest for ARM64 on the windows-11-arm runner
+    strategy:
+      fail-fast: false
+      matrix:
+        include: [
+          { system: WindowsOnARM,  runner: windows-11-arm },
+        ]
+    name: ${{ matrix.system }} Test
+    runs-on: ${{ matrix.runner }}
+    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.job_to_run != 'build_arm32' }}  # skipped when a manual run selects build_arm32
+    steps:
+    - uses: actions/checkout@v5
+    - name: cmake
+      run: cmake -G "Visual Studio 17 2022" -A ARM64 -DOJPH_ENABLE_TIFF_SUPPORT=OFF -DOJPH_BUILD_TESTS=ON ..  # tests enabled, TIFF support off
+      working-directory: build
+    - name: build
+      run: cmake --build . --config Release
+      working-directory: build
+    - name: test
+      run: ctest --output-on-failure -C Release  # -C must name the configuration that was built
+      working-directory: build
 
-#  build1:
-#    name: main build for Unix-like
-#    runs-on: ${{ matrix.os }}
-#    strategy:
-#      matrix:
-#        os: [macos-10.14, macos-latest, ubuntu-16.04, ubuntu-latest]
-#        node: [8]
-#    steps:
-#    - uses: actions/checkout@v2
-#    - name: cmake
-#      run: cmake ..
-#      working-directory: build
-#    - name: build
-#      run: make
-#      working-directory: build
-#
-#  build2:
-#    name: main build for Windows
-#    runs-on: ${{ matrix.os }}
-#    strategy:
-#      matrix:
-#        os: [windows-latest]
-#        node: [8]
-#    steps:
-#    - uses: actions/checkout@v2
-#    - name: install visual studio
-#      run: python install.py --clang-completer --ts-completer --msvc=14
-#    - name: cmake
-#      run: cmake -G "Visual Studio 14 2015 Win64" ..
-#      working-directory: build
-#    - name: build
-#      run: cmake --build .
-#      working-directory: build
-#
-#  build3:
-#    name: main build for Windows
-#    runs-on: ${{ matrix.os }}
-#    strategy:
-#      matrix:
-#        os: [windows-latest]
-#        node: [8]
-#    steps:
-#    - uses: actions/checkout@v2
-#    - name: install visual studio
-#      run: python install.py --clang-completer --ts-completer --msvc=14
-#    - name: cmake
-#      run: cmake -G "Visual Studio 15 2017 Win64" ..
-#      working-directory: build
-#    - name: build
-#      run: cmake --build .
-#      working-directory: build
+  build_arm32:  # manual-only job; despite the name it also runs ctest
+    name: Linux-ARM32 Build
+    runs-on: ubuntu-latest
+    if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.job_to_run == 'build_arm32' }}  # inverse of the condition used by every other job
+    steps:
+    - uses: actions/checkout@v5
+    - uses: uraimo/run-on-arch-action@v2  # executes the `install` and `run` scripts below for arch armv7 / distro ubuntu22.04
+      with:
+        arch: armv7
+        distro: ubuntu22.04
+        githubToken: ${{ github.token }}
+        install: |
+          apt-get update -q -y
+          apt-get install -q -y cmake make g++ libtiff-dev python3
+        run: |
+          cd build
+          cmake -DCMAKE_BUILD_TYPE=Release -DOJPH_BUILD_STREAM_EXPAND=ON -DOJPH_ENABLE_TIFF_SUPPORT=OFF -DOJPH_BUILD_TESTS=ON ..
+          make
+          ctest --output-on-failure
diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml
new file mode 100644
index 00000000..ff61ac5a
--- /dev/null
+++ b/.github/workflows/cifuzz.yml
@@ -0,0 +1,24 @@
+name: CIFuzz
+on: [pull_request]
+jobs:
+  Fuzzing:  # builds and runs the OSS-Fuzz targets of project "openjph" on every pull request
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v5
+      - name: Build Fuzzers
+        id: build  # referenced by the Upload Crash condition
+        uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master  # tracks the master branch, not a pinned release
+        with:
+          oss-fuzz-project-name: "openjph"
+      - name: Run Fuzzers
+        uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
+        with:
+          oss-fuzz-project-name: "openjph"
+          fuzz-seconds: 600
+      - name: Upload Crash
+        uses: actions/upload-artifact@v4
+        if: failure() && steps.build.outcome == 'success'  # only when the fuzzers built but a later step failed
+        with:
+          name: artifacts
+          path: ./out/artifacts
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index fe9d084c..903c9b79 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -38,7 +38,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        language: [ 'cpp', 'python' ]
+        language: [ 'cpp' ]
         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby', 'swift' ]
         # Use only 'java' to analyze code written in Java, Kotlin or both
         # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both
@@ -46,11 +46,11 @@ jobs:
 
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@v2
+      uses: github/codeql-action/init@v4
       with:
         languages: ${{ matrix.language }}
         # If you wish to specify custom queries, you can do so here or in a config file.
@@ -64,7 +64,7 @@ jobs:
     # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).
     # If this step fails, then you should remove it and run the build manually (see below)
     - name: Autobuild
-      uses: github/codeql-action/autobuild@v2
+      uses: github/codeql-action/autobuild@v4
 
     # ℹ️ Command-line programs to run using the OS shell.
     # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
@@ -77,6 +77,6 @@ jobs:
     #     ./location_of_script_within_repo/buildscript.sh
 
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@v2
+      uses: github/codeql-action/analyze@v4
       with:
         category: "/language:${{matrix.language}}"
diff --git a/.github/workflows/emcc.yml b/.github/workflows/emcc.yml
new file mode 100644
index 00000000..48298f61
--- /dev/null
+++ b/.github/workflows/emcc.yml
@@ -0,0 +1,47 @@
+name: Build with EMCC
+
+on:
+  push:
+  pull_request:
+    types: [opened, reopened, synchronize]  # synchronize: re-run when new commits are pushed to an open PR
+
+jobs:
+  build:  # four sequential Emscripten builds (SIMD on/off x Debug/Release) sharing one build/ directory
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Configure emcc
+        uses: mymindstorm/setup-emsdk@v14  # presumably what puts emcmake on PATH for the steps below
+        with:
+          actions-cache-folder: 'emsdk-cache'
+
+      - name: Build non-SIMD and Debug  # each step passes --fresh and --clean-first, so no state carries over from the previous build
+        run: |
+          cd build
+          emcmake cmake .. --fresh -DOJPH_DISABLE_SIMD=ON -DCMAKE_BUILD_TYPE=Debug
+          cmake --build . --config Debug --clean-first
+          cd ..
+
+      - name: Build non-SIMD and Release
+        run: |
+          cd build
+          emcmake cmake .. --fresh -DOJPH_DISABLE_SIMD=ON -DCMAKE_BUILD_TYPE=Release
+          cmake --build . --config Release --clean-first
+          cd ..
+
+      - name: Build SIMD and Debug
+        run: |
+          cd build
+          emcmake cmake .. --fresh -DOJPH_DISABLE_SIMD=OFF -DCMAKE_BUILD_TYPE=Debug
+          cmake --build . --config Debug --clean-first
+          cd ..
+
+      - name: Build SIMD and Release
+        run: |
+          cd build
+          emcmake cmake .. --fresh -DOJPH_DISABLE_SIMD=OFF -DCMAKE_BUILD_TYPE=Release
+          cmake --build . --config Release --clean-first
+          cd ..
diff --git a/.gitignore b/.gitignore
index 627f2909..c101eb7d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 .DS_Store
 mytest/*
 others/*
+lib/*
 
 .vscode
 build.sh
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dbdeff43..7d375a63 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,36 +1,101 @@
-cmake_minimum_required(VERSION 3.11.0)
+cmake_minimum_required(VERSION 3.12.0)
 
-project (openjph DESCRIPTION "Open source implementation of JPH" LANGUAGES CXX)
+## Library name/version
+include(ojph_version.cmake)
+
+## project
+project (openjph VERSION ${OPENJPH_VERSION} DESCRIPTION "Open source implementation of JPH" LANGUAGES C CXX)
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
 ################################################################################################
 # Building OpenJPH
 ################################################################################################
 
-############################################################
-# Parse version file
-# credit: https://stackoverflow.com/a/47084079
-
-file(READ "${CMAKE_CURRENT_SOURCE_DIR}/src/core/common/ojph_version.h" VERFILE)
-if (NOT VERFILE)
-    message(FATAL_ERROR "Failed to parse ojph_version.h!")
+## Target architecture
+# We use the target architecture to help with arranging files in "source_group" commands.
+# The code does not use the results provided by target_arch.cmake, and relies, instead,
+# on its own logic, which matches that in target_arch.cmake, to identify the architecture
+include(target_arch.cmake)
+target_architecture(OJPH_TARGET_ARCH)
+message(STATUS "CPU Architecture is ${OJPH_TARGET_ARCH}")
+
+## Building for multi-generation
+# This is useful when we are building for more than one architecture at once, such as when using
+# the -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" build configuration; MULTI_GEN_* records which were requested
+if (CMAKE_OSX_ARCHITECTURES)
+  list(FIND CMAKE_OSX_ARCHITECTURES "x86_64" x86_64_index)  # index is -1 when the entry is absent
+  if (${x86_64_index} GREATER -1)
+    set(MULTI_GEN_X86_64 TRUE)
+  endif()
+  unset(x86_64_index)  # scratch variable, not needed past this point
+  list(FIND CMAKE_OSX_ARCHITECTURES "arm64" arm64_index)  # same check for arm64
+  if (${arm64_index} GREATER -1)
+    set(MULTI_GEN_ARM64 TRUE)
+  endif()
+  unset(arm64_index)
 endif()
 
-string(REGEX MATCH "OPENJPH_VERSION_MAJOR ([0-9]*)" _ ${VERFILE})
-set(OPENJPH_VERSION_MAJOR ${CMAKE_MATCH_1})
-string(REGEX MATCH "OPENJPH_VERSION_MINOR ([0-9]*)" _ ${VERFILE})
-set(OPENJPH_VERSION_MINOR ${CMAKE_MATCH_1})
-string(REGEX MATCH "OPENJPH_VERSION_PATCH ([a-z0-9]*)" _ ${VERFILE})
-set(OPENJPH_VERSION_PATCH ${CMAKE_MATCH_1})
-
-set(OPENJPH_VERSION "${OPENJPH_VERSION_MAJOR}.${OPENJPH_VERSION_MINOR}.${OPENJPH_VERSION_PATCH}")
-############################################################
-
-option(OJPH_DISABLE_INTEL_SIMD "Disables the use of SIMD instructions and associated files" OFF)
-option(OJPH_ENABLE_INTEL_AVX512 "enables the use of AVX512 SIMD instructions and associated files" ON)
+## options
 option(BUILD_SHARED_LIBS "Shared Libraries" ON)
 option(OJPH_ENABLE_TIFF_SUPPORT "Enables input and output support for TIFF files" ON)
 option(OJPH_BUILD_TESTS "Enables building test code" OFF)
 option(OJPH_BUILD_EXECUTABLES "Enables building command line executables" ON)
+option(OJPH_BUILD_STREAM_EXPAND "Enables building ojph_stream_expand executable" OFF)
+option(OJPH_BUILD_FUZZER "Enables building oss-fuzzing target executable" OFF)
+
+option(OJPH_DISABLE_SIMD "Disables the use of SIMD instructions -- agnostic to architectures" OFF)
+option(OJPH_DISABLE_SSE "Disables the use of SSE SIMD instructions and associated files" OFF)
+option(OJPH_DISABLE_SSE2 "Disables the use of SSE2 SIMD instructions and associated files" OFF)
+option(OJPH_DISABLE_SSSE3 "Disables the use of SSSE3 SIMD instructions and associated files" OFF)
+option(OJPH_DISABLE_SSE4 "Disables the use of SSE4 SIMD instructions and associated files" OFF)
+option(OJPH_DISABLE_AVX "Disables the use of AVX SIMD instructions and associated files" OFF)
+option(OJPH_DISABLE_AVX2 "Disables the use of AVX2 SIMD instructions and associated files" OFF)
+option(OJPH_DISABLE_AVX512 "Disables the use of AVX512 SIMD instructions and associated files" OFF)
+option(OJPH_DISABLE_NEON "Disables the use of NEON SIMD instructions and associated files" OFF)
+
+## options that are being deprecated
+if (DEFINED OJPH_DISABLE_INTEL_SIMD)  # no option() declares the old name any more, so this is true only if it was set externally (e.g. -D)
+  message(STATUS "OJPH_DISABLE_INTEL_SIMD is being deprecated. Instead, use \"OJPH_DISABLE_SIMD\", "
+                 "which is architecture agnostic. If you do not specify any, the default is "
+                 "OJPH_DISABLE_SIMD=OFF.")
+  set(OJPH_DISABLE_SIMD ${OJPH_DISABLE_INTEL_SIMD})  # the old option's value is carried over to the new name
+  message(STATUS "OJPH_DISABLE_SIMD is set to ${OJPH_DISABLE_SIMD}")
+  unset(OJPH_DISABLE_INTEL_SIMD)
+endif()
+if (DEFINED OJPH_ENABLE_INTEL_AVX512)  # deprecated name with the opposite polarity of OJPH_DISABLE_AVX512
+  message(STATUS "OJPH_ENABLE_INTEL_AVX512 is being deprecated, use \"OJPH_DISABLE_AVX512\" instead. "
+                 "If you do not specify any, the default is OJPH_DISABLE_AVX512=OFF.")
+  if (OJPH_ENABLE_INTEL_AVX512)  # invert: ENABLE=ON maps to DISABLE=OFF and vice versa
+    set(OJPH_DISABLE_AVX512 OFF)
+  else()
+    set(OJPH_DISABLE_AVX512 ON)
+  endif()
+  message(STATUS "OJPH_DISABLE_AVX512 is set to ${OJPH_DISABLE_AVX512}")
+  unset(OJPH_ENABLE_INTEL_AVX512)
+endif()
+
+## Setting some of the options if EMSCRIPTEN is the compiler
+# In previous releases, the cmake script used to produce both non-SIMD and
+# SIMD builds in one go.  At the time of this writing, all interpreters and
+# compilers of WASM code, such as web-browser and node, support SIMD, therefore
+# it is time to make the SIMD build the default.  In other words, this cmake
+# script builds only WASM SIMD code by default, if desired, a non-SIMD build
+# can be generated using the OJPH_DISABLE_SIMD option (in this case, the
+# WASM SIMD code is not generated).
+# It is worth remembering that the SIMD/non-SIMD issue arose because it is
+# NOT possible to have multiple execution paths in the code, one for non-SIMD
+# and one for SIMD, as we do for CPUs, letting the program select, at run-time,
+# the best path to follow.
+if(EMSCRIPTEN)
+  set(BUILD_SHARED_LIBS OFF)  # these three are forced off for Emscripten builds
+  set(OJPH_ENABLE_TIFF_SUPPORT OFF)
+  set(OJPH_BUILD_STREAM_EXPAND OFF)
+  if (OJPH_DISABLE_SIMD)  # WASM SIMD follows the architecture-agnostic switch
+    set(OJPH_ENABLE_WASM_SIMD OFF)
+  else()
+    set(OJPH_ENABLE_WASM_SIMD ON)  # consumed outside this chunk
+  endif()
+endif()
 
 # This is related to how the timestamp is set for URL downloaded files.
 # Set DOWNLOAD_EXTRACT_TIMESTAMP
@@ -40,255 +105,146 @@ if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24.0")
   endif()
 endif()
 
+## Added by Michael Smith
 set(CMAKE_CXX_FLAGS_ASAN
   "-fsanitize=address -fno-optimize-sibling-calls -fsanitize-address-use-after-scope -fno-omit-frame-pointer -g -O1"
   CACHE STRING "Flags used by the C++ compiler during AddressSanitizer builds."
   FORCE)
 
+## Build type
 if (NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE "Release")
-  message( STATUS "To use AddressSanitizer, use \"cmake .. -DCMAKE_BUILD_TYPE=asan\"" )
+  message(STATUS "To use AddressSanitizer, use \"cmake .. -DCMAKE_BUILD_TYPE=asan\"" )
 endif()
 message(STATUS "Building ${CMAKE_BUILD_TYPE}")
 
-set(CMAKE_CXX_STANDARD 14)
-if (MSVC)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /D \"_CRT_SECURE_NO_WARNINGS\"")
-endif()
-if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions -Wall -Wextra -Wconversion -Wunused-parameter")
-endif()
-
-if (OJPH_DISABLE_INTEL_SIMD)
-	if (MSVC)
-		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D \"OJPH_DISABLE_INTEL_SIMD\"")
-	else()
-		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DOJPH_DISABLE_INTEL_SIMD")
-	endif()
-endif()
-
-if (OJPH_ENABLE_INTEL_AVX512)
-	if (MSVC)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D \"OJPH_ENABLE_INTEL_AVX512\"")
-	else()
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DOJPH_ENABLE_INTEL_AVX512")
-	endif()
+## C++ version and flags
+# C11 is needed for aligned_alloc
+if (NOT CMAKE_C_STANDARD)
+  set(CMAKE_C_STANDARD 11)
 endif()
-
-if (BUILD_SHARED_LIBS AND MSVC)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D \"OJPH_BUILD_SHARED_LIBRARY\"")
-endif()
-
-if (OJPH_CODE_COVERAGE AND NOT MSVC)
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage")
+message(STATUS "C Standard is set to ${CMAKE_C_STANDARD}")
+# C++14 is needed for gtest, otherwise, C++11 is sufficient for the library
+if (NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 14)
 endif()
+message(STATUS "C++ Standard is set to ${CMAKE_CXX_STANDARD}")
 
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/../bin)
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/../bin)
-
-include_directories(src/core/common)
-include_directories(src/apps/common)
-
-file(GLOB CODESTREAM       "src/core/codestream/*.cpp" "src/core/codestream/*.h")
-file(GLOB CODESTREAM_SSE   "src/core/codestream/*_sse.cpp")
-file(GLOB CODESTREAM_SSE2  "src/core/codestream/*_sse2.cpp")
-file(GLOB CODESTREAM_AVX   "src/core/codestream/*_avx.cpp")
-file(GLOB CODESTREAM_AVX2  "src/core/codestream/*_avx2.cpp")
-file(GLOB CODESTREAM_WASM  "src/core/codestream/*_wasm.cpp")
-file(GLOB CODING           "src/core/coding/*.cpp" "src/core/coding/*.h")
-file(GLOB CODING_SSSE3     "src/core/coding/*_ssse3.cpp")
-file(GLOB CODING_WASM      "src/core/coding/*_wasm.cpp")
-file(GLOB CODING_AVX512    "src/core/coding/*_avx512.cpp")
-file(GLOB COMMON           "src/core/common/*.h")
-file(GLOB OTHERS           "src/core/others/*.cpp")
-file(GLOB TRANSFORM        "src/core/transform/*.cpp" "src/core/transform/*.h")
-file(GLOB TRANSFORM_SSE    "src/core/transform/*_sse.cpp")
-file(GLOB TRANSFORM_SSE2   "src/core/transform/*_sse2.cpp")
-file(GLOB TRANSFORM_AVX    "src/core/transform/*_avx.cpp")
-file(GLOB TRANSFORM_AVX2   "src/core/transform/*_avx2.cpp")
-file(GLOB TRANSFORM_WASM   "src/core/transform/*_wasm.cpp")
-
-list(REMOVE_ITEM CODESTREAM ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2} ${CODESTREAM_WASM})
-list(REMOVE_ITEM CODING ${CODING_SSSE3} ${CODING_WASM} ${CODING_AVX512})
-list(REMOVE_ITEM TRANSFORM ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2} ${TRANSFORM_WASM})
-list(APPEND SOURCES ${CODESTREAM} ${CODING} ${COMMON} ${OTHERS} ${TRANSFORM})
-
-source_group("codestream"        FILES ${CODESTREAM})
-source_group("coding"            FILES ${CODING})
-source_group("common"            FILES ${COMMON})
-source_group("others"            FILES ${OTHERS})
-source_group("transform"         FILES ${TRANSFORM})
-
-configure_file(
-  "${CMAKE_CURRENT_SOURCE_DIR}/src/pkg-config.pc.cmake"
-  "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/${PROJECT_NAME}.pc"
-)
-
-if(EMSCRIPTEN)
-  set(OJPH_DISABLE_INTEL_SIMD ON)
-  set(BUILD_SHARED_LIBS OFF)
-  set(OJPH_ENABLE_TIFF_SUPPORT OFF)
-  add_compile_options(-std=c++11 -O3 -fexceptions -DOJPH_DISABLE_INTEL_SIMD)
-  add_library(openjph ${SOURCES})
-  add_library(openjphsimd ${SOURCES} ${CODESTREAM_WASM} ${CODING_WASM} ${TRANSFORM_WASM})
-  target_include_directories(openjph PUBLIC src/core/common)
-  target_include_directories(openjphsimd PUBLIC src/core/common)
-  target_compile_options(openjphsimd PRIVATE -DOJPH_ENABLE_WASM_SIMD -msimd128)
-  source_group("codestream" FILES ${CODESTREAM_WASM})
-  source_group("coding" FILES ${CODING_WASM})
-  source_group("transform" FILES ${TRANSFORM_WASM})
-elseif(NOT OJPH_DISABLE_INTEL_SIMD)
-  add_library(openjph ${SOURCES} ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2} ${CODING_SSSE3} ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2})
-  source_group("codestream" FILES ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2})
-  source_group("coding" FILES ${CODING_SSSE3})
-  source_group("transform" FILES ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2})
-  if (OJPH_ENABLE_INTEL_AVX512)
-    target_sources(openjph PRIVATE ${CODING_AVX512})
-    source_group("coding" FILES ${CODING_AVX512})
+if (MSVC)
+  add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+endif()
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU" AND NOT MSVC)
+  add_compile_options(
+  -fexceptions
+  -Wall
+  -Wextra
+  -Wconversion
+  -Wunused-parameter
+  )
+endif()
+if (EMSCRIPTEN)
+  add_compile_options(-fexceptions)
+  if(OJPH_ENABLE_WASM_SIMD)
+    add_compile_options(-DOJPH_ENABLE_WASM_SIMD -msimd128)
   endif()
-else()
-  add_library(openjph ${SOURCES})
 endif()
 
-target_include_directories(openjph PUBLIC src/core/common)
-
-target_compile_definitions(openjph PUBLIC _FILE_OFFSET_BITS=64)
-
-if (OPENJPH_VERSION)
-  if (WIN32)
-    set_target_properties(openjph
-      PROPERTIES
-        OUTPUT_NAME "openjph.${OPENJPH_VERSION_MAJOR}.${OPENJPH_VERSION_MINOR}")
-  else()
-    set_target_properties(openjph
-      PROPERTIES
-        SOVERSION "${OPENJPH_VERSION_MAJOR}.${OPENJPH_VERSION_MINOR}"
-        VERSION "${OPENJPH_VERSION}")
-  endif()
+## Enhanced instruction options
+if (OJPH_DISABLE_SIMD)
+  add_compile_definitions(OJPH_DISABLE_SIMD)
 else()
-  message(FATAL_ERROR "OPENJPH_VERSION is not set")
+  if(OJPH_DISABLE_SSE)
+    add_compile_definitions(OJPH_DISABLE_SSE)
+  endif()
+  if(OJPH_DISABLE_SSE2)
+    add_compile_definitions(OJPH_DISABLE_SSE2)
+  endif()
+  if(OJPH_DISABLE_SSSE3)
+    add_compile_definitions(OJPH_DISABLE_SSSE3)
+  endif()
+  if(OJPH_DISABLE_SSE4)
+    add_compile_definitions(OJPH_DISABLE_SSE4)
+  endif()
+  if(OJPH_DISABLE_AVX)
+    add_compile_definitions(OJPH_DISABLE_AVX)
+  endif()
+  if(OJPH_DISABLE_AVX2)
+    add_compile_definitions(OJPH_DISABLE_AVX2)
+  endif()
+  if(OJPH_DISABLE_AVX512)
+    add_compile_definitions(OJPH_DISABLE_AVX512)
+  endif()
+  if(OJPH_DISABLE_NEON)
+    add_compile_definitions(OJPH_DISABLE_NEON)
+  endif()
 endif()
 
-if (MSVC)
-  set_source_files_properties(src/core/codestream/ojph_codestream_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX")
-  set_source_files_properties(src/core/codestream/ojph_codestream_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
-  set_source_files_properties(src/core/coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX512")
-  set_source_files_properties(src/core/transform/ojph_colour_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX")
-  set_source_files_properties(src/core/transform/ojph_colour_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
-  set_source_files_properties(src/core/transform/ojph_transform_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX")
-  set_source_files_properties(src/core/transform/ojph_transform_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
-else()
-  set_source_files_properties(src/core/codestream/ojph_codestream_avx.cpp PROPERTIES COMPILE_FLAGS -mavx)
-  set_source_files_properties(src/core/codestream/ojph_codestream_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
-  set_source_files_properties(src/core/coding/ojph_block_decoder_ssse3.cpp PROPERTIES COMPILE_FLAGS -mssse3)
-  set_source_files_properties(src/core/coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx512cd)
-  set_source_files_properties(src/core/transform/ojph_colour_avx.cpp PROPERTIES COMPILE_FLAGS -mavx)
-  set_source_files_properties(src/core/transform/ojph_colour_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
-  set_source_files_properties(src/core/transform/ojph_transform_avx.cpp PROPERTIES COMPILE_FLAGS -mavx)
-  set_source_files_properties(src/core/transform/ojph_transform_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+## Build library and applications
+include(GNUInstallDirs)
+add_subdirectory(src/core)
+if (OJPH_BUILD_EXECUTABLES)
+  add_subdirectory(src/apps)
 endif()
 
-############################################################
-if( OJPH_ENABLE_TIFF_SUPPORT )
-
-  if( WIN32 )
-
-    set(TIFF_INCLUDE_DIR "C:\\Program Files\\tiff\\include" CACHE PATH "the directory containing the TIFF headers")
-    set(TIFF_LIBRARY_DEBUG   "C:\\Program Files\\tiff\\lib\\tiffd.lib" CACHE FILEPATH "the path to the TIFF library for debug configurations")
-    set(TIFF_LIBRARY_RELEASE "C:\\Program Files\\tiff\\lib\\tiff.lib"  CACHE FILEPATH "the path to the TIFF library for release configurations")
-    set(TIFFXX_LIBRARY_DEBUG  "C:\\Program Files\\tiff\\lib\\tiffxxd.lib" CACHE FILEPATH "the path to the TIFFXX  library for debug configurations")
-    set(TIFFXX_LIBRARY_RELEASE "C:\\Program Files\\tiff\\lib\\tiffxx.lib" CACHE FILEPATH "the path to the TIFFXX  library for release configurations")
-
-    message( STATUS "WIN32 detected: Setting CMakeCache TIFF values as follows, use CMake-gui Advanced to modify them" )
-    message( STATUS "   TIFF_INCLUDE_DIR : \"${TIFF_INCLUDE_DIR}\"  " )
-    message( STATUS "   TIFF_LIBRARY_DEBUG : \"${TIFF_LIBRARY_DEBUG}\"  " )
-    message( STATUS "   TIFF_LIBRARY_RELEASE : \"${TIFF_LIBRARY_RELEASE}\"  " )
-    message( STATUS "   TIFFXX_LIBRARY_DEBUG : \"${TIFFXX_LIBRARY_DEBUG}\"  " )
-    message( STATUS "   TIFFXX_LIBRARY_RELEASE : \"${TIFFXX_LIBRARY_RELEASE}\"  " )
-
-  endif( WIN32 )
+################################################################################################
+# Install
+################################################################################################
 
-  FIND_PACKAGE( TIFF )
+install(EXPORT openjph-targets
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/openjph
+)
 
-  if( TIFF_FOUND )
-    set(USE_TIFF TRUE CACHE BOOL "Add TIFF support")
-    include_directories( ${TIFF_INCLUDE_DIR} ) 
-    if (MSVC)
-		  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D \"OJPH_ENABLE_TIFF_SUPPORT\"")
-	  else()
-		  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DOJPH_ENABLE_TIFF_SUPPORT")
-	  endif()
-    #include_directories(${CMAKE_BINARY_DIR}/libtiff) # for tiffconf.h on windows
-  endif( TIFF_FOUND )
+include(CMakePackageConfigHelpers)
 
-endif() 
-############################################################
+configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/src/openjph-config.cmake.in
+  "${CMAKE_CURRENT_BINARY_DIR}/openjph-config.cmake"
+  INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/openjph
+)
 
-set(OJPH_EXPAND src/apps/ojph_expand/ojph_expand.cpp src/apps/others/ojph_img_io.cpp)
-set(OJPH_COMPRESS src/apps/ojph_compress/ojph_compress.cpp src/apps/others/ojph_img_io.cpp)
-set(OJPH_IMG_IO_SSE41 src/apps/others/ojph_img_io_sse41.cpp)
-set(OJPH_IMG_IO_AVX2 src/apps/others/ojph_img_io_avx2.cpp)
+write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/openjph-config-version.cmake
+                                 COMPATIBILITY SameMinorVersion)
 
-if(NOT OJPH_DISABLE_INTEL_SIMD)
-  list(APPEND OJPH_EXPAND ${OJPH_IMG_IO_SSE41})
-  list(APPEND OJPH_EXPAND ${OJPH_IMG_IO_AVX2})
-  list(APPEND OJPH_COMPRESS ${OJPH_IMG_IO_SSE41})
-  list(APPEND OJPH_COMPRESS ${OJPH_IMG_IO_AVX2})
-endif()
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/openjph-config.cmake
+              ${CMAKE_CURRENT_BINARY_DIR}/openjph-config-version.cmake
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/openjph
+)
 
-if(OJPH_BUILD_EXECUTABLES)
-  add_executable(ojph_expand ${OJPH_EXPAND})
-  add_executable(ojph_compress ${OJPH_COMPRESS})
+if(IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}")
+  set(PKG_CONFIG_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}")
+else()
+  set(PKG_CONFIG_INCLUDEDIR "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
 endif()
 
-if (MSVC)
-  set_source_files_properties(src/apps/others/ojph_img_io_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
+if(IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}")
+  set(PKG_CONFIG_LIBDIR "${CMAKE_INSTALL_LIBDIR}")
 else()
-  set_source_files_properties(src/apps/others/ojph_img_io_sse41.cpp PROPERTIES COMPILE_FLAGS -msse4.1)
-  set_source_files_properties(src/apps/others/ojph_img_io_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+  set(PKG_CONFIG_LIBDIR "\${prefix}/${CMAKE_INSTALL_LIBDIR}")
 endif()
 
-if(OJPH_BUILD_EXECUTABLES)
-  if( USE_TIFF )
-    target_link_libraries(ojph_expand PUBLIC openjph ${TIFF_LIBRARIES})
-    target_link_libraries(ojph_compress PUBLIC openjph ${TIFF_LIBRARIES})
-  else()
-    target_link_libraries(ojph_expand PUBLIC openjph)
-    target_link_libraries(ojph_compress PUBLIC openjph)
-  endif()
-endif()
+configure_file(
+  "${CMAKE_CURRENT_SOURCE_DIR}/src/openjph.pc.in"
+  "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc"  # this project's build dir, also under add_subdirectory()
+  @ONLY
+)
 
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc"
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig
+)
 
 ################################################################################################
-# Install
+# Testing and fuzzing (OJPH_BUILD_TESTS)
 ################################################################################################
 
-if(OJPH_BUILD_EXECUTABLES)
-  install(TARGETS ojph_expand
-    DESTINATION bin)
-
-  install(TARGETS ojph_compress
-          DESTINATION bin)
+if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND OJPH_BUILD_TESTS)
+  enable_testing()
+  add_subdirectory(tests)
 endif()
 
-include(GNUInstallDirs)
-install(TARGETS openjph LIBRARY
-  DESTINATION ${CMAKE_INSTALL_LIBDIR})
-
-install (DIRECTORY src/core/common/
-  DESTINATION include/openjph
-  FILES_MATCHING
-  PATTERN "*.h")
-
-install(FILES "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/${PROJECT_NAME}.pc"
-  DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
-
 ################################################################################################
-# Testing (OJPH_BUILD_TESTS)
+# Testing and fuzzing (OJPH_BUILD_FUZZER)
 ################################################################################################
 
-if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND OJPH_BUILD_TESTS)
-  enable_testing()
-  add_subdirectory(tests)
+if(OJPH_BUILD_FUZZER)
+  add_subdirectory(fuzzing)
 endif()
 
diff --git a/README.md b/README.md
index b07f568c..19811cac 100644
--- a/README.md
+++ b/README.md
@@ -4,131 +4,21 @@
 
 Open source implementation of High-throughput JPEG2000 (HTJ2K), also known as JPH, JPEG2000 Part 15, ISO/IEC 15444-15, and ITU-T T.814. Here, we are interested in implementing the HTJ2K only, supporting features that are defined in JPEG2000 Part 1 (for example, for wavelet transform, only reversible 5/3 and irreversible 9/7 are supported).
 
-The interested reader is referred to the [short HTJ2K white paper](http://ds.jpeg.org/whitepapers/jpeg-htj2k-whitepaper.pdf), or the [extended HTJ2K white paper](https://htj2k.com/wp-content/uploads/white-paper.pdf) for more details on HTJ2K. [This](https://kakadusoftware.com/wp-content/uploads/2019/09/icip2019.pdf) paper explores the attainable performance on CPU, and [this](https://kakadusoftware.com/wp-content/uploads/2019/09/ICIP2019_GPU.pdf) paper for decoding on a GPU.
+The interested reader is referred to the [short HTJ2K white paper](http://ds.jpeg.org/whitepapers/jpeg-htj2k-whitepaper.pdf), or the [extended HTJ2K white paper](https://htj2k.com/wp-content/uploads/white-paper.pdf) for more details on HTJ2K. [This](https://kakadusoftware.com/wp-content/uploads/icip2019.pdf) paper explores the attainable performance on CPU, while [this](https://kakadusoftware.com/wp-content/uploads/ICIP2019_GPU.pdf) and [this](http://hdl.handle.net/1959.4/unsworks_75139) explore performance on the GPU.
 
-# Status #
-
-The code is written in C++; the color and wavelet transform steps can employ SIMD instructions on Intel platforms.  It conceivable that at some point in the future, SIMD instructions are employed to improve performance of the block (de)coder, and/or for platforms other than Intel.  As it stands, on Intel Skylake i7-6700, encoding 4K 4:4:4 HDR images losslessly takes around 0.5s, and decoding takes around 0.34s; for lossy compression, performance depends on the quantisation step size (qstep), but for a high-quality image at a bitrate of around 3 bits/pixel, encoding takes around 0.27s and decoding takes 0.22s.
-
-As it stands, the OpenJPH library needs documentation. The provided encoder ojph\_compress only generates HTJ2K codestreams, with the extension j2c; the generated files lack the .jph header.  Adding the .jph header is of little urgency, as the codestream contains all needed information to properly decode an image.  The .jph header will be added at a future point in time.  The provided decoder ojph\_expand decodes .jph files, by ignoring the .jph header if it is present.
-
-The provided command line tools ojph\_compress and ojph\_expand accepts and generated .pgm, .ppm, and .yuv. See the usage examples below.
-
-# Web-based Demos #
-
-The associate site [openjph.org](https://openjph.org) serves as a blog.  It currently host the [javascript](https://openjph.org/javascript/demo.html) demo of the decoder; the webpage demonstrates that the library can be compiled to javascript, and can run inside a web-browser.  Any browser supporting webassembly can be used to view this webpage; examples include Firefox, Chrome, Safari, and Edge, on a desktop, mobile, or tablet.
-
-Another project of interest is the [openjphjs](https://github.com/chafey/openjphjs) project, developed by [Chris](https://github.com/chafey).  You can see [there](https://chafey.github.io/openjphjs/test/browser/index.html) a nice online demonstration of javascript-based HTJ2K encoding/decoding, with a wealth of features and user-selectable options.
-
-# Compiling #
-
-The code employs the *cmake* tool to generate a variety of build environments.  A visual studio code container is included for building using
-the visual studio code remote containers add in (highly recommended)
-
-## For Linux ##
-
-    cd build
-    cmake -DCMAKE_BUILD_TYPE=Release  ../
-    make
-
-The generated library and executables will be in the bin folder.
-
-## For Windows ##
-
-    cd build
-    cmake ../ -G "Visual Studio 14 2015 Win64"
-
-cmake support other visual studio versions.  This command generates a solution in the build folder, which can be build using visual studio.
-
-## For macOS ##
-
-You can use the "For Linux" approach above.  Alternatively, you can use the Xcode project in src/apps/apps.xcodeproj, which I use.  Another approach is to use cmake to generate an xcode project, in the build folder, using
-
-    cd build
-    cmake ../ -G Xcode
-    make
-
-The generated library and executables will be in the bin folder.
-
-# Compiling to javascript/wasm #
-
-The library can now be compiled to javascript/wasm.  For this purpose, a small wrapper file (ojph_wrapper.cpp) has been written to interface between javascript and C++; the wrapper currently supports decoding only.  A small demo page demonstrating the script can be accessed [here](https://openjph.org/javascript/demo.html).
-
-Compilation needs the [emscripten](https://emscripten.org/) tools. One way of using these tools is to install them on your machine, and activate them using
-
-    source emsdk_env.sh
-  
-before compilation.  Alternatively, if you are a docker user, the you can launch a docker session using script provided at ```subprojects/js/emscripten-docker.sh```; this script will download a third-party docker image that has the emscripten tools integrated in it -- Thanks to [Chris](https://github.com/chafey) for the suggesting and providing these tools.  
-
-The javascript decoder can be compiled using
-
-    cd subprojects/js/build
-    emcmake cmake ..
-    emmake make
-
-The compilation creates libopenjph.js and libopenjph.wasm in subprojects/js/html folder.  That html folder also has the demo webpage index.html and a compressed image test.j2c which the script in index.html decodes.  To run the demo webpage on your machine, you need a webserver running on the machine -- Due to security reasons, javascript engines running in a browser cannot access local files on the machine.  You can use the ```emrun``` command, provided with the emscripten
-tools, by issuing the command
-
-    emrun index.html
-
-from inside the html folder; the default port is 6931.
-Alternatively, a simple python webserver can be run using
-
-    python -m http.server 8000
-  
-also from inside the html folder.  Here, 8000 is the port number at which the webserver will be listening.  The webpage can then be accessed by open localhost:8000 in you browser.   Any browser supporting webassembly can be used to view this webpage; examples include Firefox, Chrome, Safari, and Edge, on a desktop, mobile, or tablet.
-
-# Visual Studio Code Remote Containers #
-
-Visual Studio Code Remote Containers are now available with OpenJPH.  These scripts/configuration files are provided by [Chris](https://github.com/chafey) -- Thank you Chris, and I must say I am not familiar with them.
-The scripts, in the ```.devcontainer``` folder, will build a docker image that can be used with visual studio code as a development environment.
-
-# Compiling for ARM and other platforms #
-
-To compile for platforms where x86_64 SIMD instructions are not supported, such as on ARM, we need to disable SIMD instructions; this can be achieved using
-
-    cd build
-    cmake -DCMAKE_BUILD_TYPE=Release -DOJPH_DISABLE_INTEL_SIMD=ON ../
-    make
-
-As I do not have an ARM board, I tested this using QEMU for aarch64 architecture, targeting a Cortex-A57 CPU. The code worked without issues, but because the ARM platform is emulated, the whole process was slow.
-
-# Compiling and Running in Docker #
-
-## Step 1 - clone repository   
-`https://github.com/aous72/OpenJPH.git`
-
-## Step 2 - build docker image  
-`cd OpenJPH`   
-`docker build --rm -f Dockerfile -t openjph:latest .`
-
-## Step 3 - run docker image
-
-### in isolated container   
-`docker run -it --rm openjph:latest`
-
-### mapping /usr/src/openjph/build directory in the container to local windows c:\temp
-`docker run -it --rm -v C:\\temp:/usr/src/openjph/build openjph:latest`
-
-# Usage Example #
-
-Here are some usage examples:
-
-    ojph_compress -i input_file.ppm -o output_file.j2c -num_decomps 5 -block_size {64,64} -precincts {128,128},{256,256} -prog_order CPRL -colour_trans true -qstep 0.05
-    ojph_compress -i input_file.yuv -o output_file.j2c -num_decomps 5 -reversible true -dims {3840,2160} -num_comps 3 -signed false -bit_depth 10 -downsamp {1,1},{2,2}
-
-    ojph_expand -i input_file.j2c -o output_file.ppm
-    ojph_expand -i input_file.j2c -o output_file.yuv
-
-**Notes**:
+# The standard #
 
-* Issuing ojph\_compress or ojph\_expand without arguments prints a short usage statement.
-* In reversible compression, quantization is not supported.
-* On Linux and MacOS, but NOT Windows, { and } need to be escaped; i.e, we need to write \\\{ and \\\}.  So, -block\_size {64,64} must be written as -block\_size \\\{64,64\\\}.
-* When the source is a .yuv file, use -downsamp {1,1} for 4:4:4 sources. For 4:2:2 downsampling, specify -downsamp {1,1},{2,1}, and for 4:2:0 subsampling specify -downsamp {1,1},{2,2}. The source must have already been downsampled (i.e., OpenJPH does not downsample the source before compression, but can compress downsampled sources).
-* In Kakadu, pairs of data in command line arguments represent columns,rows. Here, a pair represents x,y information.
+The standard is available free of charge from [ITU website](https://www.itu.int/rec/T-REC-T.814/en). It can also be purchased from the [ISO website](https://www.iso.org/standard/76621.html).
 
-# The standard #
+# Table of Contents #
 
-The standard is available free of charge from [ITU website](https://www.itu.int/rec/T-REC-T.814/en). It can also be purchased from the [ISO website](https://www.iso.org/standard/76621.html). 
+* [Status](./docs/status.md)
+* [Compiling](./docs/compiling.md)
+* [Compiling and Running in Docker](./docs/docker.md)
+* [Usage Example](./docs/usage_examples.md)
+* [Web-based Demos](./docs/web_demos.md)
+* [Doxygen Documentation Style](./docs/doxygen_style.md)
+* [OSS-Fuzzing](./docs/fuzzing.md)
 
+# Repositories #
+[![Packaging status](https://repology.org/badge/vertical-allrepos/openjph.svg)](https://repology.org/project/openjph/versions)
diff --git a/UPSTREAM_PR_DRAFT.md b/UPSTREAM_PR_DRAFT.md
new file mode 100644
index 00000000..174ee6ce
--- /dev/null
+++ b/UPSTREAM_PR_DRAFT.md
@@ -0,0 +1,41 @@
+## Title
+Fix truncated-codestream exception handling in tile header parsing
+
+## Summary
+- Fix exception handling in `tile::parse_tile_header` to catch `std::exception` (and unknown exceptions) instead of only `const char*`.
+- Preserve resilience behavior by logging in resilient mode and only re-raising through `OJPH_ERROR` in non-resilient mode.
+- Add a small standalone demo script (`subprojects/js/standalone/truncated_decode_demo.sh`) to reproduce truncated codestream decoding behavior and confirm graceful process termination.
+
+## Problem
+When decoding truncated codestreams, OpenJPH may throw `std::runtime_error` via `OJPH_ERROR`.  
+`tile::parse_tile_header` currently catches only `const char*`, so `std::runtime_error` bypasses this catch and may terminate the process unexpectedly in downstream WASM/node integrations.
+
+## Root Cause
+Mismatch between thrown exception type and caught exception type:
+- Throw site: `OJPH_ERROR` -> `std::runtime_error`
+- Catch site in `ojph_tile.cpp`: `catch (const char*)`
+
+## Fix Details
+In `src/core/codestream/ojph_tile.cpp`:
+- Replace:
+  - `catch (const char* error)`
+- With:
+  - `catch (const std::exception& error)`
+  - `catch (...)`
+
+## Why This Is Safe
+- No behavior change on successful decodes.
+- In resilient mode, errors continue to be reported via `OJPH_INFO`.
+- In non-resilient mode, failures still propagate as errors through `OJPH_ERROR`.
+
+## Repro / Validation
+1. Start from a valid `.j2c`.
+2. Truncate to first 10 KiB.
+3. Decode with `ojph_expand`.
+4. Verify process exits normally (possibly non-zero), instead of aborting via uncaught exception.
+
+Standalone script:
+- `subprojects/js/standalone/truncated_decode_demo.sh`
+
+## Notes
+Downstream WASM wrappers may also require exception-catching support at link time to avoid runtime aborts when exceptions are thrown across wrapper boundaries.
diff --git a/bin/.gitignore b/bin/.gitignore
deleted file mode 100644
index 7d1d8c52..00000000
--- a/bin/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-.DS_Store
-*
diff --git a/docs/compiling.md b/docs/compiling.md
new file mode 100644
index 00000000..a5490e6f
--- /dev/null
+++ b/docs/compiling.md
@@ -0,0 +1,116 @@
+# Compiling #
+
+The code employs the *cmake* tool to generate a variety of build environments.  A visual studio code container is included for building using
+the visual studio code remote containers add-in (highly recommended).
+
+## For Linux ##
+
+You may need to install libtiff; then,
+
+    cd build
+    cmake -DCMAKE_BUILD_TYPE=Release  ../
+    make 
+    sudo make install
+
+## For Windows ##
+
+Compilation depends on libtiff. A pre-compiled library with all the library features for Windows is not available; I am using [this](https://github.com/aous72/OpenJPH/files/14060335/tiff.zip), but I think I have only the basic library. 
+
+    cd build
+    cmake .. -G "Visual Studio 17 2022" -A x64 -DCMAKE_PREFIX_PATH=<tiff library path>
+
+`cmake` supports other visual studio versions.  This command generates a solution in the build folder, which can be built using visual studio.
+
+To compile from the command line, use
+
+    cmake --build . --config Release
+
+To install either use 
+
+    cmake --install . --prefix <your folder>
+
+to install the library to your desired folder, or, if you want to install to C:\Program Files, you need a PowerShell/CMD running as administrator, and 
+
+    cmake --install .
+
+
+## For macOS ##
+
+You can use the "For Linux" approach above.  Alternatively, you can use the Xcode project in src/apps/apps.xcodeproj, which I use.  Another approach is to use cmake to generate an xcode project, in the build folder, using
+
+    cd build
+    cmake ../ -G Xcode
+    cmake --build . --config Release
+    sudo cmake --install .
+
+I have not tested this in a long time, but you get the picture.
+
+## Building Tests ##
+
+When you invoke `cmake` add `-DOJPH_BUILD_TESTS=ON`, then, for Windows
+
+    cd tests
+    ctest -C Release
+
+For other platforms 
+
+    cd tests
+    ctest
+
+The test setup is a bit finicky, and may sometimes fail for silly reasons.
+
+# Compiling to Node.js #
+
+The library can be compiled to run with Node.js.  Compilation needs the [emscripten](https://emscripten.org/) tools. One way of using these tools is to install them on your machine, and activate them using, assuming running on platform other than Windows,
+
+    source emsdk_env.sh
+  
+before compilation.  Then, run
+
+    emcmake cmake .. && emmake make
+
+By default, compilation now generates only the WebAssembly SIMD build of the library and executables; to generate a non-SIMD build instead, add `-DOJPH_DISABLE_SIMD=ON` to the `cmake` command.
+
+
+# Compiling to javascript/wasm #
+
+The library can now be compiled to javascript/wasm.  For this purpose, a small wrapper file (ojph_wrapper.cpp) has been written to interface between javascript and C++; the wrapper currently supports decoding only.  A small demo page demonstrating the script can be accessed [here](https://openjph.org/javascript/demo.html).
+
+Compilation needs the [emscripten](https://emscripten.org/) tools. One way of using these tools is to install them on your machine, and activate them using
+
+    source emsdk_env.sh
+  
+before compilation.  Alternatively, if you are a docker user, then you can launch a docker session using the script provided at ```subprojects/js/emscripten-docker.sh```; this script will download a third-party docker image that has the emscripten tools integrated in it -- thanks to [Chris](https://github.com/chafey) for suggesting and providing these tools.
+
+The javascript decoder can be compiled using
+
+    cd subprojects/js/build
+    emcmake cmake ..
+    emmake make
+
+The compilation creates libopenjph.js and libopenjph.wasm in subprojects/js/html folder; it also creates libopenjphsimd.js and libopenjphsimd.wasm.  That html folder also has the demo webpage index.html and a compressed image test.j2c which the script in index.html decodes. The index.html detects if the browser supports WebAssembly SIMD instructions, and loads the correct library accordingly.  
+
+To run the demo webpage on your machine, you need a webserver running on the machine -- Due to security reasons, javascript engines running in a browser cannot access local files on the machine.  You can use the ```emrun``` command, provided with the emscripten
+tools, by issuing the command
+
+    emrun index.html
+
+from inside the html folder; the default port is 6931.
+Alternatively, a simple python webserver can be run using
+
+    python -m http.server 8000
+  
+also from inside the html folder.  Here, 8000 is the port number at which the webserver will be listening.  The webpage can then be accessed by opening localhost:8000 in your browser.   Any browser supporting webassembly can be used to view this webpage; examples include Firefox, Chrome, Safari, and Edge, on a desktop, mobile, or tablet.
+
+# Visual Studio Code Remote Containers #
+
+Visual Studio Code Remote Containers are now available with OpenJPH.  These scripts/configuration files are provided by [Chris](https://github.com/chafey) -- Thank you Chris, and I must say I am not familiar with them.
+The scripts, in the ```.devcontainer``` folder, will build a docker image that can be used with visual studio code as a development environment.
+
+# Compiling for ARM and other platforms #
+
+Compilation should simply work now.  The simple test code I have passes when run on MacOS ARM on GitHub.
+
+# Disabling SIMD instructions #
+
+The code now employs the architecture-agnostic option `OJPH_DISABLE_SIMD`; when it is left at its default (`OFF`), SIMD instructions are used wherever they are supported.  To disable all SIMD instructions, pass the `-DOJPH_DISABLE_SIMD=ON` option during CMake configuration.  Individual instruction sets can also be disabled; see the options in the main CMakeLists.txt file.
diff --git a/docs/docker.md b/docs/docker.md
new file mode 100644
index 00000000..7e03bfb1
--- /dev/null
+++ b/docs/docker.md
@@ -0,0 +1,16 @@
+# Compiling and Running in Docker #
+
+## Step 1 - clone repository   
+`git clone https://github.com/aous72/OpenJPH.git`
+
+## Step 2 - build docker image  
+`cd OpenJPH`   
+`docker build --rm -f Dockerfile -t openjph:latest .`
+
+## Step 3 - run docker image
+
+### in isolated container   
+`docker run -it --rm openjph:latest`
+
+### mapping /usr/src/openjph/build directory in the container to local windows c:\temp
+`docker run -it --rm -v C:\\temp:/usr/src/openjph/build openjph:latest`
diff --git a/docs/DoxygenStyle.md b/docs/doxygen_style.md
similarity index 86%
rename from docs/DoxygenStyle.md
rename to docs/doxygen_style.md
index d9c27e80..032ae35c 100644
--- a/docs/DoxygenStyle.md
+++ b/docs/doxygen_style.md
@@ -2,7 +2,7 @@
 
 Documentation is still experimental for me, and I might change things down the line.
 
-Here, we describe how to document the source code. This represent so sort of minial set of markers that need to be used. Other markers can be used to enhance the documentation of the code. This serves as a live document that can be updated when needed.
+Here, we describe how to document the source code. This represents some sort of minimal set of markers that need to be used. Other markers can be used to enhance the documentation of the code. This serves as a live document that can be updated when needed.
 I am preferencing `@` over `\`.
 
 ## Comment block
diff --git a/docs/fuzzing.md b/docs/fuzzing.md
new file mode 100644
index 00000000..86855ad0
--- /dev/null
+++ b/docs/fuzzing.md
@@ -0,0 +1,11 @@
+# Fuzzer Target #
+
+Fuzzer targets intended for use with [oss-fuzz](https://oss-fuzz.com/) can be built using the `OJPH_BUILD_FUZZER` build option.
+
+The targets can be run locally as follows:
+
+```sh
+cd build
+cmake .. -DOJPH_BUILD_FUZZER=ON && cmake --build .
+./fuzzing/ojph_expand_fuzz_target <test case>
+```
diff --git a/docs/status.md b/docs/status.md
new file mode 100644
index 00000000..a6fe07a0
--- /dev/null
+++ b/docs/status.md
@@ -0,0 +1,9 @@
+# Status #
+
+The code is written in C++; the color and wavelet transform steps can employ SIMD instructions on Intel platforms.  SIMD instructions are also available for the block decoder (SSE3) and for the block encoder (AVX512). Other parts of the library may include SIMD in the future, for Intel and ARM; existing implementations can also be improved as there are still decent performance improvements on the table. SIMD instructions are also employed for WebAssembly (Emscripten-based), which is now widely supported in most browsers.
+
+The encoder supports lossless and quantization-based lossy encoding.  There is currently no implementation for rate-control-based encoding.
+
+As it stands, the OpenJPH library needs documentation. The provided encoder ojph\_compress only generates HTJ2K codestreams, with the extension j2c; the generated files lack the .jph header.  Adding the .jph header is of little urgency, as the codestream contains all needed information to properly decode an image.  The .jph header will be added at a future point in time.  The provided decoder ojph\_expand decodes .jph files, by ignoring the .jph header if it is present.
+
+The provided command line tools ojph\_compress and ojph\_expand accept and generate .pgm, .ppm, .yuv, .raw, and .dpx files. See the [usage examples](usage_examples.md).
\ No newline at end of file
diff --git a/docs/usage_examples.md b/docs/usage_examples.md
new file mode 100644
index 00000000..1693baac
--- /dev/null
+++ b/docs/usage_examples.md
@@ -0,0 +1,20 @@
+# Usage Example #
+
+Here are some usage examples:
+
+    ojph_compress -i input_file.ppm -o output_file.j2c -num_decomps 5 -block_size {64,64} -precincts {128,128},{256,256} -prog_order CPRL -colour_trans true -qstep 0.05
+    
+    ojph_compress -i input_file.yuv -o output_file.j2c -num_decomps 5 -reversible true -dims {3840,2160} -num_comps 3 -signed false -bit_depth 10 -downsamp {1,1},{2,2}
+
+    ojph_expand -i input_file.j2c -o output_file.ppm
+    ojph_expand -i input_file.j2c -o output_file.yuv
+
+**Notes**:
+
+* Issuing ojph\_compress or ojph\_expand without arguments prints a short usage statement.
+* In reversible compression, quantization is not supported.
+* On Linux and MacOS, but NOT Windows, { and } need to be escaped; i.e., we need to write \\\{ and \\\}.  So, -block\_size {64,64} must be written as -block\_size \\\{64,64\\\}.
+* When the source is a .yuv file, use -downsamp {1,1} for 4:4:4 sources. For 4:2:2 downsampling, specify -downsamp {1,1},{2,1}, and for 4:2:0 subsampling specify -downsamp {1,1},{2,2}. The source must have already been downsampled (i.e., OpenJPH does not downsample the source before compression, but can compress downsampled sources).
+* In Kakadu, pairs of data in command line arguments represent columns,rows. Here, a pair represents x,y information.
+* I came to realize (see https://github.com/aous72/OpenJPH/issues/187) that there is an issue with files with the `.raw` extension.  Kakadu and OpenJPEG use `.raw` for big-endian data and `.rawl` for little-endian data -- this is only meaningful for data samples that are more than 1 byte.  OpenJPH uses `.raw` for little-endian and there is no support for big-endian.  I need to transition to the convention adopted by Kakadu and OpenJPEG; the plan is to support `.rawl` first, and to warn that `.raw` is currently little-endian but will move to big-endian.  Then, at a future point, the warning for `.raw` becomes that it is for big-endian. Then after a while this warning can be removed.
+
diff --git a/docs/web_demos.md b/docs/web_demos.md
new file mode 100644
index 00000000..f46590b9
--- /dev/null
+++ b/docs/web_demos.md
@@ -0,0 +1,5 @@
+# Web-based Demos #
+
+The associated site [openjph.org](https://openjph.org) serves as a blog.  It currently hosts the [javascript](https://openjph.org/javascript/demo.html) demo of the decoder; the webpage demonstrates that the library can be compiled to javascript, and can run inside a web-browser.  Any browser supporting webassembly can be used to view this webpage; examples include Firefox, Chrome, Safari, and Edge, on a desktop, mobile, or tablet.
+
+Another project of interest is the [openjphjs](https://github.com/chafey/openjphjs) project, developed by [Chris](https://github.com/chafey).  You can see [there](https://chafey.github.io/openjphjs/test/browser/index.html) a nice online demonstration of javascript-based HTJ2K encoding/decoding, with a wealth of features and user-selectable options.
diff --git a/fuzzing/CMakeLists.txt b/fuzzing/CMakeLists.txt
new file mode 100644
index 00000000..6a29ec22
--- /dev/null
+++ b/fuzzing/CMakeLists.txt
@@ -0,0 +1,14 @@
+# detect whether we are within the oss fuzz environment (LIB_FUZZING_ENGINE is set there)
+if(DEFINED ENV{LIB_FUZZING_ENGINE})
+  # if yes, the targets declared below link against the fuzzing engine, which supplies main()
+  link_libraries($ENV{LIB_FUZZING_ENGINE})
+else()
+  # if not, define OJPH_FUZZ_TARGET_MAIN; the target sources are expected to provide main() under it
+  add_compile_definitions(OJPH_FUZZ_TARGET_MAIN)
+endif()
+
+add_executable(ojph_expand_fuzz_target fuzz_targets/ojph_expand_fuzz_target.cpp)
+target_link_libraries(ojph_expand_fuzz_target PRIVATE openjph)
+
+add_executable(ojph_compress_fuzz_target fuzz_targets/ojph_compress_fuzz_target.cpp)
+target_link_libraries(ojph_compress_fuzz_target PRIVATE openjph)
diff --git a/fuzzing/fuzz_targets/ojph_compress_fuzz_target.cpp b/fuzzing/fuzz_targets/ojph_compress_fuzz_target.cpp
new file mode 100644
index 00000000..8deefc56
--- /dev/null
+++ b/fuzzing/fuzz_targets/ojph_compress_fuzz_target.cpp
@@ -0,0 +1,131 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2019, Aous Naman
+// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2019, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_compress_fuzz_target.cpp
+// Fuzz target for the HTJ2K encoding (compression) path.
+//***************************************************************************/
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+
+#include "ojph_mem.h"
+#include "ojph_file.h"
+#include "ojph_codestream.h"
+#include "ojph_params.h"
+
+// Input layout (4 control bytes + pixel data):
+//   byte 0: [6:0] width-1  (1..128)
+//   byte 1: [6:0] height-1 (1..128)
+//   byte 2: [1:0] num_components-1 (1..4)
+//           [3:2] bit_depth selector (8,10,12,16)
+//           [4]   is_signed
+//           [5]   reversible
+//           [6]   color_transform
+//   byte 3: [2:0] num_decompositions (0..5, clamped)
+//           [3]   planar
+//   bytes 4+: pixel data (each byte becomes one sample)
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size)
+{
+  if (Size < 5)
+    return 0;
+
+  ojph::ui32 width     = (Data[0] & 0x7F) + 1;
+  ojph::ui32 height    = (Data[1] & 0x7F) + 1;
+  ojph::ui32 num_comps = (Data[2] & 0x03) + 1;
+  ojph::ui32 bit_depth = (ojph::ui32[]){8, 10, 12, 16}[(Data[2] >> 2) & 0x03];
+  bool is_signed       = (Data[2] >> 4) & 1;
+  bool reversible      = (Data[2] >> 5) & 1;
+  bool color_transform = (Data[2] >> 6) & 1;
+  ojph::ui32 num_decomps = Data[3] & 0x07;
+  bool planar          = (Data[3] >> 3) & 1;
+
+  if (num_decomps > 5) num_decomps = 5;
+  if (num_comps < 3)   color_transform = false;
+  if (color_transform) planar = false;
+
+  const uint8_t *pixels = Data + 4;
+  size_t pixels_len = Size - 4;
+  size_t pix_idx = 0;
+
+  try
+  {
+    ojph::codestream cs;
+
+    ojph::param_siz siz = cs.access_siz();
+    siz.set_image_extent(ojph::point(width, height));
+    siz.set_num_components(num_comps);
+    for (ojph::ui32 c = 0; c < num_comps; ++c)
+      siz.set_component(c, ojph::point(1, 1), bit_depth, is_signed);
+
+    ojph::param_cod cod = cs.access_cod();
+    cod.set_num_decomposition(num_decomps);
+    cod.set_color_transform(color_transform);
+    cod.set_reversible(reversible);
+
+    if (!reversible)
+      cs.access_qcd().set_irrev_quant(0.0005f);
+
+    cs.set_planar(planar);
+
+    ojph::mem_outfile outfile;
+    outfile.open();
+    cs.write_headers(&outfile);
+
+    // Total rows to push: planar processes each component fully,
+    // interleaved processes one row from all components at a time.
+    ojph::ui32 total_rows = num_comps * height;
+    ojph::ui32 next_comp;
+    ojph::line_buf *line = cs.exchange(NULL, next_comp);
+
+    for (ojph::ui32 r = 0; r < total_rows; ++r)
+    {
+      ojph::si32 *dp = line->i32;
+      for (ojph::ui32 x = 0; x < width; ++x)
+      {
+        // Use fuzz bytes as sample values, wrapping around as needed
+        ojph::si32 val = (ojph::si32)pixels[pix_idx % pixels_len];
+        pix_idx++;
+        dp[x] = is_signed ? val - 128 : val;
+      }
+      line = cs.exchange(line, next_comp);
+    }
+
+    cs.flush();
+    cs.close();
+  }
+  catch (const std::exception &)
+  {
+  }
+  return 0;
+}
diff --git a/fuzzing/fuzz_targets/ojph_expand_fuzz_target.cpp b/fuzzing/fuzz_targets/ojph_expand_fuzz_target.cpp
new file mode 100644
index 00000000..48514bff
--- /dev/null
+++ b/fuzzing/fuzz_targets/ojph_expand_fuzz_target.cpp
@@ -0,0 +1,218 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2019, Aous Naman 
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_expand_fuzz_target.cpp
+// Author: Pierre-Anthony Lemieux
+// Date: 17 February 2026
+//***************************************************************************/
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+#include <vector>
+
+#include <ojph_arch.h>
+#include <ojph_file.h>
+#include <ojph_params.h>
+#include <ojph_mem.h>
+#include <ojph_codestream.h>
+#include <ojph_message.h>
+#include <exception>
+
+// Fuzzer entry point for the decoder.
+//
+// The first 2 bytes of Data select decoder options, the remainder is
+// handed to the decoder as the codestream.  Inputs that would require
+// excessive work are rejected early.  Exceptions derived from
+// std::exception are swallowed.  Always returns 0.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size)
+{
+  // The first 2 bytes are used to control decoder options:
+  //   byte 0 bit 1: force planar mode
+  //   byte 0 bit 2: force interleaved mode
+  //   byte 1: number of resolutions to skip (0-7)
+  if (Size < 3)
+    return 0;
+
+  uint8_t opts = Data[0];
+  uint8_t skip_res = Data[1] & 0x07;
+  Data += 2;
+  Size -= 2;
+
+  bool force_planar = (opts & 0x02) != 0;
+  bool force_interleaved = (opts & 0x04) != 0;
+
+  try
+  {
+    ojph::mem_infile infile;
+    infile.open(reinterpret_cast<const ojph::ui8 *>(Data), Size);
+
+    ojph::codestream cs;
+
+    // Always enable resilience: all fuzzer inputs are untrusted/mutated,
+    // so the decoder must use its error-recovery path.
+    cs.enable_resilience();
+
+    cs.read_headers(&infile);
+
+    // Guard against inputs that cause excessive decoding work.
+    {
+      ojph::param_siz siz = cs.access_siz();
+      ojph::point extent = siz.get_image_extent();
+      ojph::point offset = siz.get_image_offset();
+      ojph::ui64 w = extent.x - offset.x;
+      ojph::ui64 h = extent.y - offset.y;
+      ojph::ui64 num_comps = siz.get_num_components();
+      // Bound each dimension before multiplying: the product of three
+      // 32-bit quantities can wrap around 64 bits (for example
+      // 2^31 * 2^31 * 4 == 0), which would let a huge image pass the test.
+      if (w > 65536 || h > 65536 || w * h * num_comps > 65536)
+      {
+        cs.close();
+        return 0;
+      }
+
+      ojph::param_cod cod = cs.access_cod();
+      if (cod.get_num_decompositions() > 5)
+      {
+        cs.close();
+        return 0;
+      }
+
+      // Large precincts cause huge internal buffers and very expensive
+      // per-row wavelet transforms even for small images.
+      for (ojph::ui32 lev = 0; lev <= cod.get_num_decompositions(); ++lev)
+      {
+        ojph::size psiz = cod.get_precinct_size(lev);
+        if (psiz.w > 256 || psiz.h > 256)
+        {
+          cs.close();
+          return 0;
+        }
+      }
+    }
+
+    if (skip_res > 0)
+      cs.restrict_input_resolution(skip_res, skip_res);
+
+    // when both option bits are set, planar takes precedence
+    if (force_planar)
+      cs.set_planar(true);
+    else if (force_interleaved)
+      cs.set_planar(false);
+
+    cs.create();
+
+    ojph::param_siz siz = cs.access_siz();
+
+    // Second guard: cap reconstructed dimensions after create().
+    {
+      ojph::ui64 total_recon = 0;
+      for (ojph::ui32 c = 0; c < siz.get_num_components(); ++c)
+        total_recon += (ojph::ui64)siz.get_recon_width(c)
+                     * (ojph::ui64)siz.get_recon_height(c);
+      if (total_recon > 65536)
+      {
+        cs.close();
+        return 0;
+      }
+    }
+
+    // Time budget: abort if decoding takes too long.
+    // The clock is sampled only once every 64 pulls.
+    struct timespec start_ts;
+    clock_gettime(CLOCK_MONOTONIC, &start_ts);
+    ojph::ui32 pull_count = 0;
+    const ojph::ui32 MAX_SECONDS = 10;
+    bool timed_out = false;
+
+    if (cs.is_planar())
+    {
+      // planar: all lines of one component, then the next component
+      for (ojph::ui32 c = 0; c < siz.get_num_components() && !timed_out; ++c)
+      {
+        ojph::ui32 height = siz.get_recon_height(c);
+        for (ojph::ui32 i = height; i > 0 && !timed_out; --i)
+        {
+          ojph::ui32 comp_num;
+          cs.pull(comp_num);
+          if (++pull_count % 64 == 0)
+          {
+            struct timespec now;
+            clock_gettime(CLOCK_MONOTONIC, &now);
+            if ((ojph::ui32)(now.tv_sec - start_ts.tv_sec) >= MAX_SECONDS)
+              timed_out = true;
+          }
+        }
+      }
+    }
+    else
+    {
+      // interleaved: one line from every component per row; the row count
+      // is taken from component 0
+      ojph::ui32 height = siz.get_recon_height(0);
+      for (ojph::ui32 i = 0; i < height && !timed_out; ++i)
+      {
+        for (ojph::ui32 c = 0; c < siz.get_num_components(); ++c)
+        {
+          ojph::ui32 comp_num;
+          cs.pull(comp_num);
+          if (++pull_count % 64 == 0)
+          {
+            struct timespec now;
+            clock_gettime(CLOCK_MONOTONIC, &now);
+            if ((ojph::ui32)(now.tv_sec - start_ts.tv_sec) >= MAX_SECONDS)
+              timed_out = true;
+          }
+        }
+      }
+    }
+
+    cs.close();
+  }
+  catch (const std::exception &)
+  {
+    // errors reported through exceptions are not findings for the fuzzer
+  }
+
+  return 0;
+}
+
+#ifdef OJPH_FUZZ_TARGET_MAIN
+// Stand-alone driver, compiled only when OJPH_FUZZ_TARGET_MAIN is defined.
+// Reads the file named by argv[1], prepends the 2 control bytes expected
+// by LLVMFuzzerTestOneInput(), and runs the fuzz target once on it.
+// Returns 0 on success and -1 on a usage or I/O error.
+int main(int argc, char **argv) {
+  if (argc != 2) {
+    return -1;
+  }
+  FILE *f = fopen(argv[1], "rb");
+  if (!f) { return -1; }
+  // Find the file size; a failed seek is treated like a failed ftell
+  long len = -1;
+  if (fseek(f, 0, SEEK_END) == 0) {
+    len = ftell(f);
+  }
+  if (len < 0) {
+    fclose(f);  // do not leak the handle on the error path
+    return -1;
+  }
+  rewind(f);
+  // Prepend 2 control bytes (default: no skip)
+  std::vector<uint8_t> buf(static_cast<size_t>(len) + 2);
+  buf[0] = 0;
+  buf[1] = 0;
+  size_t n = fread(buf.data() + 2, 1, static_cast<size_t>(len), f);
+  fclose(f);  // closed before the short-read check so that no path leaks it
+  if(n != static_cast<size_t>(len)) {
+    return -1;
+  }
+  LLVMFuzzerTestOneInput(buf.data(), buf.size());
+  return 0;
+}
+#endif
diff --git a/ojph_version.cmake b/ojph_version.cmake
new file mode 100644
index 00000000..824f02bd
--- /dev/null
+++ b/ojph_version.cmake
@@ -0,0 +1,29 @@
+################################################################################################
+# Generating ojph library version number
+################################################################################################
+
+############################################################
+# Parse version file
+# credit: https://stackoverflow.com/a/47084079
+
+file(READ "${CMAKE_CURRENT_SOURCE_DIR}/src/core/openjph/ojph_version.h" VERFILE)
+if (NOT VERFILE)
+    message(FATAL_ERROR "Failed to parse ojph_version.h!")
+endif()
+# Each component must be a non-empty run of digits; VERFILE is quoted so it is passed as one argument.
+string(REGEX MATCH "OPENJPH_VERSION_MAJOR ([0-9]+)" _ "${VERFILE}")
+set(OPENJPH_VERSION_MAJOR "${CMAKE_MATCH_1}")
+string(REGEX MATCH "OPENJPH_VERSION_MINOR ([0-9]+)" _ "${VERFILE}")
+set(OPENJPH_VERSION_MINOR "${CMAKE_MATCH_1}")
+string(REGEX MATCH "OPENJPH_VERSION_PATCH ([0-9]+)" _ "${VERFILE}")
+set(OPENJPH_VERSION_PATCH "${CMAKE_MATCH_1}")
+
+set(OPENJPH_VERSION "${OPENJPH_VERSION_MAJOR}.${OPENJPH_VERSION_MINOR}.${OPENJPH_VERSION_PATCH}")
+############################################################
+
+message(STATUS "OpenJPH library version: ${OPENJPH_VERSION}")
+
+# A failed match still yields the non-empty string "..", so test the shape of the version string.
+if (NOT OPENJPH_VERSION MATCHES "^[0-9]+\\.[0-9]+\\.[0-9]+$")
+  message(FATAL_ERROR "OPENJPH_VERSION is not set")
+endif()
diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt
new file mode 100644
index 00000000..b5737f62
--- /dev/null
+++ b/src/apps/CMakeLists.txt
@@ -0,0 +1,29 @@
+
+# Optional TIFF support; never enabled for Emscripten builds
+############################################################
+if (OJPH_ENABLE_TIFF_SUPPORT AND NOT EMSCRIPTEN)
+
+  find_package(TIFF)
+
+  if (TIFF_FOUND)
+    set(USE_TIFF TRUE CACHE BOOL "Add TIFF support")
+    add_definitions(-DOJPH_ENABLE_TIFF_SUPPORT)
+  else()
+    # the library was requested but not located: warn and continue without it
+    message(WARNING
+      "TIFF support has been requested but no path to the TIFF library "
+      "has been specified; please configure with -DCMAKE_PREFIX_PATH=<TIFF library directory>, "
+      "or disable TIFF support using -DOJPH_ENABLE_TIFF_SUPPORT=OFF.")
+  endif()
+
+endif()
+############################################################
+
+# Link options applied to the executables below when building with Emscripten
+if (EMSCRIPTEN)
+  add_link_options(
+    -sWASM=1
+    -sASSERTIONS=1
+    -sALLOW_MEMORY_GROWTH=1
+    -sNODERAWFS=1
+    -sENVIRONMENT=node
+    -sEXIT_RUNTIME=1
+    -sEXCEPTION_CATCHING_ALLOWED=['fake'])
+endif()
+
+## Build executables
+add_subdirectory(ojph_expand)
+add_subdirectory(ojph_compress)
+# the streaming decoder is built only on request
+if (OJPH_BUILD_STREAM_EXPAND)
+  add_subdirectory(ojph_stream_expand)
+endif()
\ No newline at end of file
diff --git a/src/apps/common/ojph_img_io.h b/src/apps/common/ojph_img_io.h
index 8e41493d..c18ee76e 100644
--- a/src/apps/common/ojph_img_io.h
+++ b/src/apps/common/ojph_img_io.h
@@ -54,7 +54,7 @@ namespace ojph {
   ////////////////////////////////////////////////////////////////////////////
   // defined elsewhere
   class mem_fixed_allocator;
-  struct line_buf;
+  class line_buf;
 
   ////////////////////////////////////////////////////////////////////////////
   //
@@ -135,7 +135,7 @@ namespace ojph {
 
     ui32 cur_line;
     si64 start_of_data;
-    int planar;
+    bool planar;
     ui32 bit_depth[3];
     bool is_signed[3];
     point subsampling[3];
@@ -446,6 +446,68 @@ namespace ojph {
     size_t buffer_size;
   };
 
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  //
+  //
+  //
+  //
+  ////////////////////////////////////////////////////////////////////////////
+  // Image reader for .pfm files (presumably Portable Float Map -- the
+  // class keeps a float line buffer, a scale and an endianness flag).
+  class pfm_in : public image_in_base
+  {
+  public:
+    // Starts with no file open; p is stored in alloc_p and may be NULL.
+    pfm_in(mem_fixed_allocator *p = NULL)
+    : fh(NULL), fname(NULL), alloc_p(p), temp_buf(NULL),
+      temp_buf_byte_size(0), scale(0.0f), little_endian(true),
+      width(0), height(0), num_comps(0), cur_line(0), start_of_data(0)
+    {
+      for (ui32 c = 0; c < 3; ++c)
+        bit_depth[c] = 32;
+    }
+
+    // Closes the file; temp_buf is freed here only when no allocator
+    // was supplied at construction.
+    virtual ~pfm_in()
+    {
+      close();
+      if (temp_buf != NULL && alloc_p == NULL)
+        free(temp_buf);
+    }
+
+    void open(const char* filename);
+    void finalize_alloc();
+
+    // Copies one bit depth per component from the caller's array;
+    // num_comps must already be non-zero.
+    void configure(ui32* bit_depth)
+    {
+      assert(num_comps != 0);
+      ui32 c = 0;
+      while (c < num_comps)
+      {
+        this->bit_depth[c] = bit_depth[c];
+        ++c;
+      }
+    }
+
+    virtual ui32 read(const line_buf* line, ui32 comp_num);
+
+    // Closes the file if one is open and forgets the file name.
+    void close()
+    {
+      if (fh != NULL)
+      {
+        fclose(fh);
+        fh = NULL;
+      }
+      fname = NULL;
+    }
+
+    // The accessors below assert that a file is open.
+    size get_size() { assert(fh); return size(width, height); }
+    ui32 get_width() { assert(fh); return width; }
+    ui32 get_height() { assert(fh); return height; }
+    ui32 get_num_components() { assert(fh); return num_comps; }
+
+  private:
+    FILE *fh;
+    const char *fname;
+    mem_fixed_allocator *alloc_p;
+    float *temp_buf;
+    size_t temp_buf_byte_size;
+    ui32 bit_depth[3];       // this truncates data to bit_depth in the LSB
+    float scale;
+    bool little_endian;
+    ui32 width, height, num_comps;
+    ui32 cur_line;
+    si64 start_of_data;
+  };
+
+
   ////////////////////////////////////////////////////////////////////////////
   // Accelerators (defined in ojph_img_io_*)
   typedef void (*conversion_fun)(const line_buf *ln0, const line_buf *ln1, 
@@ -559,7 +621,7 @@ namespace ojph {
     ui32 width, height, num_components;
     ui32 bit_depth, bytes_per_sample;
     ui8* buffer;
-    ui32 buffer_size;
+    size_t buffer_size;
     ui32 cur_line, samples_per_line, bytes_per_line;
     conversion_fun converter;
     const line_buf *lptr[3];
@@ -621,7 +683,7 @@ namespace ojph {
     ui32 bit_depth_of_data[4]; 
     ui32 bytes_per_sample;
     ui8* buffer;
-    ui32 buffer_size;
+    size_t buffer_size;
     ui32 cur_line, samples_per_line;
   };
 #endif /* OJPH_ENABLE_TIFF_SUPPORT */
@@ -698,11 +760,60 @@ namespace ojph {
     const char* fname;
     bool is_signed;
     ui32 bit_depth, bytes_per_sample;
-    si32 lower_val, upper_val;
+    si64 lower_val, upper_val;
     ui32 width;
     ui8* buffer;
     ui32 buffer_size;
   };
+
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  //
+  //
+  //
+  //
+  ////////////////////////////////////////////////////////////////////////////
+  // Image writer counterpart of pfm_in for .pfm files (presumably
+  // Portable Float Map); it owns a float line buffer.
+  class pfm_out : public image_out_base
+  {
+  public:
+    // Starts with no file open, no buffer, and a scale of -1.
+    pfm_out()
+    : fh(NULL), fname(NULL), buffer(NULL), buffer_size(0),
+      width(0), height(0), num_components(0), scale(-1.0f),
+      cur_line(0), start_of_data(0)
+    {
+      for (ui32 c = 0; c < 3; ++c)
+        bit_depth[c] = 32;
+    }
+
+    // Closes the file and releases the line buffer, if any.
+    virtual ~pfm_out()
+    {
+      close();
+      if (buffer != NULL)
+        free(buffer);
+    }
+
+    void open(char* filename);
+    void configure(ui32 width, ui32 height, ui32 num_components, 
+                   float scale, ui32* bit_depth);
+    virtual ui32 write(const line_buf* line, ui32 comp_num);
+
+    // Closes the file if one is open and forgets the file name.
+    virtual void close()
+    {
+      if (fh != NULL)
+      {
+        fclose(fh);
+        fh = NULL;
+      }
+      fname = NULL;
+    }
+
+  private:
+    FILE *fh;
+    const char *fname;
+    float* buffer;
+    size_t buffer_size;
+    ui32 width, height, num_components;
+    float scale;
+    ui32 bit_depth[3];
+    ui32 cur_line;
+    si64 start_of_data;
+  };
+
+
 }
 
 #endif // !OJPH_IMG_IO_H
diff --git a/src/apps/common/ojph_sockets.h b/src/apps/common/ojph_sockets.h
new file mode 100644
index 00000000..ba62cd3d
--- /dev/null
+++ b/src/apps/common/ojph_sockets.h
@@ -0,0 +1,236 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2024, Aous Naman
+// Copyright (c) 2024, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2024, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_sockets.h
+// Author: Aous Naman
+// Date: 17 April 2024
+//***************************************************************************/
+
+#ifndef OJPH_SOCKET_H
+#define OJPH_SOCKET_H
+
+#include <string>
+#include "ojph_arch.h"
+
+#ifdef OJPH_OS_WINDOWS
+	#include <winsock2.h>
+	#include <WS2tcpip.h>
+
+  typedef SOCKET ojph_socket;
+  #define OJPH_INVALID_SOCKET (INVALID_SOCKET)
+  #define OJPH_EWOULDBLOCK (WSAEWOULDBLOCK)
+#else
+	#include <sys/types.h>
+	#include <sys/socket.h>
+	#include <netinet/in.h>
+	#include <arpa/inet.h>
+	#include <netdb.h>
+	#include <unistd.h>
+	#include <errno.h>
+  #include <fcntl.h>
+
+  typedef int ojph_socket;
+  #define OJPH_INVALID_SOCKET (-1)
+  #define OJPH_EWOULDBLOCK (EWOULDBLOCK)
+#endif
+
+namespace ojph 
+{
+namespace net
+{
+
+///////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//
+//
+///////////////////////////////////////////////////////////////////////////
+
+//************************************************************************/
+/** @brief A thin wrapper around the platform's native socket handle
+  *
+  *  The wrapper only hides the difference between the Windows and the
+  *  Linux/MacOS socket handle types.
+  *  It holds a single member of type ojph_socket, which is a typedef
+  *  for int on Linux/MacOS and for SOCKET on Windows, and adds
+  *  close() and set_blocking_mode() on top of it.
+  */
+class socket {
+public:
+  /**
+    *  @brief default constructor; the handle starts as
+    *         OJPH_INVALID_SOCKET
+    */
+  socket() : s(OJPH_INVALID_SOCKET) {}
+
+  /**
+    *  @brief constructs the wrapper from a native socket handle
+    *
+    *  @param s the native handle
+    */
+  socket(const ojph_socket& s);
+
+  /**
+    *  @brief Abstracts the socket closing function
+    */
+  void close();
+
+  /**
+    *  @brief Selects blocking or non-blocking operation
+    *
+    *  @param  block set to true to operate in blocking mode
+    *  @return returns true when the operation succeeds
+    */
+  bool set_blocking_mode(bool block);
+
+  /**
+    *  @brief gives access to the internal socket handle
+    *
+    *  @return returns the internal socket handle
+    */
+  ojph_socket intern() { return s; }
+
+private:
+  ojph_socket s;  //!<int for Linux/MacOS and SOCKET for Windows
+};
+
+///////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//
+//
+///////////////////////////////////////////////////////////////////////////
+
+//************************************************************************/
+/** @brief A small wrapper for some Winsock2 functionality
+  * 
+  *  This is useful for windows, as it initializes and destroys 
+  *  WinSock2 library.
+  *  It keeps a count of how many times the constructor is called, 
+  *  reducing the count whenever the destructor is called.  When the 
+  *  count reaches zero, the library is destroyed -- Windows only.
+  * 
+  *  It also allows the creation of a socket, access to the last error 
+  *  in a portable way, and the translation of an error into a text 
+  *  message.
+  */
+class socket_manager {
+public:
+  /**
+    *  @brief default constructor
+    * 
+    *  This function initializes the Winsock2 stack in windows; it 
+    *  also increments the static member that keeps count of how many
+    *  times this object is used.
+    */
+  socket_manager();
+
+  /**
+    *  @brief destructor
+    * 
+    *  This function cleans up the Winsock2 stack in windows when
+    *  the static member that keeps count of how many times this object 
+    *  is used reaches zero.
+    *
+    */
+  ~socket_manager();
+
+  /**
+    *  @brief Abstracts socket creation
+    *
+    *  This function takes the same parameters as the conventional 
+    *  socket() function
+    *
+    *  @param domain the same as in conventional socket() function
+    *  @param type the same as in conventional socket() function
+    *  @param protocol the same as in conventional socket() function
+    *  @return returns an abstraction of socket
+    * 
+    */
+  socket create_socket(int domain, int type, int protocol);
+
+  /**
+    *  @brief Abstracts get last error or errno
+    *
+    *  This function abstracts Windows GetLastError or Linux errno
+    * 
+    *  @return returns a number representing the error
+    *
+    */
+  int get_last_error();
+
+  /**
+    *  @brief Abstracts obtaining a textual message for an errnum
+    *
+    *  This function abstracts obtaining a textual message for an errnum
+    *
+    *  @param errnum the error number
+    *  @return a string holding a textual message for the error number
+    *
+    */
+  std::string get_error_message(int errnum);
+
+  /**
+    *  @brief Abstracts obtaining a textual message for GetLastError/errno
+    *
+    *  This function combines get_error_message() and get_last_error().
+    *  This function effectively calls get_last_error() and uses the 
+    *  returned error number to obtain a string by calling 
+    *  get_error_message(errnum).
+    *
+    *  @return a string holding a textual message for the error number
+    *
+    */
+  std::string get_last_error_message();
+
+  /**
+    *  @brief Abstractly obtains the 32-bit IPv4 address integer
+    *
+    *  This function obtains a 32-bit integer that represents the IPv4 
+    *  address in an abstract way (working both in Windows and Linux).
+    *  This is really an independent function, but it is convenient to 
+    *  put it here.
+    *
+    *  @param addr the socket address from which the IPv4 address is read
+    *  @return returns an integer holding IPv4 address
+    *
+    */
+  static ui32 get_addr(const sockaddr_in& addr);
+
+private:
+  static int ojph_socket_manager_counter;  //!<counts the live uses of this class (see constructor/destructor)
+};
+
+} // !net namespace
+} // !ojph namespace
+
+
+
+#endif // !OJPH_SOCKET_H
\ No newline at end of file
diff --git a/src/apps/common/ojph_threads.h b/src/apps/common/ojph_threads.h
new file mode 100644
index 00000000..c70ffffe
--- /dev/null
+++ b/src/apps/common/ojph_threads.h
@@ -0,0 +1,155 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2024, Aous Naman
+// Copyright (c) 2024, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2024, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_threads.h
+// Author: Aous Naman
+// Date: 22 April 2024
+//***************************************************************************/
+
+#ifndef OJPH_THREADS_H
+#define OJPH_THREADS_H
+
+#include <atomic>
+#include <vector>
+#include <thread>
+#include <mutex>
+#include <deque>
+#include <condition_variable>
+
+namespace ojph
+{
+namespace thds
+{
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//
+//
+///////////////////////////////////////////////////////////////////////////////
+
+/*****************************************************************************/
+/** @brief A base class for tasks queued in the thread_pool
+ *
+ *  Tasks run in the thread_pool must derive from this class and define
+ *  \"execute\".  Derived objects can include their own member variables.
+ *
+ */
+class worker_thread_base
+{
+public:
+  /**
+   *  @brief A virtual destructor is necessary so that derived objects are
+   *  destroyed correctly when deleted through a base-class pointer.
+   */
+  virtual ~worker_thread_base() { }
+
+  /**
+   *  @brief Derived classes must define this function to perform their work
+   */
+  virtual void execute() = 0;
+};
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//
+//
+///////////////////////////////////////////////////////////////////////////////
+
+/*****************************************************************************/
+/**
+ *  @brief Implements a pool of threads, and can queue tasks.
+ *  Tasks are objects derived from worker_thread_base, given to add_task().
+ */
+class thread_pool
+{
+public:
+  /**
+   *  @brief default constructor; clears the stop flag and starts no threads
+   */
+  thread_pool() { stop.store(false, std::memory_order_relaxed); }
+  /**
+   *  @brief destructor (defined out of line)
+   */
+  ~thread_pool();
+
+public:
+  /**
+   *  @brief Initializes the thread pool
+   *
+   *  @param num_threads the number of threads the thread pool holds
+   */
+  void init(size_t num_threads);
+
+  /**
+   *  @brief Adds a task to the thread pool
+   *
+   *  @param task the task to be added; must derive from worker_thread_base
+   */
+  void add_task(worker_thread_base* task);
+
+  /**
+   *  @brief Returns the number of threads in the thread pool
+   *
+   *  @return number of threads in the thread pool
+   */
+  size_t get_num_threads() { return threads.size(); }
+
+private:
+  /**
+   *  @brief A static function to start a thread
+   *
+   *  @param tp a pointer to the thread pool
+   */
+  static void start_thread(thread_pool* tp);
+
+private:
+  std::vector<std::thread> threads;       // size reported by get_num_threads
+  std::deque<worker_thread_base*> tasks;  // presumably pending tasks; confirm
+  std::mutex mutex;                       // presumably guards tasks; confirm
+  std::condition_variable condition;      // presumably signals workers; confirm
+  std::atomic_bool stop;                  // set to false by the constructor
+};
+
+} // !thds namespace 
+} // !ojph namespace
+
+
+
+
+
+
+#endif // !OJPH_THREADS_H
\ No newline at end of file
diff --git a/src/apps/ojph_compress/CMakeLists.txt b/src/apps/ojph_compress/CMakeLists.txt
new file mode 100644
index 00000000..9c1166db
--- /dev/null
+++ b/src/apps/ojph_compress/CMakeLists.txt
@@ -0,0 +1,58 @@
+## building ojph_compress
+#########################
+# Locate the application source and the shared image I/O sources and header.
+file(GLOB OJPH_COMPRESS       "ojph_compress.cpp")
+file(GLOB OJPH_IMG_IO         "../others/ojph_img_io.cpp")
+file(GLOB OJPH_IMG_IO_SSE4    "../others/ojph_img_io_sse41.cpp")
+file(GLOB OJPH_IMG_IO_AVX2    "../others/ojph_img_io_avx2.cpp")
+file(GLOB OJPH_IMG_IO_H       "../common/ojph_img_io.h")
+
+list(APPEND SOURCES ${OJPH_COMPRESS} ${OJPH_IMG_IO} ${OJPH_IMG_IO_H})
+
+source_group("main"        FILES ${OJPH_COMPRESS})
+source_group("others"      FILES ${OJPH_IMG_IO})
+source_group("common"      FILES ${OJPH_IMG_IO_H})
+
+if(EMSCRIPTEN)
+  if (OJPH_ENABLE_WASM_SIMD)
+    list(APPEND SOURCES ${OJPH_IMG_IO_SSE4})
+    source_group("others" FILES ${OJPH_IMG_IO_SSE4})
+    set_source_files_properties(${OJPH_IMG_IO_SSE4} PROPERTIES COMPILE_FLAGS -msse4.1)
+  endif()
+else()
+  if (NOT OJPH_DISABLE_SIMD)
+    if (("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_X86_64") 
+      OR ("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_I386")
+      OR MULTI_GEN_X86_64)
+
+      if (NOT OJPH_DISABLE_SSE4)
+        list(APPEND SOURCES ${OJPH_IMG_IO_SSE4})
+        source_group("others" FILES ${OJPH_IMG_IO_SSE4})
+      endif()
+      if (NOT OJPH_DISABLE_AVX2)
+        list(APPEND SOURCES ${OJPH_IMG_IO_AVX2})
+        source_group("others" FILES ${OJPH_IMG_IO_AVX2})
+      endif()
+
+      # Per-file SIMD flags; with MSVC only the AVX2 file gets an explicit flag
+      if (MSVC)
+        set_source_files_properties(${OJPH_IMG_IO_AVX2} PROPERTIES COMPILE_FLAGS "/arch:AVX2")
+      else()
+        set_source_files_properties(${OJPH_IMG_IO_SSE4} PROPERTIES COMPILE_FLAGS -msse4.1)
+        set_source_files_properties(${OJPH_IMG_IO_AVX2} PROPERTIES COMPILE_FLAGS -mavx2)
+      endif()
+    endif()
+
+    if (("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_ARM") OR MULTI_GEN_ARM64)
+      # Placeholder: no ARM-specific image I/O sources are added here
+    endif()
+
+  endif()
+
+endif()
+
+add_executable(ojph_compress ${SOURCES})
+target_include_directories(ojph_compress PRIVATE ../common)
+target_link_libraries(ojph_compress PRIVATE openjph $<TARGET_NAME_IF_EXISTS:TIFF::TIFF>)
+
+install(TARGETS ojph_compress)
diff --git a/src/apps/ojph_compress/ojph_compress.cpp b/src/apps/ojph_compress/ojph_compress.cpp
index b800224c..c4b66c76 100644
--- a/src/apps/ojph_compress/ojph_compress.cpp
+++ b/src/apps/ojph_compress/ojph_compress.cpp
@@ -526,9 +526,9 @@ int main(int argc, char * argv[]) {
     std::cout <<
     "\nThe following arguments are necessary:\n"
 #ifdef OJPH_ENABLE_TIFF_SUPPORT
-    " -i input file name (either pgm, ppm, tif(f), or raw(yuv))\n"
+    " -i input file name (either pgm, ppm, pfm, tif(f), or raw(yuv))\n"
 #else
-    " -i input file name (either pgm, ppm, or raw(yuv))\n"
+    " -i input file name (either pgm, ppm, pfm, or raw(yuv))\n"
 #endif // !OJPH_ENABLE_TIFF_SUPPORT
     " -o output file name\n\n"
 
@@ -538,29 +538,27 @@ int main(int argc, char * argv[]) {
     "               compression; quantization steps size for all subbands are\n"
     "               derived from this value. {The default value for 8bit\n"
     "               images is 0.0039}\n"
-    " -reversible   (false) for irreversible; this should be false to perform\n"
-    "               lossy compression using the 9/7 wavelet transform;\n" 
-    "               or true to perform reversible compression, where\n"
-    "               the 5/3 wavelet is employed with lossless compression.\n"
-    " -colour_trans (true) this option employs a color transform, to\n"
-    "               transform RGB color images into the YUV domain.\n"
-    "               This option should not be used with YUV images, because\n"
+    " -reversible   <true | false> If this is 'false', an irreversible or\n"
+    "               lossy compression is employed, using the 9/7 wavelet\n"
+    "               transform; if 'true', a reversible compression is\n"
+    "               performed, where the 5/3 wavelet is used.\n"
+    "               Default value is 'false'.\n"
+    " -colour_trans <true | false> This option employs a color transform,\n"
+    "               to transform RGB color images into the YUV domain.\n"
+    "               This option should NOT be used with YUV images, because\n"
     "               they have already been transformed.\n"
     "               If there are three color components that are\n"
-    "               downsampled by the same amount then the color transform\n"
-    "               can be true or false. This option is also available\n"
-    "               when there are more than three colour components,\n"
-    "               where it is applied to the first three colour\n"
-    "               components.\n"
-    "               it has already been applied to convert the original RGB\n"
-    "               or whatever the original format to YUV.\n"
+    "               downsampled by the same amount then this option can be\n"
+    "               'true' or 'false'. This option is also available when\n"
+    "               there are more than three colour components, where it is\n"
+    "               applied to the first three colour components.\n"
     " -prog_order   (RPCL) is the progression order, and can be one of:\n"
-    "               LRCP, RLCP, RPCL, PCRL, CPRL\n"
+    "               LRCP, RLCP, RPCL, PCRL, CPRL.\n"
     " -block_size   {x,y} (64,64) where x and y are the height and width of\n"
     "               a codeblock. In unix-like environment, { and } must be\n"
-    "               proceeded by a ""\\""\n"
+    "               preceded by a ""\\"".\n"
     " -precincts    {x,y},{x,y},...,{x,y} where {x,y} is the precinct size\n"
-    "               starting from the coarest resolution; the last precinct\n"
+    "               starting from the coarsest resolution; the last precinct\n"
     "               is repeated for all finer resolutions\n"
     " -tile_offset  {x,y} tile offset. \n"
     " -tile_size    {x,y} tile width and height. \n"
@@ -568,14 +566,15 @@ int main(int argc, char * argv[]) {
     " -tileparts    (None) employ tilepart divisions at each resolution, \n"
     "               indicated by the letter R, and/or component, indicated \n"
     "               by the letter C. For both, use \"-tileparts RC\".\n"
-    " -tlm_marker   (false) insert a TLM marker, either \"true\" or \"false\"\n"
+    " -tlm_marker   <true | false> if 'true', a TLM marker is inserted.\n"
+    "               Default value is false.\n"
     " -profile      (None) is the profile, the code will check if the \n"
     "               selected options meet the profile.  Currently only \n"
     "               BROADCAST and IMF are supported.  This automatically \n"
     "               sets tlm_marker to true and tileparts to C.\n"
     " -com          (None) if set, inserts a COM marker with the specified\n"
     "               string. If the string has spaces, please use\n"
-    "               double quotes, as in -com \"This is a comment\"\n"
+    "               double quotes, as in -com \"This is a comment\".\n"
     "\n"
 
     "When the input file is a YUV file, these arguments need to be \n"
@@ -588,7 +587,33 @@ int main(int argc, char * argv[]) {
     "            component; for example: 12,10,10\n"
     " -downsamp  {x,y},{x,y},...,{x,y} a list of x,y points, one for each\n"
     "            component; for example {1,1},{2,2},{2,2}\n\n"
-    ;
+    "\n"
+
+    ".pfm files receive special treatment. Currently, lossy compression\n"
+    "with these files is not supported, only lossless. When these files are\n"
+    "used, the NLT segment marker is automatically inserted into the\n"
+    "codestream when needed, as explained shortly. The following arguments\n"
+    "can be useful for this file type.\n"
+    " -signed    a comma-separated list of true or false parameters, one\n"
+    "            for each component; for example: true,false,false.\n"
+    "            If you are sure that all sample values are positive or 0,\n"
+    "            set the corresponding entry to false; otherwise set it to\n"
+    "            true.\n"
+    "            When a component entry is set to true, an NLT segment\n"
+    "            marker segment is inserted into the codestream.\n"
+    "            The NLT segment specifies a non-linear transform that\n"
+    "            changes only negative values, producing better coding\n"
+    "            efficiency.\n"
+    "            The NLT segment marker might be less supported in other\n"
+    "            encoders.\n"
+    " -bit_depth a comma-separated list of bit depth values, one per \n"
+    "            component; for example: 12,10,10.\n"
+    "            Floating value numbers are treated as integers, and they\n"
+    "            are shifted to the right, keeping only the specified\n"
+    "            number of bits. Up to 32 bits (which is the default) are\n"
+    "            supported.\n"
+
+    "\n";
     return -1;
   }
   if (!get_arguments(argc, argv, input_filename, output_filename,
@@ -612,6 +637,7 @@ int main(int argc, char * argv[]) {
     ojph::codestream codestream;
 
     ojph::ppm_in ppm;
+    ojph::pfm_in pfm;
     ojph::yuv_in yuv;
     ojph::raw_in raw;
     ojph::dpx_in dpx;
@@ -737,6 +763,106 @@ int main(int argc, char * argv[]) {
 
         base = &ppm;
       }
+      else if (is_matching(".pfm", v))
+      {
+        pfm.open(input_filename);
+        ojph::param_siz siz = codestream.access_siz();
+        siz.set_image_extent(ojph::point(image_offset.x + pfm.get_width(),
+          image_offset.y + pfm.get_height()));
+        ojph::ui32 num_comps = pfm.get_num_components();
+        assert(num_comps == 1 || num_comps == 3);
+        siz.set_num_components(num_comps);
+
+        if (bit_depth[0] != 0)             // one was set
+          if (num_bit_depths < num_comps)  // but if not enough, repeat
+            for (ojph::ui32 c = num_bit_depths; c < num_comps; ++c)
+              bit_depth[c] = bit_depth[num_bit_depths - 1];
+
+        bool all_the_same = true;
+        if (num_comps == 3)
+          all_the_same = all_the_same 
+            && bit_depth[0] == bit_depth[1] 
+            && bit_depth[1] == bit_depth[2];
+
+        for (ojph::ui32 c = 0; c < num_comps; ++c) {
+          if (bit_depth[c] == 0)
+            bit_depth[c] = 32;
+          siz.set_component(c, ojph::point(1,1), bit_depth[c], true);
+        }
+        pfm.configure(bit_depth);
+
+        siz.set_image_offset(image_offset);
+        siz.set_tile_size(tile_size);
+        siz.set_tile_offset(tile_offset);
+
+        ojph::param_cod cod = codestream.access_cod();
+        cod.set_num_decomposition(num_decompositions);
+        cod.set_block_dims(block_size.w, block_size.h);
+        if (num_precincts != -1)
+          cod.set_precinct_size(num_precincts, precinct_size);
+        cod.set_progression_order(prog_order);
+        if (num_comps == 1)
+        {
+          if (employ_color_transform != -1)
+            OJPH_WARN(0x01000091,
+              "-colour_trans option is not needed and was not used; "
+              "this is because the image has one component only\n");
+        }
+        else
+        {
+          if (employ_color_transform == -1)
+            cod.set_color_transform(true);
+          else
+            cod.set_color_transform(employ_color_transform == 1);
+        }
+        cod.set_reversible(reversible);
+        if (!reversible) {
+          const float min_step = 1.0f / 16384.0f;
+          if (quantization_step == -1.0f)
+            quantization_step = min_step;
+          else
+            quantization_step = ojph_max(quantization_step, min_step);
+          codestream.access_qcd().set_irrev_quant(quantization_step);
+        }
+
+        // Note: Even if only ALL_COMPS is set to 
+        // OJPH_NLT_BINARY_COMPLEMENT_NLT, the library can decide if
+        // one ALL_COMPS NLT marker segment is needed, or multiple 
+        // per component NLT marker segments are needed (when the components
+        // have different bit depths or signedness).
+        // Of course for .pfm images all components should have the same
+        // bit depth and signedness.
+        ojph::param_nlt nlt = codestream.access_nlt();
+        if (all_the_same)
+          nlt.set_nonlinear_transform(ojph::param_nlt::ALL_COMPS, 
+            ojph::param_nlt::OJPH_NLT_BINARY_COMPLEMENT_NLT);
+        else
+          for (ojph::ui32 c = 0; c < num_comps; ++c)
+            nlt.set_nonlinear_transform(c, 
+              ojph::param_nlt::OJPH_NLT_BINARY_COMPLEMENT_NLT);
+
+        codestream.set_planar(false);
+        if (profile_string[0] != '\0')
+          codestream.set_profile(profile_string);
+        codestream.set_tilepart_divisions(tileparts_at_resolutions, 
+                                          tileparts_at_components);
+        codestream.request_tlm_marker(tlm_marker);          
+
+        if (dims.w != 0 || dims.h != 0)
+          OJPH_WARN(0x01000092,
+            "-dims option is not needed and was not used\n");
+        if (num_components != 0)
+          OJPH_WARN(0x01000093,
+            "-num_comps is not needed and was not used\n");
+        if (is_signed[0] != -1)
+          OJPH_WARN(0x01000094,
+            "-signed is not needed and was not used\n");            
+        if (comp_downsampling[0].x != 0 || comp_downsampling[0].y != 0)
+          OJPH_WARN(0x01000095,
+            "-downsamp is not needed and was not used\n");
+
+        base = &pfm;
+      }
 #ifdef OJPH_ENABLE_TIFF_SUPPORT
       else if (is_matching(".tif", v) || is_matching(".tiff", v))
       {
@@ -900,7 +1026,7 @@ int main(int argc, char * argv[]) {
         cod.set_progression_order(prog_order);
         if (employ_color_transform != -1)
           OJPH_ERROR(0x01000086,
-            "color transform is not meaningless since .raw files are single "
+            "color transform is meaningless since .raw files are single "
             "component files");
         cod.set_reversible(reversible);
         if (!reversible && quantization_step != -1.0f)
@@ -917,55 +1043,55 @@ int main(int argc, char * argv[]) {
       }
       else if (is_matching(".dpx", v))
       {
-      dpx.open(input_filename);
-      ojph::param_siz siz = codestream.access_siz();
-      siz.set_image_extent(ojph::point(image_offset.x + dpx.get_size().w,
-        image_offset.y + dpx.get_size().h));
-      ojph::ui32 num_comps = dpx.get_num_components();
-      siz.set_num_components(num_comps);
-      //if (num_bit_depths > 0)
-      //  dpx.set_bit_depth(num_bit_depths, bit_depth);
-      for (ojph::ui32 c = 0; c < num_comps; ++c)
-        siz.set_component(c, dpx.get_comp_subsampling(c),
-          dpx.get_bit_depth(c), dpx.get_is_signed(c));
-      siz.set_image_offset(image_offset);
-      siz.set_tile_size(tile_size);
-      siz.set_tile_offset(tile_offset);
-
-      ojph::param_cod cod = codestream.access_cod();
-      cod.set_num_decomposition(num_decompositions);
-      cod.set_block_dims(block_size.w, block_size.h);
-      if (num_precincts != -1)
-        cod.set_precinct_size(num_precincts, precinct_size);
-      cod.set_progression_order(prog_order);
-      if (employ_color_transform == -1 && num_comps >= 3)
-        cod.set_color_transform(true);
-      else
-        cod.set_color_transform(employ_color_transform == 1);
-      cod.set_reversible(reversible);
-      if (!reversible && quantization_step != -1)
-        codestream.access_qcd().set_irrev_quant(quantization_step);
-      codestream.set_planar(false);
-      if (profile_string[0] != '\0')
-        codestream.set_profile(profile_string);
-      codestream.set_tilepart_divisions(tileparts_at_resolutions,
-        tileparts_at_components);
-      codestream.request_tlm_marker(tlm_marker);
-
-      if (dims.w != 0 || dims.h != 0)
-        OJPH_WARN(0x01000071,
-          "-dims option is not needed and was not used\n");
-      if (num_components != 0)
-        OJPH_WARN(0x01000072,
-          "-num_comps is not needed and was not used\n");
-      if (is_signed[0] != -1)
-        OJPH_WARN(0x01000073,
-          "-signed is not needed and was not used\n");
-      if (comp_downsampling[0].x != 0 || comp_downsampling[0].y != 0)
-        OJPH_WARN(0x01000075,
-          "-downsamp is not needed and was not used\n");
-
-      base = &dpx;
+        dpx.open(input_filename);
+        ojph::param_siz siz = codestream.access_siz();
+        siz.set_image_extent(ojph::point(image_offset.x + dpx.get_size().w,
+          image_offset.y + dpx.get_size().h));
+        ojph::ui32 num_comps = dpx.get_num_components();
+        siz.set_num_components(num_comps);
+        //if (num_bit_depths > 0)
+        //  dpx.set_bit_depth(num_bit_depths, bit_depth);
+        for (ojph::ui32 c = 0; c < num_comps; ++c)
+          siz.set_component(c, dpx.get_comp_subsampling(c),
+            dpx.get_bit_depth(c), dpx.get_is_signed(c));
+        siz.set_image_offset(image_offset);
+        siz.set_tile_size(tile_size);
+        siz.set_tile_offset(tile_offset);
+
+        ojph::param_cod cod = codestream.access_cod();
+        cod.set_num_decomposition(num_decompositions);
+        cod.set_block_dims(block_size.w, block_size.h);
+        if (num_precincts != -1)
+          cod.set_precinct_size(num_precincts, precinct_size);
+        cod.set_progression_order(prog_order);
+        if (employ_color_transform == -1 && num_comps >= 3)
+          cod.set_color_transform(true);
+        else
+          cod.set_color_transform(employ_color_transform == 1);
+        cod.set_reversible(reversible);
+        if (!reversible && quantization_step != -1)
+          codestream.access_qcd().set_irrev_quant(quantization_step);
+        codestream.set_planar(false);
+        if (profile_string[0] != '\0')
+          codestream.set_profile(profile_string);
+        codestream.set_tilepart_divisions(tileparts_at_resolutions,
+          tileparts_at_components);
+        codestream.request_tlm_marker(tlm_marker);
+
+        if (dims.w != 0 || dims.h != 0)
+          OJPH_WARN(0x01000071,
+            "-dims option is not needed and was not used\n");
+        if (num_components != 0)
+          OJPH_WARN(0x01000072,
+            "-num_comps is not needed and was not used\n");
+        if (is_signed[0] != -1)
+          OJPH_WARN(0x01000073,
+            "-signed is not needed and was not used\n");
+        if (comp_downsampling[0].x != 0 || comp_downsampling[0].y != 0)
+          OJPH_WARN(0x01000075,
+            "-downsamp is not needed and was not used\n");
+
+        base = &dpx;
       }
       else
 #if defined( OJPH_ENABLE_TIFF_SUPPORT)
diff --git a/src/apps/ojph_expand/CMakeLists.txt b/src/apps/ojph_expand/CMakeLists.txt
new file mode 100644
index 00000000..a0abda5f
--- /dev/null
+++ b/src/apps/ojph_expand/CMakeLists.txt
@@ -0,0 +1,58 @@
+## building ojph_expand
+#######################
+# Locate the application source and the shared image I/O sources and header.
+file(GLOB OJPH_EXPAND         "ojph_expand.cpp")
+file(GLOB OJPH_IMG_IO         "../others/ojph_img_io.cpp")
+file(GLOB OJPH_IMG_IO_SSE4    "../others/ojph_img_io_sse41.cpp")
+file(GLOB OJPH_IMG_IO_AVX2    "../others/ojph_img_io_avx2.cpp")
+file(GLOB OJPH_IMG_IO_H       "../common/ojph_img_io.h")
+
+list(APPEND SOURCES ${OJPH_EXPAND} ${OJPH_IMG_IO} ${OJPH_IMG_IO_H})
+
+source_group("main"        FILES ${OJPH_EXPAND})
+source_group("others"      FILES ${OJPH_IMG_IO})
+source_group("common"      FILES ${OJPH_IMG_IO_H})
+
+if(EMSCRIPTEN)
+  if (OJPH_ENABLE_WASM_SIMD)
+    list(APPEND SOURCES ${OJPH_IMG_IO_SSE4})
+    source_group("others" FILES ${OJPH_IMG_IO_SSE4})
+    set_source_files_properties(${OJPH_IMG_IO_SSE4} PROPERTIES COMPILE_FLAGS -msse4.1)
+  endif()
+else()
+  if (NOT OJPH_DISABLE_SIMD)
+    if (("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_X86_64")
+      OR ("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_I386")
+      OR MULTI_GEN_X86_64)
+
+      if (NOT OJPH_DISABLE_SSE4)
+        list(APPEND SOURCES ${OJPH_IMG_IO_SSE4})
+        source_group("others" FILES ${OJPH_IMG_IO_SSE4})
+      endif()
+      if (NOT OJPH_DISABLE_AVX2)
+        list(APPEND SOURCES ${OJPH_IMG_IO_AVX2})
+        source_group("others" FILES ${OJPH_IMG_IO_AVX2})
+      endif()
+
+      # Per-file SIMD flags; with MSVC only the AVX2 file gets an explicit flag
+      if (MSVC)
+        set_source_files_properties(${OJPH_IMG_IO_AVX2} PROPERTIES COMPILE_FLAGS "/arch:AVX2")
+      else()
+        set_source_files_properties(${OJPH_IMG_IO_SSE4} PROPERTIES COMPILE_FLAGS -msse4.1)
+        set_source_files_properties(${OJPH_IMG_IO_AVX2} PROPERTIES COMPILE_FLAGS -mavx2)
+      endif()
+    endif()
+
+    if (("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_ARM") OR MULTI_GEN_ARM64)
+      # Placeholder: no ARM-specific image I/O sources are added here
+    endif()
+
+  endif()
+
+endif()
+
+add_executable(ojph_expand ${SOURCES})
+target_include_directories(ojph_expand PRIVATE ../common)
+target_link_libraries(ojph_expand PRIVATE openjph $<TARGET_NAME_IF_EXISTS:TIFF::TIFF>)
+
+install(TARGETS ojph_expand)
diff --git a/src/apps/ojph_expand/ojph_expand.cpp b/src/apps/ojph_expand/ojph_expand.cpp
index dfee3cef..44fe4c5b 100644
--- a/src/apps/ojph_expand/ojph_expand.cpp
+++ b/src/apps/ojph_expand/ojph_expand.cpp
@@ -63,14 +63,14 @@ struct ui32_list_interpreter : public ojph::cli_interpreter::arg_inter_base
     {
       if (num_eles)
       {
-        if (*next_char != ',') //separate sizes by a comma
-          throw "sizes in a sizes list must be separated by a comma";
+        if (*next_char != ',') //separate res by a comma
+          throw "resolutions in a list must be separated by a comma";
         next_char++;
       }
       char *endptr;
       si32list[num_eles] = (ojph::ui32)strtoul(next_char, &endptr, 10);
       if (endptr == next_char)
-        throw "size number is improperly formatted";
+        throw "resolution number is improperly formatted";
       next_char = endptr;
       ++num_eles;
     }
@@ -81,7 +81,7 @@ struct ui32_list_interpreter : public ojph::cli_interpreter::arg_inter_base
         throw "list elements must separated by a "",""";
     }
     else if (*next_char)
-        throw "there are too many elements in the size list";
+        throw "there are too many elements in the resolution list";
   }
 
   const int max_num_eles;
@@ -173,13 +173,13 @@ int main(int argc, char *argv[]) {
   if (argc <= 1) {
     std::cout <<
     "\nThe following arguments are necessary:\n"
-    " -i input file name\n"
+    " -i <input file name>\n"
 #ifdef OJPH_ENABLE_TIFF_SUPPORT
-    " -o output file name (either pgm, ppm, tif(f), or raw(yuv))\n\n"
+    " -o <output file name> (either pgm, ppm, tif(f), or raw(yuv))\n\n"
 #else
-    " -o output file name (either pgm, ppm, or raw(yuv))\n\n"
+    " -o <output file name> (either pgm, ppm, or raw(yuv))\n\n"
 #endif // !OJPH_ENABLE_TIFF_SUPPORT
-    "The following arguments are options:\n"
+    "The following arguments are optional:\n"
     " -skip_res  x,y a comma-separated list of two elements containing the\n"
     "            number of resolutions to skip. You can specify 1 or 2\n"
     "            parameters; the first specifies the number of resolution\n"
@@ -187,8 +187,10 @@ int main(int argc, char *argv[]) {
     "            number of skipped resolution for reconstruction, which is\n"
     "            either equal to the first or smaller. If the second is not\n"
     "            specified, it is made to equal to the first.\n"
-    " -resilient true if you want the decoder to be more tolerant of errors\n"
-    "            in the codestream\n\n"
+    " -resilient <true | false> if 'true', the decoder will not exit when\n"
+    "            running into recoverable errors in the codestream.\n"
+    "            Default: 'false'.\n"
+    "\n"
     ;
     return -1;
   }
@@ -203,14 +205,15 @@ int main(int argc, char *argv[]) {
 
   try {
     if (output_filename == NULL)
-      OJPH_ERROR(0x020000008,
-                 "Please provide and output file using the -o option\n");
+      OJPH_ERROR(0x02000001,
+                 "Please provide an output file using the -o option\n");
 
     ojph::j2c_infile j2c_file;
     j2c_file.open(input_filename);
     ojph::codestream codestream;
 
     ojph::ppm_out ppm;
+    ojph::pfm_out pfm;
     #ifdef OJPH_ENABLE_TIFF_SUPPORT
     ojph::tif_out tif;
     #endif /* OJPH_ENABLE_TIFF_SUPPORT */
@@ -231,9 +234,9 @@ int main(int argc, char *argv[]) {
       {
 
         if (siz.get_num_components() != 1)
-          OJPH_ERROR(0x020000001,
+          OJPH_ERROR(0x02000002,
             "The file has more than one color component, but .pgm can "
-            "contain only on color component\n");
+            "contain only one color component\n");
         ppm.configure(siz.get_recon_width(0), siz.get_recon_height(0),
                       siz.get_num_components(), siz.get_bit_depth(0));
         ppm.open(output_filename);
@@ -245,7 +248,7 @@ int main(int argc, char *argv[]) {
         ojph::param_siz siz = codestream.access_siz();
 
         if (siz.get_num_components() != 3)
-          OJPH_ERROR(0x020000002,
+          OJPH_ERROR(0x02000003,
             "The file has %d color components; this cannot be saved to"
             " a .ppm file\n", siz.get_num_components());
         bool all_same = true;
@@ -256,14 +259,46 @@ int main(int argc, char *argv[]) {
           all_same = all_same && (p1.x == p.x) && (p1.y == p.y);
         }
         if (!all_same)
-          OJPH_ERROR(0x020000003,
+          OJPH_ERROR(0x02000004,
             "To save an image to ppm, all the components must have the "
-            "downsampling ratio\n");
+            "same downsampling ratio\n");
         ppm.configure(siz.get_recon_width(0), siz.get_recon_height(0),
                       siz.get_num_components(), siz.get_bit_depth(0));
         ppm.open(output_filename);
         base = &ppm;
       }
+      else if (is_matching(".pfm", v))
+      {
+        OJPH_INFO(0x02000010, "Note: The .pfm implementation is "
+          "experimental.  Here, we are assuming that the original data is "
+          "floating-point numbers.");
+
+        codestream.set_planar(false);
+        ojph::param_siz siz = codestream.access_siz();
+
+        ojph::ui32 num_comps = siz.get_num_components();
+        if (num_comps != 3 && num_comps != 1)
+          OJPH_ERROR(0x0200000C,
+            "The file has %d color components; this cannot be saved to"
+            " a .pfm file", num_comps);
+        bool all_same = true;
+        ojph::point p = siz.get_downsampling(0);
+        for (ojph::ui32 i = 1; i < siz.get_num_components(); ++i) {
+          ojph::point p1 = siz.get_downsampling(i);
+          all_same = all_same && (p1.x == p.x) && (p1.y == p.y);
+        }
+        if (!all_same)
+          OJPH_ERROR(0x0200000D,
+            "To save an image to pfm, all the components must have the "
+            "same downsampling ratio");
+        ojph::ui32 bit_depth[3];
+        for (ojph::ui32 c = 0; c < siz.get_num_components(); ++c)
+          bit_depth[c] = siz.get_bit_depth(c);
+        pfm.configure(siz.get_recon_width(0), siz.get_recon_height(0),
+          siz.get_num_components(), -1.0f, bit_depth);
+        pfm.open(output_filename);
+        base = &pfm;
+      }
 #ifdef OJPH_ENABLE_TIFF_SUPPORT
       else if (is_matching(".tif", v) || is_matching(".tiff", v))
       {
@@ -278,9 +313,9 @@ int main(int argc, char *argv[]) {
           all_same = all_same && (p1.x == p.x) && (p1.y == p.y);
         }
         if (!all_same)
-          OJPH_ERROR(0x020000008,
+          OJPH_ERROR(0x02000005,
             "To save an image to tif(f), all the components must have the "
-            "downsampling ratio\n");
+            "same downsampling ratio\n");
         ojph::ui32 bit_depths[4] = { 0, 0, 0, 0 };
         for (ojph::ui32 c = 0; c < siz.get_num_components(); c++)
         {
@@ -298,12 +333,12 @@ int main(int argc, char *argv[]) {
         ojph::param_siz siz = codestream.access_siz();
 
         if (siz.get_num_components() != 3 && siz.get_num_components() != 1)
-          OJPH_ERROR(0x020000004,
+          OJPH_ERROR(0x02000006,
             "The file has %d color components; this cannot be saved to"
              " .yuv file\n", siz.get_num_components());
         ojph::param_cod cod = codestream.access_cod();
         if (cod.is_using_color_transform())
-          OJPH_ERROR(0x020000005,
+          OJPH_ERROR(0x02000007,
             "The current implementation of yuv file object does not"
             " support saving file when conversion from yuv to rgb is"
             " needed; in any case, this is not the normal usage of yuv"
@@ -325,7 +360,7 @@ int main(int argc, char *argv[]) {
         ojph::param_siz siz = codestream.access_siz();
 
         if (siz.get_num_components() != 1)
-          OJPH_ERROR(0x020000006,
+          OJPH_ERROR(0x02000008,
             "The file has %d color components; this cannot be saved to"
             " .raw file (only one component is allowed).\n", 
             siz.get_num_components());
@@ -338,17 +373,17 @@ int main(int argc, char *argv[]) {
       }
       else
 #ifdef OJPH_ENABLE_TIFF_SUPPORT
-        OJPH_ERROR(0x020000007,
+        OJPH_ERROR(0x02000009,
           "unknown output file extension; only pgm, ppm, tif(f) and raw(yuv))"
           " are supported\n");
 #else
-        OJPH_ERROR(0x020000006,
+        OJPH_ERROR(0x0200000A,
           "unknown output file extension; only pgm, ppm, and raw(yuv) are"
           " supported\n");
 #endif // !OJPH_ENABLE_TIFF_SUPPORT
     }
     else
-      OJPH_ERROR(0x020000007,
+      OJPH_ERROR(0x0200000B,
         "Please supply a proper output filename with a proper extension\n");
 
     codestream.create();
diff --git a/src/apps/ojph_stream_expand/CMakeLists.txt b/src/apps/ojph_stream_expand/CMakeLists.txt
new file mode 100644
index 00000000..c29c1178
--- /dev/null
+++ b/src/apps/ojph_stream_expand/CMakeLists.txt
@@ -0,0 +1,25 @@
+## building ojph_stream_expand
+##############################
+
+find_package(Threads REQUIRED)  # Threads::Threads is linked unconditionally
+
+file(GLOB OJPH_STREAM_EXPAND  "*.cpp" "*.h")  # sources in this directory
+file(GLOB OJPH_SOCKETS         "../others/ojph_sockets.cpp")
+file(GLOB OJPH_SOCKETS_H       "../common/ojph_sockets.h")
+file(GLOB OJPH_THREADS        "../others/ojph_threads.cpp")
+file(GLOB OJPH_THREADS_H      "../common/ojph_threads.h")
+
+list(APPEND SOURCES ${OJPH_STREAM_EXPAND} ${OJPH_SOCKETS} ${OJPH_SOCKETS_H} ${OJPH_THREADS} ${OJPH_THREADS_H})
+
+source_group("main"        FILES ${OJPH_STREAM_EXPAND})
+source_group("others"      FILES ${OJPH_SOCKETS} ${OJPH_THREADS})
+source_group("common"      FILES ${OJPH_SOCKETS_H} ${OJPH_THREADS_H})
+
+add_executable(ojph_stream_expand ${SOURCES})
+target_include_directories(ojph_stream_expand PRIVATE ../common)
+target_link_libraries(ojph_stream_expand PRIVATE openjph Threads::Threads)
+if(WIN32)  # ws2_32 is linked on Windows only
+    target_link_libraries(ojph_stream_expand PRIVATE ws2_32)
+endif()
+
+install(TARGETS ojph_stream_expand)
diff --git a/src/apps/ojph_stream_expand/ojph_stream_expand.cpp b/src/apps/ojph_stream_expand/ojph_stream_expand.cpp
new file mode 100644
index 00000000..641d3ef8
--- /dev/null
+++ b/src/apps/ojph_stream_expand/ojph_stream_expand.cpp
@@ -0,0 +1,373 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2024, Aous Naman
+// Copyright (c) 2024, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2024, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_stream_expand.cpp
+// Author: Aous Naman
+// Date: 17 April 2024
+//***************************************************************************/
+
+#include <iostream>
+#include "ojph_message.h"
+#include "ojph_arg.h"
+#include "ojph_sockets.h"
+#include "ojph_threads.h"
+#include "stream_expand_support.h"
+
+#ifdef OJPH_OS_WINDOWS
+
+#else
+  #include <arpa/inet.h>
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+static
+bool get_arguments(int argc, char *argv[],
+                   char *&recv_addr, char *&recv_port,
+                   char *&src_addr, char *&src_port,
+                   char *&target_name, ojph::ui32& num_threads,
+                   ojph::ui32& num_inflight_packets,
+                   ojph::ui32& recvfrm_buf_size, bool& blocking,
+                   bool& quiet)
+{ // Parses the command line into the reference parameters.  Returns false,
+  // after printing a message, when arguments are left over, when -addr or
+  // -port is missing, or when a count is below 1; returns true otherwise.
+  ojph::cli_interpreter cli;
+  cli.init(argc, argv);
+
+  // options that are followed by a value
+  cli.reinterpret("-addr", recv_addr);
+  cli.reinterpret("-port", recv_port);
+  cli.reinterpret("-src_addr", src_addr);
+  cli.reinterpret("-src_port", src_port);
+  cli.reinterpret("-o", target_name);
+  cli.reinterpret("-num_threads", num_threads);
+  cli.reinterpret("-num_packets", num_inflight_packets);
+  cli.reinterpret("-recv_buf_size", recvfrm_buf_size);
+
+  // switches that take no value
+  blocking = cli.reinterpret("-blocking");
+  quiet = cli.reinterpret("-quiet");
+
+  if (!cli.is_exhausted()) { // list every argument that was not consumed
+    printf("The following arguments were not interpreted:\n");
+    ojph::argument extra = cli.get_argument_zero();
+    for (extra = cli.get_next_avail_argument(extra); extra.is_valid();
+         extra = cli.get_next_avail_argument(extra))
+      printf("%s\n", extra.arg);
+    return false;
+  }
+
+  // -addr and -port are mandatory
+  if (!recv_addr) {
+    printf("Please use \"-addr\" to provide a receiving address, "
+      "\"localhost\" or a local network card IPv4 address.\n");
+    return false;
+  }
+  if (!recv_port) {
+    printf("Please use \"-port\" to provide a port number.\n");
+    return false;
+  }
+  // both counts are unsigned, so "below 1" means exactly 0
+  if (num_threads == 0) {
+    printf("Please set \"-num_threads\" to 1 or more.\n");
+    return false;
+  }
+  if (num_inflight_packets == 0) {
+    printf("Please set \"-num_packets\" to 1 or more.\n");
+    return false;
+  }
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+int main(int argc, char* argv[])
+{ // Binds a UDP socket and passes every received packet to packets_handler.
+  char *recv_addr = NULL;               // -addr (mandatory)
+  char *recv_port = NULL;               // -port (mandatory)
+  char *src_addr = NULL;                // -src_addr (optional filter)
+  char *src_port = NULL;                // -src_port (optional filter)
+  char *target_name = NULL;             // -o
+  ojph::ui32 num_threads = 2;           // -num_threads
+  ojph::ui32 num_inflight_packets = 5;  // -num_packets
+  ojph::ui32 recvfrm_buf_size = 65536;  // -recv_buf_size
+  bool blocking = false;                // -blocking
+  bool quiet = false;                   // -quiet
+
+  if (argc <= 1) { // no arguments: print the usage text and quit
+    printf(
+    "\n"
+    "The following arguments are necessary:\n"
+    " -addr          <receiving IPv4 address>, or\n"
+    "                The address should be either localhost, or\n"
+    "                a local network card IPv4 address\n"
+    "                example: -addr 127.0.0.1\n"
+    " -port          <listening port>\n"
+    "\n"
+    "The following arguments are options:\n"
+    " -src_addr      <source ipv4 address>, packets from other sources\n"
+    "                will be ignored. If not specified, then packets\n"
+    "                from any source are accepted.\n"
+    " -src_port      <source port>, packets from other source ports are\n"
+    "                ignored. If not specified, then packets from any\n"
+    "                port are accepted -- I would recommend not leaving\n"
+    "                this one out.\n"
+    " -recv_buf_size <integer> recvfrom buffer size; default is 65536.\n"
+    "                This is the size of the operating system's receive\n"
+    "                buffer, before packets are picked by the program.\n"
+    "                Larger buffers reduces the likelihood that a packet\n"
+    "                is dropped before the program has a chance to pick it.\n"
+    " -blocking      sets the receiving socket blocking mode to blocking.\n"
+    "                The default mode is non-blocking. A blocking socket\n"
+    "                increases the likelihood of not receiving some\n"
+    "                packets; this is because the thread get into sleep\n"
+    "                state, and therefore takes sometime to wakeup. A\n"
+    "                non-blocking socket increase power consumption,\n"
+    "                because it prevents the thread from sleeping.\n"
+    " -num_threads   <integer> number of threads for decoding and\n"
+    "                displaying files.  This number also determines the\n"
+    "                number of in-flight files, not completely\n"
+    "                saved/processed yet. The number of files is set to\n"
+    "                number of threads + 1\n"
+    " -num_packets   <integer> number of in-flight packets; this is a\n"
+    "                window of packets in which packets can be re-ordered.\n"
+    " -o             <string> target file name without extension; the same\n"
+    "                printf formating can be used. For example,\n"
+    "                output_%%05d. An extension will be added, either .j2c\n"
+    "                for original frames, or .ppm for decoded images.\n"
+    " -quiet         use to stop printing informative messages.\n"
+    "\n"
+    );
+    exit(-1);
+  }
+  if (!get_arguments(argc, argv, recv_addr, recv_port, src_addr, src_port,
+                     target_name, num_threads, num_inflight_packets,
+                     recvfrm_buf_size, blocking, quiet))
+  {
+    exit(-1); // get_arguments has already printed the reason
+  }
+
+  try {
+    ojph::thds::thread_pool thread_pool;
+    thread_pool.init(num_threads);
+    ojph::stex::frames_handler frames_handler;
+    frames_handler.init(quiet, target_name, &thread_pool);
+    ojph::stex::packets_handler packets_handler;
+    packets_handler.init(quiet, num_inflight_packets, &frames_handler);
+    ojph::net::socket_manager smanager;
+
+    // listening address/port; "localhost" is mapped to 127.0.0.1
+    struct sockaddr_in server = {}; // zeroed so no field is left undefined
+    {
+      server.sin_family = AF_INET;
+      const char *p = recv_addr;
+      const char localhost[] = "127.0.0.1";
+      if (strcmp(recv_addr, "localhost") == 0)
+        p = localhost;
+      int result = inet_pton(AF_INET, p, &server.sin_addr);
+      if (result != 1)
+        OJPH_ERROR(0x02000001, "Please provide a valid IPv4 address when "
+          "using \"-addr,\" the provided address %s is not valid", 
+          recv_addr);
+      ojph::ui16 port_number = 0;
+      port_number = (ojph::ui16)atoi(recv_port);
+      if (port_number == 0) // recv_port is text, so it is printed with %s
+        OJPH_ERROR(0x02000002, "Please provide a valid port number. "
+            "The number you provided is %s", recv_port);
+      server.sin_port = htons(port_number);
+    }
+
+    // create a UDP socket
+    ojph::net::socket s;
+    s = smanager.create_socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
+    if(s.intern() == OJPH_INVALID_SOCKET)
+    {
+      std::string err = smanager.get_last_error_message();
+      OJPH_ERROR(0x02000003, "Could not create socket: %s", err.data());
+    }
+
+    // change recv buffer size; default is 65536 (failure is only reported)
+    if (::setsockopt(s.intern(), SOL_SOCKET, SO_RCVBUF,
+                   (char*)&recvfrm_buf_size, sizeof(recvfrm_buf_size)) == -1)
+    {
+      std::string err = smanager.get_last_error_message();
+      OJPH_INFO(0x02000001,
+        "Failed to expand receive buffer: %s", err.data());
+    }
+
+    // set the socket's blocking mode; non-blocking unless -blocking is given
+    if (s.set_blocking_mode(blocking) == false)
+    {
+      std::string err = smanager.get_last_error_message();
+      OJPH_INFO(0x02000002,
+        "Failed to set the socket's blocking mode to %s, with error %s", 
+        blocking ? "blocking" : "non-blocking", err.data());
+    }
+
+    // bind to listening address
+    if(bind(s.intern(), (struct sockaddr *)&server, sizeof(server)) == -1)
+    {
+      std::string err = smanager.get_last_error_message();
+      OJPH_ERROR(0x02000004, 
+        "Could not bind address to socket: %s", err.data());
+    }
+
+    if (!quiet) {
+      constexpr int buf_size = 128;
+      char buf[buf_size];
+      ojph::ui32 addr = smanager.get_addr(server);
+      const char* t = inet_ntop(AF_INET, &addr, buf, buf_size);
+      if (t == NULL) {
+        std::string err = smanager.get_last_error_message();
+        OJPH_INFO(0x02000005,
+          "Error converting source address: %s", err.data());
+      } // t may be NULL here, and NULL must not reach printf's %s
+      printf("Listening on %s, port %d\n", t ? t : "?", ntohs(server.sin_port));
+    }
+
+    // process the source IPv4 address and port used to filter senders
+    ojph::ui32 saddr = 0;
+    if (src_addr)
+    {
+      const char *p = src_addr;
+      const char localhost[] = "127.0.0.1";
+      if (strcmp(src_addr, "localhost") == 0)
+        p = localhost;
+      struct sockaddr_in t;
+      int result = inet_pton(AF_INET, p, &t.sin_addr);
+      if (result != 1)
+        OJPH_ERROR(0x02000005, "Please provide a valid IPv4 address when "
+          "using \"-src_addr,\" the provided address %s is not valid", 
+          src_addr);
+      saddr = smanager.get_addr(t);
+    }
+    ojph::ui16 sport = 0; // kept in host byte order
+    if (src_port) // parse the port only when -src_port was supplied
+    {
+      sport = (ojph::ui16)atoi(src_port);
+      if (sport == 0) // src_port is text, so it is printed with %s
+        OJPH_ERROR(0x02000006, "Please provide a valid port number. "
+            "The number you provided is %s", src_port);
+    }
+
+    // listen to incoming data, and forward it to packet_handler
+    struct sockaddr_in si_other;
+    socklen_t socklen = sizeof(si_other);
+    bool src_printed = false;
+    ojph::stex::rtp_packet* packet = NULL;
+    ojph::ui32 last_time_stamp = 0;
+    while (1)
+    {
+      // get a fresh packet unless the current one is still empty
+      if (packet == NULL || packet->num_bytes != 0)
+        packet = packets_handler.exchange(packet);
+      if (packet == NULL)
+        continue;
+      packet->num_bytes = 0;
+
+      // receive data
+      int num_bytes = (int)recvfrom(s.intern(), (char*)packet->data,
+        packet->max_size, 0, (struct sockaddr*)&si_other, &socklen);
+
+      if (num_bytes < 0) // error or non-blocking call
+      {
+        int last_error = smanager.get_last_error();
+        if (last_error != OJPH_EWOULDBLOCK)
+        {
+          std::string err = smanager.get_error_message(last_error);
+          OJPH_INFO(0x02000003, "Failed to receive data: %s", err.data());
+        }
+        continue; // if we wish to continue
+      }
+
+      // drop packets from other senders; sin_port is in network byte order
+      if ((src_addr && saddr != smanager.get_addr(si_other)) ||
+        (src_port && sport != ntohs(si_other.sin_port))) {
+        constexpr int buf_size = 128;
+        char buf[buf_size];
+        ojph::ui32 addr = smanager.get_addr(si_other);
+        const char* t = inet_ntop(AF_INET, &addr, buf, buf_size);
+        if (t == NULL) {
+          std::string err = smanager.get_last_error_message();
+          OJPH_INFO(0x02000004,
+            "Error converting source address: %s", err.data());
+        }
+        printf("Source mismatch %s, port %d\n",
+          t ? t : "?", ntohs(si_other.sin_port));
+        continue; // num_bytes stays 0, so the same packet is reused
+      }
+
+      packet->num_bytes = (ojph::ui32)num_bytes;
+
+      if (last_time_stamp == 0)
+        last_time_stamp = packet->get_time_stamp();
+
+      if (!quiet && !src_printed) // announce the sender once
+      {
+        constexpr int buf_size = 128;
+        char buf[buf_size];
+        ojph::ui32 addr = smanager.get_addr(si_other);
+        const char* t = inet_ntop(AF_INET, &addr, buf, buf_size);
+        if (t == NULL) {
+          std::string err = smanager.get_last_error_message();
+          OJPH_INFO(0x02000006, 
+            "Error converting source address: %s", err.data());
+        }
+        printf("Receiving data from %s, port %d\n",
+          t ? t : "?", ntohs(si_other.sin_port));
+        src_printed = true;
+      }
+
+      if (!quiet)
+        if (packet->get_time_stamp() >= last_time_stamp + 45000)
+        { // print statistics every 45000 ticks; one second is 90000
+          last_time_stamp = packet->get_time_stamp();
+          ojph::ui32 lost_packets = packets_handler.get_num_lost_packets();
+          ojph::ui32 total_frames = 0, trunc_frames = 0, lost_frames = 0;
+          frames_handler.get_stats(total_frames, trunc_frames, lost_frames);
+
+          printf("Total frame %d, truncated frames %d, lost frames %d, "
+            "packets lost %d\n",
+            total_frames, trunc_frames, lost_frames, lost_packets);
+        }
+    }
+    s.close(); // not reached normally: the loop above has no exit
+  }
+  catch (const std::exception& e)
+  {
+    const char *p = e.what();
+    if (strncmp(p, "ojph error", 10) != 0) // print only non-"ojph error" text
+      printf("%s\n", p);
+    exit(-1);
+  }
+
+  return 0;
+}
+
diff --git a/src/apps/ojph_stream_expand/stream_expand_support.cpp b/src/apps/ojph_stream_expand/stream_expand_support.cpp
new file mode 100644
index 00000000..1a56d40c
--- /dev/null
+++ b/src/apps/ojph_stream_expand/stream_expand_support.cpp
@@ -0,0 +1,464 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2024, Aous Naman
+// Copyright (c) 2024, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2024, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: stream_expand_support.cpp
+// Author: Aous Naman
+// Date: 18 April 2024
+//***************************************************************************/
+
+#include <cassert>
+#include <cstddef>
+#include "ojph_threads.h"
+#include "threaded_frame_processors.h"
+#include "stream_expand_support.h"
+
+namespace ojph
+{
+namespace stex
+{
+
+///////////////////////////////////////////////////////////////////////////////
+// 
+// 
+// static comparison functions
+//
+//
+///////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////
+// Compares two 32 bit values, A with B, with the possibility A or B has 
+// undergone overflow. This problem has no proper solution, but here we 
+// assume that the value B approximately divides the space into two regions, 
+// a region larger than B and a region smaller than B.  This leaves one 
+// undetermined value that lies at the opposite end of B, a case we just 
+// ignore -- it is part of smaller.
+// NB: This is my current thinking -- I might be wrong
+static inline bool is_greater32(ui32 a, ui32 b)
+{ // true when the wrapped difference a - b lies in [1, 0x7FFFFFFF]
+  const ui32 diff = a - b; // unsigned subtraction wraps modulo 2^32
+  return diff != 0u && diff < 0x80000000u;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Compares two 32 bit values, A with B, with the possibility A or B has 
+// undergone overflow. This problem has no proper solution, but here we 
+// assume that the value B approximately divides the space into two regions, 
+// a region larger than B and a region smaller than B.  This leaves one 
+// undetermined value that lies at the opposite end of B, a case we just 
+// ignore -- it is part of smaller.
+// NB: This is my current thinking -- I might be wrong
+static inline bool is_smaller32(ui32 a, ui32 b)
+{ // true when the wrapped difference a - b lies in [0x80000000, 0xFFFFFFFF]
+  const ui32 diff = a - b; // unsigned subtraction wraps modulo 2^32
+  return diff >= 0x80000000u; // a ui32 can never exceed 0xFFFFFFFF
+}
+
+///////////////////////////////////////////////////////////////////////////////
+static inline bool is_greater24(ui32 a, ui32 b)
+{ a <<= 8; b <<= 8; return is_greater32(a, b); } // low 24 bits to the top
+
+///////////////////////////////////////////////////////////////////////////////
+static inline bool is_smaller24(ui32 a, ui32 b)
+{ a <<= 8; b <<= 8; return is_smaller32(a, b); } // low 24 bits to the top
+
+///////////////////////////////////////////////////////////////////////////////
+static inline ui32 clip_seq_num(ui32 n) { return n & 0x00FFFFFFu; } // 24 bits
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//
+//
+///////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////
+void packets_handler::init(bool quiet, ui32 num_packets,
+                           frames_handler* frames)
+{ // Allocates num_packets packets and chains them all into the avail list.
+  assert(this->num_packets == 0); // trips if num_packets was already set
+  avail = packet_store = new rtp_packet[num_packets];
+  const ui32 last = num_packets - 1; // assumes num_packets >= 1
+  for (ui32 k = 0; k < last; ++k) // each packet is given its successor
+    packet_store[k].init(&packet_store[k + 1]);
+  packet_store[last].init(NULL); // the last packet has no successor
+  this->quiet = quiet;
+  this->num_packets = num_packets;
+  this->frames = frames;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+rtp_packet* packets_handler::exchange(rtp_packet* p)
+{ // p is the packet at the front of in_use (NULL when in_use is empty)
+  assert(num_packets > 0 && p == in_use);
+  // returns the packet to fill next, which sits at the front of in_use
+  if (p != NULL) {
+    if (p->num_bytes == 0) // nothing was stored in p; hand it back as is
+      return p;
+
+    if (last_seq_num == 0) // initialization
+      last_seq_num = clip_seq_num(p->get_seq_num() - 1);
+
+    // packet is old, and is ignored -- no need to include it in the 
+    // lost packets, because this packet was considered lost previously.
+    // This also captures the case where the previous packet and this packet
+    // have the same sequence number, which is rather weird but possible
+    // if some intermediate network unit retransmits packets.
+    if (is_smaller24(p->get_seq_num(), clip_seq_num(last_seq_num + 1)))
+      return p;
+    else if (p->get_seq_num() == clip_seq_num(last_seq_num + 1))
+    { // p is exactly the packet we were waiting for
+      consume_packet();
+      // see if we can push one packet from the top of the buffer
+      if (in_use && in_use->get_seq_num() == clip_seq_num(last_seq_num + 1))
+        consume_packet();
+    }
+    else // sequence larger than expected
+    {
+      // Place the packet in the in_use queue according to its sequence
+      // number; we may have to move it down the queue. The in_use queue is 
+      // always arranged in an ascending order, where the top of the queue 
+      // (pointed to by in_use) has the smallest sequence number.
+      if (in_use->next != NULL) // we have more than 1 packet in queue
+      { 
+        rtp_packet* t = in_use; // t ends on the packet that should precede p
+        while (t->next != NULL && 
+          is_greater24(p->get_seq_num(), t->next->get_seq_num()))
+          t = t->next;
+
+        if (t->next != NULL && p->get_seq_num() == t->next->get_seq_num())
+        { // this is a repeated packet and must be removed
+          in_use = in_use->next;
+          p->next = avail;
+          avail = p;
+        }
+        else {
+          if (t == in_use) // at front of queue -- exactly where it should be
+          { } // do nothing
+          else if (t->next == NULL) { // at the end of queue
+            in_use = in_use->next; // remove p from the queue
+            t->next = p;
+            p->next = NULL;
+          }
+          else { // in the middle of the queue
+            in_use = in_use->next; // p removed from the start of queue
+            p->next = t->next;
+            t->next = p;
+          }
+        }
+      }
+
+      // If avail == NULL, all packets are being used (in_use), meaning 
+      // the queue is already full. We push the packet at the top of the
+      // in_use queue, and count the packets skipped before it as lost.
+      // If avail != NULL, we push one packet from the top of the buffer, 
+      // if it has the correct sequence number.
+      if (avail == NULL || 
+          in_use->get_seq_num() == clip_seq_num(last_seq_num + 1))
+      {
+        if (avail == NULL) // gap is taken modulo 2^24 to survive wrap-around
+          lost_packets += clip_seq_num(
+            in_use->get_seq_num() - clip_seq_num(last_seq_num + 1));
+        consume_packet();
+        if (in_use && in_use->get_seq_num() == clip_seq_num(last_seq_num + 1))
+          consume_packet();
+      }
+    }
+  }
+
+  // move from avail to in_use -- there must be at least one packet in avail
+  assert(avail != NULL);
+  p = avail;
+  avail = avail->next;
+  p->next = in_use;
+  in_use = p;
+  return p;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+void packets_handler::flush()
+{ // Moves every packet from in_use to avail.
+  // Nothing is pushed to the frames handler here.
+  for (rtp_packet* head = in_use; head != NULL; head = in_use)
+  {
+    in_use = head->next; // unlink from the front of in_use
+    head->next = avail;  // push onto the front of avail
+    avail = head;
+  }
+  // in_use is NULL at this point
+}
+
+///////////////////////////////////////////////////////////////////////////////
+void packets_handler::consume_packet()
+{ // Pushes the packet at the front of in_use to frames, then recycles it.
+  rtp_packet* const front = in_use; // callers ensure in_use is not NULL
+  last_seq_num = front->get_seq_num();
+  frames->push(front);
+  // unlink the packet from in_use and push it onto the avail list
+  in_use = front->next;
+  front->next = avail;
+  avail = front;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//
+//
+///////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////
+void stex_file::notify_file_completion()
+{ // Decrements done; the call that brings it to 0 notifies the parent.
+  const int before = done.fetch_sub(1, std::memory_order_acq_rel);
+  if (before == 1) // fetch_sub returns the old value, so done is now 0
+    parent->increment_num_complete_files();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//
+//
+///////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////
+frames_handler::~frames_handler()
+{ // Frees the two arrays that init() allocates with new[].
+  // delete[] on a null pointer is a no-op, so no null checks are needed.
+  // The storers are released first, then the files.
+  delete[] storers_store;
+  delete[] files_store;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+void frames_handler::init(bool quiet, const char *target_name,
+                          thds::thread_pool* thread_pool)
+{ // Creates num_threads + 1 files, each paired with one storer, all in avail.
+  this->quiet = quiet;
+  this->num_threads = (ui32)thread_pool->get_num_threads();
+  this->target_name = target_name;
+  num_files = num_threads + 1;
+  files_store = new stex_file[num_files];
+  avail = files_store; // every file starts in the avail list
+  storers_store = new j2k_frame_storer[num_files];
+  for (ui32 k = 0; k < num_files; ++k) {
+    stex_file* const file = files_store + k;
+    j2k_frame_storer* const storer = storers_store + k;
+    // the last file gets NULL as its successor
+    stex_file* const next_file = (k + 1 < num_files) ? file + 1 : NULL;
+    // open with 2 << 20 (2097152), then close straight away;
+    // presumably this pre-sizes the file's buffer -- TODO confirm
+    file->f.open(2 << 20, false);
+    file->f.close();
+    file->init(this, next_file, storer, target_name);
+    storer->init(file, target_name);
+  }
+  this->thread_pool = thread_pool;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+void frames_handler::push(rtp_packet* p)
+{ // Routes one packet: it starts, extends, or closes the frame in in_use.
+  assert(!is_smaller32(p->get_time_stamp(), last_time_stamp));
+  assert(!is_smaller24(p->get_seq_num(), last_seq_number));
+  last_seq_number = p->get_seq_num();
+
+  // check if any of the frames processed in other threads are done
+  check_files_in_processing();
+
+  // process newly received packet
+  if (p->get_packet_type() != rtp_packet::PT_BODY)
+  { // main packet payload
+    // NOTE(review): is_marked() is not tested on this path -- confirm intent
+    // The existence of a previous frame means we did not get the marked
+    // packet.  Here, we close the frame and move it to processing
+    if (in_use) {
+      ++trunc_frames;
+      send_to_processing();
+    }
+
+    // This is where we process a new frame, if there is space
+    if (avail)
+    {
+      // move from avail to in_use
+      in_use = avail;
+      avail = avail->next;
+      in_use->next = NULL;
+
+      assert(in_use->done.load(std::memory_order_acquire) == 0);
+      in_use->time_stamp = p->get_time_stamp();
+      in_use->last_seen_seq = p->get_seq_num();
+      in_use->frame_idx = total_frames;
+      in_use->f.open();
+      in_use->f.write(p->get_data(), p->get_data_size());
+    }
+    else // no free file: the frame is counted as lost and nothing is written
+      ++lost_frames;
+
+    ++total_frames; // counted whether or not a file was available
+    last_time_stamp = p->get_time_stamp();
+  }
+  else 
+  { // body packet payload
+    if (in_use != NULL)
+    {
+      if (p->get_time_stamp() == in_use->time_stamp)
+      { // this is a continuation of a previous frame
+        if (p->get_seq_num() == clip_seq_num(in_use->last_seen_seq + 1))
+        { // next packet in sequence: append it
+          in_use->last_seen_seq = p->get_seq_num();
+          in_use->f.write(p->get_data(), p->get_data_size());
+          if (p->is_marked()) // the marked packet closes the frame
+            send_to_processing();
+        }
+        else {
+          // we must have missed packets
+          ++trunc_frames;
+          send_to_processing();
+        }
+      }
+      else
+      {
+        // This is a different frame and we did not get the marked packet.
+        // We close the older frame and send it for processing
+        ++trunc_frames;
+        send_to_processing();
+
+        if (is_greater32(p->get_time_stamp(), last_time_stamp))
+        { // first packet seen with this time stamp: count a new frame
+          ++total_frames;
+          last_time_stamp = p->get_time_stamp();
+        }
+      }
+    }
+    else // no frame is being written
+    {
+      if (is_greater32(p->get_time_stamp(), last_time_stamp))
+      { // first packet seen with this time stamp: count a new frame
+        ++total_frames;
+        last_time_stamp = p->get_time_stamp();
+      }
+    }
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+void frames_handler::get_stats(ui32& total_frames, ui32& trunc_frames,
+                               ui32& lost_frames)
+{ const frames_handler& self = *this; // the parameters shadow the members
+  lost_frames = self.lost_frames;     // copy each counter to the caller
+  trunc_frames = self.trunc_frames;
+  total_frames = self.total_frames;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+bool frames_handler::flush()
+{ // Drops the frame being assembled, if any; returns true while the
+  // processing list is not empty.
+  check_files_in_processing(); // reclaim files that other threads finished
+
+  if (stex_file* pending = in_use)
+  {
+    // the file is closed and returned to avail, not sent to processing
+    pending->f.close();
+    pending->next = avail;
+    avail = pending;
+    in_use = NULL;
+  }
+
+  const bool still_processing = (processing != NULL);
+  return still_processing;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+void frames_handler::check_files_in_processing()
+{ // Moves files whose done counter is back at 0 from processing to avail.
+  // check if any of the frames processed in other threads are done
+  int nf = num_complete_files.load(std::memory_order_acquire);
+  if (nf > 0)
+  {
+    stex_file* f = processing, *pf = NULL; // pf trails f by one list entry
+    while(f != NULL && nf > 0)
+    {
+      // done == 0 marks a completed file
+      if (f->done.load(std::memory_order_acquire) == 0)
+      {
+        // count down only for a completed file; move it to avail
+        num_complete_files.fetch_add(-1, std::memory_order_relaxed);
+        f->time_stamp = 0;
+        f->last_seen_seq = 0;
+        f->frame_idx = 0;
+        if (f == processing)
+        {
+          processing = processing->next;
+          f->next = avail;
+          avail = f;
+          f = processing;        // for next test
+        }
+        else {
+          pf->next = f->next;
+          f->next = avail;
+          avail = f;
+          f = pf->next;         // for next test
+        }
+      }
+      else // still being processed: leave it in the list and move on
+      {
+        pf = f;
+        f = f->next;
+      }
+      nf = num_complete_files.load(std::memory_order_acquire);
+    }
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+void frames_handler::send_to_processing()
+{ stex_file* const frame = in_use; // frame to close; callers ensure non-NULL
+  frame->f.close();
+  if (target_name == NULL) { // no target name: recycle the file right away
+    frame->next = avail;
+    avail = frame;
+  }
+  else { // push onto the processing list and queue its storer as a task
+    frame->next = processing;
+    processing = frame;
+    frame->done.store(1, std::memory_order_relaxed);
+    thread_pool->add_task(frame->storer);
+  }
+  in_use = NULL;
+}
+
+} // !stex namespace
+} // !ojph namespace
\ No newline at end of file
diff --git a/src/apps/ojph_stream_expand/stream_expand_support.h b/src/apps/ojph_stream_expand/stream_expand_support.h
new file mode 100644
index 00000000..d05ea5e1
--- /dev/null
+++ b/src/apps/ojph_stream_expand/stream_expand_support.h
@@ -0,0 +1,554 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2024, Aous Naman
+// Copyright (c) 2024, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2024, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: stream_expand_support.h
+// Author: Aous Naman
+// Date: 18 April 2024
+//***************************************************************************/
+
+#ifndef OJPH_STR_EX_SUPPORT_H
+#define OJPH_STR_EX_SUPPORT_H
+
+#include <atomic>
+#include <cassert>
+#include "ojph_base.h"
+#include "ojph_file.h"
+#include "ojph_sockets.h"
+
+namespace ojph
+{
+  namespace thds 
+  { class thread_pool; }
+
+namespace stex // stream expand
+{
+
+// defined here
+class packets_handler;
+class frames_handler;
+
+// defined elsewhere
+struct j2k_frame_storer;
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//
+//
+///////////////////////////////////////////////////////////////////////////////
+
+/*****************************************************************************/
+/** @brief interprets RTP header and payload, and holds received packets.
+ *
+ *  This object interprets RFC 3550 and draft-ietf-avtcore-rtp-j2k-scl-00.
+ *  The implementation is not complete, but it is sufficient for the time
+ *  being.
+ *
+ *  Bytes 0-11 of data are read as the RTP header, bytes 12-19 as the
+ *  payload header, and the payload starts at byte 20.  Multi-byte fields
+ *  are assembled byte by byte in network (big-endian) order.
+ */
+struct rtp_packet
+{
+  /**
+   *  @brief packet types based on the main header of
+   *         draft-ietf-avtcore-rtp-j2k-scl-00
+   */
+  enum packet_type : ui32
+  {
+    PT_BODY                  = 0, // this is body packet
+    PT_MAIN_FOLLOWED_BY_MAIN = 1,
+    PT_MAIN_FOLLOWED_BY_BODY = 2,
+    PT_MAIN                  = 3, // frame has only one main packet
+  };
+public:
+  /**
+   *  @brief default constructor
+   */
+  rtp_packet() { num_bytes = 0; next = NULL; }
+
+  /**
+   *  @brief Call this to link packets.
+   *
+   *  @param next pointer to next packet
+   */
+  void init(rtp_packet* next) { this->next = next; }
+
+public:
+  // RTP header
+  ui32 get_rtp_version() const { return ((ui32)data[0]) >> 6; }
+  bool is_padded() const { return (data[0] & 0x20) != 0; }
+  bool is_extended() const { return (data[0] & 0x10) != 0; }
+  ui32 get_csrc_count() const { return (ui32)(data[0]) & 0xF; }
+  bool is_marked() const { return (data[1] & 0x80) != 0; }
+  ui32 get_payload_type() const { return (ui32)(data[1]) & 0x7F; }
+  ui32 get_seq_num() const {   // 16-bit sequence number extended by ESEQ
+    return (((ui32)data[15]) << 16) | (((ui32)data[2]) << 8) | (ui32)data[3];
+  }
+  ui32 get_time_stamp() const { return read_be32(4); }
+  ui32 get_ssrc() const { return read_be32(8); } // not used for the time being
+
+  // common in main and body payload headers
+  ui32 get_packet_type() const { return ((ui32)data[12]) >> 6; }
+  ui32 get_TP() const { return (((ui32)data[12]) >> 3) & 0x7; }
+  ui32 get_ORDH() const {
+    if (get_packet_type() != PT_BODY) return ((ui32)data[12]) & 0x7;
+    else return (((ui32)data[13]) >> 7) & 0x1;
+  }
+  ui32 get_PTSTAMP() const {
+    return ((((ui32)data[13]) & 0xF) << 8) | (ui32)data[14];
+  }
+  ui8* get_data() { return data + 20; }
+  ui32 get_data_size() const  // 0 when the packet is shorter than 20 bytes
+  { return num_bytes > 20 ? num_bytes - 20 : 0; }
+
+  // only in main payload header
+  bool is_PTSTAMP_used() const {
+    assert(get_packet_type() != PT_BODY);
+    return (((ui32)data[13]) & 0x80) != 0;
+  }
+  ui32 get_XTRAC() const {
+    assert(get_packet_type() != PT_BODY);
+    return (((ui32)data[13]) >> 4) & 0x7;
+  }
+  bool is_codestream_header_reusable() const {
+    assert(get_packet_type() != PT_BODY);
+    return (((ui32)data[16]) & 0x80) != 0;
+  }
+  bool is_component_colorimetry_used() const {
+    assert(get_packet_type() != PT_BODY);
+    return (((ui32)data[16]) & 0x40) != 0;
+  }
+  bool is_codeblock_caching_used() const {
+    assert(get_packet_type() != PT_BODY);
+    return (((ui32)data[16]) & 0x20) != 0;
+  }
+  bool is_RANGE() const {
+    assert(get_packet_type() != PT_BODY);
+    return ((ui32)data[16] & 1) != 0;
+  }
+  ui32 get_PRIMS() const {
+    assert(get_packet_type() != PT_BODY);
+    return (ui32)data[17];
+  }
+  ui32 get_TRANS() const {
+    assert(get_packet_type() != PT_BODY);
+    return (ui32)data[18];
+  }
+  ui32 get_MAT() const {
+    assert(get_packet_type() != PT_BODY);
+    return (ui32)data[19];
+  }
+
+  // only in body payload header
+  ui32 get_RES() const {
+    assert(get_packet_type() == PT_BODY);
+    return ((ui32)data[12]) & 0x7;
+  }
+  ui32 get_QUAL() const {
+    assert(get_packet_type() == PT_BODY);
+    return (((ui32)data[13]) >> 4) & 0x7;
+  }
+  ui32 get_data_pos() const {
+    ui32 result = 0;
+    if (get_packet_type() == PT_BODY) {
+      result = ((ui32)data[16]) << 4;
+      result |= (((ui32)data[17]) >> 4) & 0xF;
+    }
+    return result;
+  }
+  ui32 get_PID() const {
+    assert(get_packet_type() == PT_BODY);
+    ui32 result = (((ui32)data[17]) & 0xF) << 16;
+    result |= ((ui32)data[18]) << 8;
+    result |= ((ui32)data[19]);
+    return result;
+  }
+
+private:
+  // reads a 32-bit big-endian (network order) value starting at data[pos]
+  ui32 read_be32(ui32 pos) const {
+    return (((ui32)data[pos]) << 24) | (((ui32)data[pos + 1]) << 16)
+         | (((ui32)data[pos + 2]) << 8) | ((ui32)data[pos + 3]);
+  }
+
+public:
+  static constexpr int max_size = 2048; //!<maximum packet size
+                                        // ethernet packet are only 1500
+  ui8 data[max_size];                   //!<data in the packet
+  ui32 num_bytes;                       //!<number of bytes
+  rtp_packet* next;                     //!<used for linking packets
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//
+//
+///////////////////////////////////////////////////////////////////////////////
+
+/*****************************************************************************/
+/** @brief Interprets new packets, buffers them if needed.
+ * 
+ *  This object primarily attempts to process the RTP packet.
+ *  The main purpose is to buffer received packets if it is not clear where
+ *  they fit. It also drops packets if they become old.
+ * 
+ *  This object basically works as follows.
+ *  The object buffers out-of-order packets, i.e., those with a sequence 
+ *  number higher than expected.  Then, the object tries to push these 
+ *  packets when their sequence number comes.  Packets are pushed to the 
+ *  frames_handler, using the "push" member function.
+ * 
+ *  The buffer has limited size, when it becomes full, the oldest packet is 
+ *  pushed; this basically means that all missing packets are considered
+ *  lost.
+ *  
+ *  When a new packet is pushed, the object looks if it has the next packet
+ *  in its buffer, if so, then it pushes one more packet.  It does not 
+ *  attempt to push more than one packet from its buffer, because this
+ *  might delay picking up the next packet from the operating system network
+ *  stack.
+ * 
+ *  Packets in the buffer are arranged according to their sequence number.
+ *  
+ */
+class packets_handler
+{
+public:
+  /**
+   *  @brief default constructor
+   *
+   *  Leaves the object empty: no packet chain, no frames_handler, and all
+   *  counters at zero.
+   */
+  packets_handler()
+  : quiet(false), avail(NULL), in_use(NULL), last_seq_num(0),
+    lost_packets(0), frames(NULL), num_packets(0), packet_store(NULL)
+  { }
+
+  /**
+   *  @brief destructor; releases the packet memory, if any
+   */
+  ~packets_handler()
+  { delete[] packet_store; }  // delete[] on a null pointer does nothing
+
+public:
+  /**
+   *  @brief call this to initialize packets_handler
+   *
+   *  This function creates a chain of packets that is used for packet
+   *  re-ordering.
+   *
+   *  @param quiet no messages are printed when true -- as of this writing
+   *         the object prints no messages
+   *  @param num_packets the number of packets in the chain
+   *  @param frames a pointer to the frames_handler object that will
+   *         receive the packets
+   */
+  void init(bool quiet, ui32 num_packets, frames_handler* frames);
+
+  /**
+   *  @brief Call this function to get a packet from the packet chain.
+   *
+   *  This function is an input-output function.  The first call to this
+   *  function passes a null pointer and gets a pointer to use.  Subsequent
+   *  calls pass the pointer that was obtained earlier to get a new pointer.
+   *  This function supplies one pointer only.
+   *
+   *  @param  p a pointer to a packet that was previously obtained by calling
+   *          this function.
+   *  @return returns a pointer to a packet
+   */
+  rtp_packet* exchange(rtp_packet* p);
+
+  /**
+   *  @brief This function provides information about the observed number
+   *         of lost packets
+   *
+   *  @return returns number of lost packets up to the time of the call
+   */
+  ui32 get_num_lost_packets() const { return lost_packets; }
+
+  /**
+   *  @brief This function is not used, and therefore it is not clear how to
+   *         use it.
+   */
+  void flush();
+
+private:
+  /**
+   *  @brief This function sends the packet in in_use (oldest) to the
+   *         frames handler object.
+   *
+   */
+  void consume_packet();
+
+private:
+  bool quiet;                //!<no informational info is printed when true
+  rtp_packet* avail;         //!<start of available packets chain
+  rtp_packet* in_use;        //!<start of used packet chain
+  ui32 last_seq_num;         //!<the last observed sequence number
+  ui32 lost_packets;         //!<number of lost packets -- just statistics
+  frames_handler* frames;    //!<frames object
+
+  ui32 num_packets;          //!<maximum number of packets in packet_store
+  rtp_packet* packet_store;  //!<address of packet memory allocation
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//
+//
+///////////////////////////////////////////////////////////////////////////////
+
+/*****************************************************************************/
+/** @brief holds in memory j2k codestream together with other info
+ *
+ *  This object holds a j2k codestream file.  The codestream is identified
+ *  by its timestamp.  Once complete, the file is pushed to the saver.
+ *
+ *  File chains can be created using the \"next\" member variable.
+ *
+ *  This object is handled by frames_handler, and therefore, it does not
+ *  have many functions.  stex_file does not create any objects of its own.
+ *
+ *  The object also serves to pass information to the j2k_frame_storer,
+ *  which is run by another thread.
+ *
+ */
+struct stex_file {
+public:
+  /**
+   *  @brief default constructor
+   *
+   *  All numeric members start at zero (including the atomic \"done\"
+   *  counter) and all pointers start as NULL.  The in-memory file f is
+   *  default-constructed.  Call init() afterwards to set parent, next,
+   *  storer, and name_template.
+   */
+  stex_file()
+  : time_stamp(0), last_seen_seq(0), done(0), frame_idx(0), parent(NULL),
+    name_template(NULL), storer(NULL), next(NULL)
+  { }
+
+public:
+  /**
+   *  @brief call this function to initialize stex_file
+   *
+   *  It just copies parameters to the object; the in-memory file f is
+   *  not touched.
+   *
+   *  @param parent is a pointer to the object holding this file, which is
+   *         frames_handler
+   *  @param next is used to chain files
+   *  @param storer this object is used to store j2k codestreams
+   *  @param name_template file name template to use for storing files
+   */
+  void init(frames_handler* parent, stex_file* next, j2k_frame_storer *storer,
+            const char *name_template)
+  {
+    this->next = next;
+    this->parent = parent;
+    this->storer = storer;
+    this->name_template = name_template;
+  }
+
+  /**
+   *  @brief other threads can call this function to signal completion of
+   *         processing.
+   *
+   *  This function basically reduces \"done\", and when 0 is reached
+   *  the function will let the parent know that there is a stex_file
+   *  waiting removal.
+   */
+  void notify_file_completion();
+
+public:
+  ojph::mem_outfile f;    //!<holds in-memory j2k codestream
+  ui32 time_stamp;        //!<time stamp at which this file must be displayed
+  ui32 last_seen_seq;     //!<the last seen RTP sequence number
+  std::atomic_int done;   //!<saving is completed when 0 is reached
+  ui32 frame_idx;         //!<frame number in the sequence
+  frames_handler* parent; //!<the object holding this frame
+
+  const char *name_template; //!<name template for saved files
+  j2k_frame_storer* storer;  //!<stores a j2k frame using another thread
+
+  stex_file* next;        //!<used to create files chain
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//
+//
+///////////////////////////////////////////////////////////////////////////////
+
+/*****************************************************************************/
+/** @brief assembles frames from RTP packets and hands them over for storing
+ *
+ *  Assumes packets arrive in order.
+ *
+ *  Each frame lives in a stex_file, which is kept in one of three places:
+ *  in_use (the frame being filled), processing (frames being saved), or
+ *  avail (free frame structures).
+ *
+ */
+class frames_handler
+{
+public:
+  /**
+   *  @brief default construction
+   *
+   *  All counters are zeroed and all pointers are set to NULL; init()
+   *  performs the actual setup.
+   */
+  frames_handler()
+  : quiet(false), num_threads(0), target_name(NULL), num_files(0),
+    last_seq_number(0), last_time_stamp(0), total_frames(0),
+    trunc_frames(0), lost_frames(0), files_store(NULL), in_use(NULL),
+    avail(NULL), processing(NULL), num_complete_files(0),
+    thread_pool(NULL), storers_store(NULL)
+  { }
+
+  /**
+   *  @brief default destructor
+   */
+  ~frames_handler();
+
+public:
+  /**
+   *  @brief call this function to initialize this object
+   *
+   *  @param quiet when true, no messages are printed -- as of this writing
+   *         the object prints no messages
+   *  @param target_name a template for the saved file names
+   *  @param thread_pool a thread pool for processing j2k codestreams
+   *         (saving)
+   *
+   */
+  void init(bool quiet, const char *target_name,
+            thds::thread_pool* thread_pool);
+
+  /**
+   *  @brief call this function to push rtp_packets to this object
+   *
+   *  Packets received by this object have to be sequentially increasing;
+   *  older packets are ignored.  That is, a packet with a sequence number
+   *  smaller than the last observed sequence number is ignored.
+   *
+   *  @param p a pointer to the packet being pushed.
+   */
+  void push(rtp_packet* p);
+
+  /**
+   *  @brief call this function to collect statistics about frames
+   *
+   *  The function just copies collected statistics
+   *
+   *  @param total_frames returns the number of observed total frames
+   *  @param trunc_frames returns the number of truncated frames
+   *  @param lost_frames returns the number of lost frames -- for which the
+   *                     main header payload packet was not received, but
+   *                     time stamp was observed
+   */
+  void get_stats(ui32& total_frames, ui32& trunc_frames, ui32& lost_frames);
+
+  /**
+   *  @brief This function is not used, and therefore it is not clear how to
+   *         use it.
+   */
+  bool flush();
+
+  /**
+   *  @brief other threads call this function to let frames_handler know that
+   *         processing is done.
+   *
+   *  This function basically increments the number of objects that need to
+   *  be moved from processing to avail.
+   *
+   */
+  void increment_num_complete_files()
+  { num_complete_files.fetch_add(1, std::memory_order_release); }
+
+private:
+  /**
+   *  @brief call this function to process stex_file objects for which
+   *         processing is complete
+   *
+   *  This function moves stex_file from processing to avail if storing
+   *  is complete.
+   *
+   */
+  void check_files_in_processing();
+
+  /**
+   *  @brief Handles complete/truncated files and sends them for storing
+   *
+   *  This function moves stex_file from in_use to processing if there are
+   *  further processors (such as a storer) or to avail if there are no
+   *  processors.
+   */
+  void send_to_processing();
+
+private:
+  bool quiet;               //!<no informational info is printed when true
+  ui32 num_threads;         //!<number of threads used for saving
+  const char *target_name;  //!<target file name template
+  ui32 num_files;           //!<maximum number of in-flight files.
+  ui32 last_seq_number;     //!<last observed sequence number
+  ui32 last_time_stamp;     //!<last observed time stamp
+  ui32 total_frames;        //!<total number of frames that were observed
+  ui32 trunc_frames;        //!<truncated frames (because of a lost packet)
+  ui32 lost_frames;         //!<frames for which main header was not received
+  stex_file* files_store;   //!<address for allocated files
+  stex_file* in_use;        //!<the frame that is being filled with data
+  stex_file* avail;         //!<available frames structures
+  stex_file* processing;    //!<frames that are being saved
+  std::atomic_int32_t
+    num_complete_files;     //!<num. of files for which processing is complete
+  thds::thread_pool*
+    thread_pool;            //!<thread pool for processing frames
+  j2k_frame_storer*
+    storers_store;          //!<address for allocated frame storers
+};
+
+} // !stex namespace
+} // !ojph namespace
+
+#endif //!OJPH_STR_EX_SUPPORT_H
\ No newline at end of file
diff --git a/src/apps/ojph_stream_expand/threaded_frame_processors.cpp b/src/apps/ojph_stream_expand/threaded_frame_processors.cpp
new file mode 100644
index 00000000..ae1aa222
--- /dev/null
+++ b/src/apps/ojph_stream_expand/threaded_frame_processors.cpp
@@ -0,0 +1,64 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2024, Aous Naman
+// Copyright (c) 2024, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2024, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: threaded_frame_processors.cpp
+// Author: Aous Naman
+// Date: 23 April 2024
+//***************************************************************************/
+
+#include "threaded_frame_processors.h"
+
+namespace ojph
+{
+namespace stex
+{  
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//
+//
+///////////////////////////////////////////////////////////////////////////////
+
+void j2k_frame_storer::execute()
+{
+  // append the extension to the template, then expand the frame index
+  char fmt[128], path[128];
+  snprintf(fmt, sizeof(fmt), "%s.j2c", file->name_template);
+  snprintf(path, sizeof(path), fmt, file->frame_idx);
+  file->f.write_to_file(path);
+  file->notify_file_completion();  // signal that processing is complete
+}
+
+} // !stex namespace
+} // !ojph namespace
\ No newline at end of file
diff --git a/src/apps/ojph_stream_expand/threaded_frame_processors.h b/src/apps/ojph_stream_expand/threaded_frame_processors.h
new file mode 100644
index 00000000..c6c3a582
--- /dev/null
+++ b/src/apps/ojph_stream_expand/threaded_frame_processors.h
@@ -0,0 +1,107 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2024, Aous Naman
+// Copyright (c) 2024, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2024, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: threaded_frame_processors.h
+// Author: Aous Naman
+// Date: 23 April 2024
+//***************************************************************************/
+
+#ifndef THREADED_FRAME_PROCESSOR_H
+#define THREADED_FRAME_PROCESSOR_H
+
+#include "ojph_threads.h"
+#include "stream_expand_support.h"
+
+namespace ojph
+{
+  namespace thds 
+  { class thread_pool; }
+
+namespace stex
+{
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//
+//
+///////////////////////////////////////////////////////////////////////////////
+
+/*****************************************************************************/
+/** @brief Saves a j2k frame to disk without decoding.
+ *
+ *  This is a task object: a thread from the thread pool runs execute(),
+ *  which writes the codestream held by the associated stex_file to a file.
+ */
+struct j2k_frame_storer : public thds::worker_thread_base
+{
+public:
+  /**
+   * @brief default construction; no file or name template is attached yet
+   */
+  j2k_frame_storer() : file(NULL), name_template(NULL) { }
+
+  /**
+   * @brief default destructor doing nothing
+   */
+  ~j2k_frame_storer() override = default;
+
+public:
+  /**
+   *  @brief call this function to initialize its members
+   *
+   *  @param file is a stex_file holding the j2k codestream with other
+   *         variables.
+   *  @param name_template holds a filename template
+   */
+  void init(stex_file* file, const char* name_template)
+  {
+    this->name_template = name_template;
+    this->file = file;
+  }
+
+  /**
+   * @brief A thread from the thread_pool calls this function to execute
+   *        the task
+   */
+  void execute() override;
+
+private:
+  stex_file* file;            //!<a j2k codestream file with other variables
+  const char* name_template;  //!<a template for the target file name
+};
+
+} // !stex namespace
+} // !ojph namespace
+
+#endif // !THREADED_FRAME_PROCESSOR_H
\ No newline at end of file
diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp
index a83e5fc1..a8c356bb 100644
--- a/src/apps/others/ojph_img_io.cpp
+++ b/src/apps/others/ojph_img_io.cpp
@@ -247,7 +247,7 @@ namespace ojph {
     assert(fh == 0);
     fh = fopen(filename, "rb");
     if (fh == 0)
-      OJPH_ERROR(0x030000001, "Unable to open file %s", filename);
+      OJPH_ERROR(0x03000001, "Unable to open file %s", filename);
     fname = filename;
 
     // read magic number
@@ -255,28 +255,28 @@ namespace ojph {
     if (fread(t, 1, 2, fh) != 2)
     {
       close();
-      OJPH_ERROR(0x030000002, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000002, "Error reading file %s", filename);
     }
 
     // check magic number
     if (t[0] != 'P' || (t[1] != '5' && t[1] != '6'))
     {
       close();
-      OJPH_ERROR(0x030000003, "unknown file type for file %s", filename);
+      OJPH_ERROR(0x03000003, "unknown file type for file %s", filename);
     }
 
     size_t len = strlen(filename);
     if (t[1] == '5' && strncmp(filename + len - 4, ".pgm", 4) != 0)
     {
       close();
-      OJPH_ERROR(0x030000004, "wrong file extension, a file with "
+      OJPH_ERROR(0x03000004, "wrong file extension, a file with "
         "keyword P5 must have a .pgm extension for file %s", filename);
     }
     if (t[1] == '6' && strncmp(filename + len - 4, ".ppm", 4) != 0)
     {
       close();
-      OJPH_ERROR(0x030000005, "wrong file extension, a file with keyword P6 "
-        "must have a .ppm extension fir file %s", filename);
+      OJPH_ERROR(0x03000005, "wrong file extension, a file with keyword P6 "
+        "must have a .ppm extension for file %s", filename);
     }
 
     // set number of components based on file-type
@@ -287,7 +287,7 @@ namespace ojph {
     if (fscanf(fh, "%d %d %d", &width, &height, &max_val) != 3)
     {
       close();
-      OJPH_ERROR(0x030000006, "error in file format for file %s", filename);
+      OJPH_ERROR(0x03000006, "error in file format for file %s", filename);
     }
     num_ele_per_line = num_comps * width;
     bytes_per_sample = max_val > 255 ? 2 : 1;
@@ -309,7 +309,7 @@ namespace ojph {
           temp_buf = malloc(temp_buf_byte_size);
         if (temp_buf == NULL) { // failed to allocate memory
           if (t) free(t); // the original buffer is still valid
-          OJPH_ERROR(0x030000007, "error allocating mmeory");
+          OJPH_ERROR(0x03000007, "error allocating memory");
         }
       }
       else
@@ -329,9 +329,9 @@ namespace ojph {
       return;
       
     if (bytes_per_sample == 1)
-      temp_buf = alloc_p->post_alloc_data<ui8>(num_comps * width, 0);
+      temp_buf = alloc_p->post_alloc_data<ui8>(num_comps * (size_t)width, 0);
     else
-      temp_buf = alloc_p->post_alloc_data<ui16>(num_comps * width, 0);
+      temp_buf = alloc_p->post_alloc_data<ui16>(num_comps * (size_t)width, 0);
   }
 
   /////////////////////////////////////////////////////////////////////////////
@@ -347,7 +347,7 @@ namespace ojph {
       if (result != num_ele_per_line)
       {
         close();
-        OJPH_ERROR(0x030000011, "not enough data in file %s", fname);
+        OJPH_ERROR(0x03000011, "not enough data in file %s", fname);
       }
       if (++cur_line >= height)
       {
@@ -394,21 +394,21 @@ namespace ojph {
         if (strncmp(".ppm", filename + len - 4, 4) == 0)
         {
           filename[len - 2] = 'g'; 
-          OJPH_WARN(0x03000001, "file was renamed %s\n", filename);
+          OJPH_WARN(0x03000021, "file was renamed %s\n", filename);
         }
         if (strncmp(".PPM", filename + len - 4, 4) == 0)
         {
           filename[len - 2] = 'G';
-          OJPH_WARN(0x03000002, "file was renamed %s\n", filename);
+          OJPH_WARN(0x03000022, "file was renamed %s\n", filename);
         }
       }
       fh = fopen(filename, "wb");
       if (fh == NULL)
-        OJPH_ERROR(0x030000021,
+        OJPH_ERROR(0x03000023,
           "unable to open file %s for writing", filename);
 
       fprintf(fh, "P5\n%d %d\n%d\n", width, height, (1 << bit_depth) - 1);
-      buffer_size = width * bytes_per_sample;
+      buffer_size = (size_t)width * bytes_per_sample;
       buffer = (ui8*)malloc(buffer_size);
     }
     else
@@ -419,23 +419,23 @@ namespace ojph {
         if (strncmp(".pgm", filename + len - 4, 4) == 0)
         {
           filename[len - 2] = 'p';
-          OJPH_WARN(0x03000003, "file was renamed %s\n", filename);
+          OJPH_WARN(0x03000024, "file was renamed %s\n", filename);
         }
         if (strncmp(".PGM", filename + len - 4, 4) == 0)
         {
           filename[len - 2] = 'P';
-          OJPH_WARN(0x03000004, "file was renamed %s\n", filename);
+          OJPH_WARN(0x03000025, "file was renamed %s\n", filename);
         }
       }
       fh = fopen(filename, "wb");
       if (fh == NULL)
-        OJPH_ERROR(0x030000022,
+        OJPH_ERROR(0x03000026,
           "unable to open file %s for writing", filename);
       int result = //the number of written characters
         fprintf(fh, "P6\n%d %d\n%d\n", width, height, (1 << bit_depth) - 1);
       if (result == 0)
-        OJPH_ERROR(0x030000023, "error writing to file %s", filename);
-      buffer_size = width * num_components * bytes_per_sample;
+        OJPH_ERROR(0x03000027, "error writing to file %s", filename);
+      buffer_size = (size_t)width * num_components * (size_t)bytes_per_sample;
       buffer = (ui8*)malloc(buffer_size);
     }
     fname = filename;
@@ -448,7 +448,7 @@ namespace ojph {
   {
     assert(fh == NULL); //configure before opening
     if (num_components != 1 && num_components != 3)
-      OJPH_ERROR(0x030000031,
+      OJPH_ERROR(0x03000031,
         "ppm supports 3 colour components, while pgm supports 1");
     this->width = width;
     this->height = height;
@@ -458,6 +458,8 @@ namespace ojph {
     samples_per_line = num_components * width;
     bytes_per_line = bytes_per_sample * samples_per_line;
     
+#if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
+
     if (bytes_per_sample == 1) {
       if (num_components == 1) 
         converter = gen_cvrt_32b1c_to_8ub1c;
@@ -471,39 +473,66 @@ namespace ojph {
         converter = gen_cvrt_32b3c_to_16ub3c_be;
     }
 
-#ifndef OJPH_DISABLE_INTEL_SIMD
+  #ifndef OJPH_DISABLE_SIMD
 
-    if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE41) {
-      if (bytes_per_sample == 1) {
-        if (num_components == 1) 
-          converter = sse41_cvrt_32b1c_to_8ub1c;
-        else
-          converter = sse41_cvrt_32b3c_to_8ub3c;
-      }
-      else {
-        if (num_components == 1) 
-          converter = sse41_cvrt_32b1c_to_16ub1c_be;
-        else
-          converter = sse41_cvrt_32b3c_to_16ub3c_be;
-      }
-    }
+    #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
 
-    if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) {
-      if (bytes_per_sample == 1) {
-        if (num_components == 1) 
-          converter = avx2_cvrt_32b1c_to_8ub1c;
-        else
-          converter = avx2_cvrt_32b3c_to_8ub3c;
-      }
-      else {
-        if (num_components == 1) 
-          converter = avx2_cvrt_32b1c_to_16ub1c_be;
-        else
-          { } // did not find an implementation better than sse41
-      }
-    }
+      #ifndef OJPH_DISABLE_SSE4
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE41) {
+          if (bytes_per_sample == 1) {
+            if (num_components == 1) 
+              converter = sse41_cvrt_32b1c_to_8ub1c;
+            else
+              converter = sse41_cvrt_32b3c_to_8ub3c;
+          }
+          else {
+            if (num_components == 1) 
+              converter = sse41_cvrt_32b1c_to_16ub1c_be;
+            else
+              converter = sse41_cvrt_32b3c_to_16ub3c_be;
+          }
+        }
+      #endif // !OJPH_DISABLE_SSE4
+
+      #ifndef OJPH_DISABLE_AVX2
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) {
+          if (bytes_per_sample == 1) {
+            if (num_components == 1) 
+              converter = avx2_cvrt_32b1c_to_8ub1c;
+            else
+              converter = avx2_cvrt_32b3c_to_8ub3c;
+          }
+          else {
+            if (num_components == 1) 
+              converter = avx2_cvrt_32b1c_to_16ub1c_be;
+            else
+              { } // did not find an implementation better than sse41
+          }
+        }
+      #endif // !OJPH_DISABLE_AVX2
+
+    #elif defined(OJPH_ARCH_ARM)
+
+    #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
+
+  #endif // !OJPH_DISABLE_SIMD
 
-#endif
+#else // OJPH_ENABLE_WASM_SIMD
+
+    if (bytes_per_sample == 1) {
+      if (num_components == 1) 
+        converter = sse41_cvrt_32b1c_to_8ub1c;
+      else
+        converter = sse41_cvrt_32b3c_to_8ub3c;
+    }
+    else {
+      if (num_components == 1) 
+        converter = sse41_cvrt_32b1c_to_16ub1c_be;
+      else
+        converter = sse41_cvrt_32b3c_to_16ub3c_be;
+    }
+  
+#endif // !OJPH_ENABLE_WASM_SIMD
   }
 
   ////////////////////////////////////////////////////////////////////////////
@@ -520,12 +549,257 @@ namespace ojph {
       size_t result = fwrite(buffer,
                               bytes_per_sample, samples_per_line, fh);
       if (result != samples_per_line)
-        OJPH_ERROR(0x030000042, "error writing to file %s", fname);
+        OJPH_ERROR(0x03000041, "error writing to file %s", fname);
     }
     return 0;
   }
 
   ////////////////////////////////////////////////////////////////////////////
+  //
+  //
+  //
+  //
+  //
+  ////////////////////////////////////////////////////////////////////////////
+
+  /////////////////////////////////////////////////////////////////////////////
+  void pfm_in::open(const char *filename)
+  { // opens a PFM file, parses its text header, and sizes the line buffer
+    assert(fh == 0);
+    fh = fopen(filename, "rb");
+    if (fh == 0)
+      OJPH_ERROR(0x03000051, "Unable to open file %s", filename);
+    fname = filename;
+
+    // read magic number
+    char t[2];
+    if (fread(t, 1, 2, fh) != 2)
+    {
+      close();
+      OJPH_ERROR(0x03000052, "Error reading file %s", filename);
+    }
+
+    // check magic number; it must be "PF" or "Pf"
+    if (t[0] != 'P' || (t[1] != 'F' && t[1] != 'f'))
+    {
+      close();
+      OJPH_ERROR(0x03000053, "Unknown file type for file %s", filename);
+    }
+
+    // set number of components based on file-type; "Pf" is 1, "PF" is 3
+    num_comps = t[1] == 'f' ? 1 : 3;
+    eat_white_spaces(fh);
+
+    // read width and height from the header
+    if (fscanf(fh, "%d %d", &width, &height) != 2)
+    {
+      close();
+      OJPH_ERROR(0x03000054, 
+        "Error reading width and height in file %s", filename);
+    }
+    eat_white_spaces(fh);
+
+    // read the scale; a negative value marks the data as little-endian
+    if (fscanf(fh, "%f", &scale) != 1)
+    {
+      close();
+      OJPH_ERROR(0x03000055, "Error reading scale in file %s", filename);
+    }
+    little_endian = scale < 0.0f;
+    scale = std::abs(scale);
+
+    fgetc(fh); // skip the single character that follows the scale
+    start_of_data = ojph_ftell(fh);
+
+    // make sure the buffer can hold one line of all interleaved components
+    if (temp_buf_byte_size < num_comps * (size_t)width * sizeof(float))
+    {
+      if (alloc_p == NULL)
+      {
+        temp_buf_byte_size = num_comps * (size_t)width * sizeof(float);
+        void* prev = temp_buf; // kept so it can be freed if realloc fails
+        if (temp_buf)
+          temp_buf = (float*)realloc(temp_buf, temp_buf_byte_size);
+        else
+          temp_buf = (float*)malloc(temp_buf_byte_size);
+        if (temp_buf == NULL) { // failed to allocate memory
+          if (prev) free(prev); // the original buffer is still valid
+          OJPH_ERROR(0x03000056, "Error allocating memory");
+        }
+      }
+      else
+      {
+        assert(temp_buf_byte_size == 0); //cannot reallocate the buffer
+        temp_buf_byte_size = num_comps * (size_t)width * sizeof(float);
+        alloc_p->pre_alloc_data<float>(num_comps * (size_t)width, 0); // floats
+      }
+    }
+    cur_line = 0;
+  }
+
+  /////////////////////////////////////////////////////////////////////////////
+  void pfm_in::finalize_alloc()
+  { // claims the line buffer from alloc_p; a no-op when alloc_p is not set
+    const size_t num_floats = num_comps * (size_t)width;
+    if (alloc_p != NULL)
+      temp_buf = alloc_p->post_alloc_data<float>(num_floats, 0);
+  }
+
+  /////////////////////////////////////////////////////////////////////////////
+  ui32 pfm_in::read(const line_buf* line, ui32 comp_num)
+  { // fills line with component comp_num of the current row; returns width
+    assert(temp_buf_byte_size != 0 );
+    assert(fh != 0 && comp_num < num_comps);
+    assert(line->size >= width);
+
+    // the file is only read for component 0; a whole row of interleaved
+    // samples is loaded into temp_buf and serves the other components too
+    if (comp_num == 0)
+    {
+      const size_t row_samples = (size_t)num_comps * (size_t)width;
+      // row cur_line is located (height - 1 - cur_line) rows past the header
+      si64 loc = start_of_data;
+      loc += (size_t)(height-1 - cur_line) * row_samples * sizeof(float);
+      if (ojph_fseek(fh, loc, SEEK_SET) != 0)
+      {
+        close();
+        OJPH_ERROR(0x03000061, "Error seeking in file %s", fname);
+      }
+      if (fread(temp_buf, sizeof(float), row_samples, fh) != row_samples)
+      {
+        close();
+        OJPH_ERROR(0x03000062, "Not enough data in file %s", fname);
+      }
+      if (++cur_line >= height)
+        cur_line = 0; // start over once every row has been delivered
+    }
+
+    union { // lets a sample be accessed as raw bits or as a float
+      si32* s;
+      ui32* u;
+      float* f;
+    } src, dst;
+
+    const ui32 shift = 32 - bit_depth[comp_num];
+    src.f = temp_buf + comp_num; // samples of a row are interleaved
+    dst.f = line->f32;
+    ui32 remaining = width;
+    if (little_endian && shift)
+    { // bits are taken as stored, then shifted down as a signed value
+      for (; remaining > 0; --remaining, src.f += num_comps)
+      {
+        si32 bits = *src.s;
+        *dst.s++ = bits >> shift;
+      }
+    }
+    else if (little_endian)
+    { // all 32 bits are kept; plain float copy
+      for (; remaining > 0; --remaining, src.f += num_comps)
+        *dst.f++ = *src.f;
+    }
+    else if (shift)
+    { // converted with be2le first, then shifted down as a signed value
+      for (; remaining > 0; --remaining, src.f += num_comps)
+      {
+        ui32 swapped = be2le(*src.u);
+        *dst.s++ = *(si32*)&swapped >> shift;
+      }
+    }
+    else
+    { // all 32 bits are kept; only the be2le conversion is applied
+      for (; remaining > 0; --remaining, src.f += num_comps)
+        *dst.u++ = be2le(*src.u);
+    }
+
+    return width;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  //
+  //
+  //
+  //
+  ////////////////////////////////////////////////////////////////////////////
+
+  ////////////////////////////////////////////////////////////////////////////
+  void pfm_out::open(char* filename)
+  { // creates the file, writes the PFM text header, allocates a line buffer
+    assert(fh == NULL && buffer == NULL);
+    fh = fopen(filename, "wb");
+    if (fh == NULL)
+      OJPH_ERROR(0x03000071,
+        "Unable to open file %s for writing", filename);
+    int result = // characters written, or a negative value on failure
+      fprintf(fh, "P%c\n%d %d\n%f\n", 
+        num_components > 1 ? 'F' : 'f', width, height, scale);
+    if (result <= 0) // fprintf reports errors as negative values, never 0
+      OJPH_ERROR(0x03000072, "error writing to file %s", filename);
+    buffer_size = (size_t)width * num_components * sizeof(float);
+    buffer = (float*)malloc(buffer_size); // NOTE(review): result unchecked
+    fname = filename;
+    cur_line = 0;
+    start_of_data = ojph_ftell(fh); // sample data begins after the header
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  void pfm_out::configure(ui32 width, ui32 height, ui32 num_components, 
+                          float scale, ui32* bit_depth)
+  { // stores the image geometry, the scale, and one bit depth per component
+    assert(fh == NULL); // has to be called before open()
+    if (!(num_components == 1 || num_components == 3))
+      OJPH_ERROR(0x03000081,
+        "pfm supports 1 or 3 colour components, not %d", num_components);
+    this->num_components = num_components;
+    this->height = height;
+    this->width = width;
+    this->scale = (scale < 0.0f) ? scale : -scale; // kept non-positive
+    for (ui32 c = 0; c != num_components; ++c)
+      this->bit_depth[c] = bit_depth[c];
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  ui32 pfm_out::write(const line_buf* line, ui32 comp_num)
+  { // stages one component of a row in buffer; the row is written to the
+    // file once component num_components - 1 has been staged; returns 0
+    assert(fh);
+
+    union { // lets a sample be accessed as raw bits or as a float
+      ui32* u;
+      float* f;
+    } src, dst;
+    src.f = line->f32;
+    dst.f = buffer + comp_num; // components are interleaved in buffer
+
+    const ui32 shift = 32 - bit_depth[comp_num];
+    ui32 remaining = width;
+    if (shift == 0)
+      for (; remaining > 0; --remaining, dst.f += num_components)
+        *dst.f = *src.f++;
+    else
+      for (; remaining > 0; --remaining, dst.f += num_components, ++src.f)
+      {
+        ui32 bits = *src.u;
+        *dst.u = bits << shift;
+      }
+
+    if (comp_num == num_components - 1)
+    {
+      const size_t row_samples = num_components * (size_t)width;
+      // row cur_line is placed (height - 1 - cur_line) rows past the header
+      si64 loc = start_of_data;
+      loc += (height - 1 - cur_line)* row_samples * sizeof(float);
+      if (ojph_fseek(fh, loc, SEEK_SET) != 0)
+        OJPH_ERROR(0x03000082, "Error seeking in file %s", fname);
+      if (fwrite(buffer, sizeof(float), row_samples, fh) != row_samples)
+        OJPH_ERROR(0x03000083, "error writing to file %s", fname);
+      ++cur_line;
+    }
+
+    return 0;
+  }
+
+ ////////////////////////////////////////////////////////////////////////////
  //
  //
  //
@@ -538,7 +812,7 @@ namespace ojph {
   {
     tiff_handle = NULL;
     if ((tiff_handle = TIFFOpen(filename, "r")) == NULL)
-      OJPH_ERROR(0x0300000B1, "Unable to open file %s", filename);
+      OJPH_ERROR(0x03000091, "Unable to open file %s", filename);
     fname = filename;
 
     ui32 tiff_width = 0;
@@ -578,7 +852,7 @@ namespace ojph {
     // allocate linebuffer to hold a line of image data
     line_buffer = malloc(bytes_per_line);
     if (NULL == line_buffer)
-      OJPH_ERROR(0x0300000B2, "Unable to allocate %d bytes for line_buffer[] "
+      OJPH_ERROR(0x03000092, "Unable to allocate %d bytes for line_buffer[] "
         "for file %s", bytes_per_line, filename);
       
     cur_line = 0;
@@ -586,7 +860,7 @@ namespace ojph {
     // Error on known incompatilbe input formats
     if( tiff_bits_per_sample != 8 && tiff_bits_per_sample != 16 )
     {
-      OJPH_ERROR(0x0300000B3, "\nTIFF IO is currently limited to file limited"
+      OJPH_ERROR(0x03000093, "\nTIFF IO is currently limited"
         " to files with TIFFTAG_BITSPERSAMPLE=8 and TIFFTAG_BITSPERSAMPLE=16 \n"
         "input file = %s has TIFFTAG_BITSPERSAMPLE=%d", 
         filename, tiff_bits_per_sample);
@@ -594,14 +868,14 @@ namespace ojph {
 
     if( TIFFIsTiled( tiff_handle ) )
     {
-      OJPH_ERROR(0x0300000B4, "\nTIFF IO is currently limited to TIF files "
+      OJPH_ERROR(0x03000094, "\nTIFF IO is currently limited to TIF files "
         "without tiles. \nInput file %s has been detected as tiled", filename);
     }
 
     if(PHOTOMETRIC_RGB != tiff_photometric && 
        PHOTOMETRIC_MINISBLACK != tiff_photometric )
     {
-      OJPH_ERROR(0x0300000B5, "\nTIFF IO is currently limited to "
+      OJPH_ERROR(0x03000095, "\nTIFF IO is currently limited to "
         "TIFFTAG_PHOTOMETRIC=PHOTOMETRIC_MINISBLACK=%d and "
         "PHOTOMETRIC_RGB=%d. \nInput file %s has been detected "
         "TIFFTAG_PHOTOMETRIC=%d", 
@@ -610,7 +884,7 @@ namespace ojph {
 
     if( tiff_samples_per_pixel > 4 )
     {
-      OJPH_ERROR(0x0300000B6, "\nTIFF IO is currently limited to "
+      OJPH_ERROR(0x03000096, "\nTIFF IO is currently limited to "
         "TIFFTAG_SAMPLESPERPIXEL=4 \nInput file %s has been detected with "
         "TIFFTAG_SAMPLESPERPIXEL=%d",
         filename, tiff_samples_per_pixel);
@@ -632,7 +906,7 @@ namespace ojph {
       line_buffer_for_planar_support_uint8 = 
         (uint8_t*)calloc(width, sizeof(uint8_t));
       if (NULL == line_buffer_for_planar_support_uint8)
-        OJPH_ERROR(0x0300000B7, "Unable to allocate %d bytes for "
+        OJPH_ERROR(0x03000097, "Unable to allocate %d bytes for "
           "line_buffer_for_planar_support_uint8[] for file %s", 
           width * sizeof(uint8_t), filename);
     }
@@ -642,7 +916,7 @@ namespace ojph {
       line_buffer_for_planar_support_uint16 = 
         (uint16_t*)calloc(width, sizeof(uint16_t));
       if (NULL == line_buffer_for_planar_support_uint16)
-        OJPH_ERROR(0x0300000B8, "Unable to allocate %d bytes for "
+        OJPH_ERROR(0x03000098, "Unable to allocate %d bytes for "
           "line_buffer_for_planar_support_uint16[] for file %s", 
           width * sizeof(uint16_t), filename);
     }
@@ -654,7 +928,7 @@ namespace ojph {
   void tif_in::set_bit_depth(ui32 num_bit_depths, ui32* bit_depth)
   {
     if (num_bit_depths < 1)
-      OJPH_ERROR(0x030000B9, "one or more bit_depths must be provided");
+      OJPH_ERROR(0x030000A1, "one or more bit_depths must be provided");
     ui32 last_bd_idx = 0;
     for (ui32 i = 0; i < 4; ++i)
     {
@@ -663,7 +937,7 @@ namespace ojph {
 
       if (bd > 32 || bd < 1)
       {
-        OJPH_ERROR(0x0300000BA, 
+        OJPH_ERROR(0x030000A2, 
           "bit_depth = %d, this must be an integer from 1-32", bd);
       }
       this->bit_depth[i] = bd;
@@ -680,12 +954,12 @@ namespace ojph {
     // the first time trying to access this line
     if (PLANARCONFIG_SEPARATE == planar_configuration && 0 == comp_num )
     {
-      for (unsigned short color = 0; color < num_comps; color++)
+      for (ui32 color = 0; color < num_comps; color++)
       {
         if (bytes_per_sample == 1)
         {
           TIFFReadScanline(tiff_handle, line_buffer_for_planar_support_uint8, 
-            cur_line, color);
+            cur_line, (ui16)color);
           ui32 x = color;
           uint8_t* line_buffer_of_interleaved_components = 
             (uint8_t*)line_buffer;
@@ -698,7 +972,7 @@ namespace ojph {
         else if (bytes_per_sample == 2)
         {
           TIFFReadScanline(tiff_handle, line_buffer_for_planar_support_uint16, 
-            cur_line, color);
+            cur_line, (ui16)color);
           ui32 x = color;
           ui16* line_buffer_of_interleaved_components = (ui16*)line_buffer;
           for (ui32 i = 0; i < width; i++, x += num_comps)
@@ -799,23 +1073,23 @@ namespace ojph {
     }
     if (max_bitdepth > 16)
     {
-      OJPH_WARN(0x0300000C2, "TIFF output is currently limited to files "
+      OJPH_WARN(0x030000B1, "TIFF output is currently limited to files "
         "with max_bitdepth = 16, the source codestream has max_bitdepth=%d"
         ", the decoded data will be truncated to 16 bits", max_bitdepth);
     }
     if (num_components > 4)
     {
-      OJPH_ERROR(0x0300000C3, "TIFF IO is currently limited to files with "
+      OJPH_ERROR(0x030000B2, "TIFF IO is currently limited to files with "
         "num_components=1 to 4");
     }
 
     assert(tiff_handle == NULL && buffer == NULL);
     if ((tiff_handle = TIFFOpen(filename, "w")) == NULL)
     {
-      OJPH_ERROR(0x0300000C1, "unable to open file %s for writing", filename);
+      OJPH_ERROR(0x030000B3, "unable to open file %s for writing", filename);
     }
 
-    buffer_size = width * num_components * bytes_per_sample;
+    buffer_size = width * (size_t)num_components * (size_t)bytes_per_sample;
     buffer = (ui8*)malloc(buffer_size);
     fname = filename;
     cur_line = 0;
@@ -891,7 +1165,7 @@ namespace ojph {
       bytes_per_sample = 2;
     }
     samples_per_line = num_components * width;
-    bytes_per_line = bytes_per_sample * samples_per_line;
+    bytes_per_line = bytes_per_sample * (size_t)samples_per_line;
 
   }
 
@@ -1004,7 +1278,7 @@ namespace ojph {
       {
         int result = TIFFWriteScanline(tiff_handle, buffer, cur_line++);
         if (result != 1)
-          OJPH_ERROR(0x0300000C4, "error writing to file %s", fname);
+          OJPH_ERROR(0x030000C1, "error writing to file %s", fname);
       }
     return 0;
   }
@@ -1024,7 +1298,7 @@ namespace ojph {
     assert(fh == NULL);
     fh = fopen(filename, "rb");
     if (fh == 0)
-      OJPH_ERROR(0x03000051, "Unable to open file %s", filename);
+      OJPH_ERROR(0x030000D1, "Unable to open file %s", filename);
 
     //need to extract info from filename
 
@@ -1052,7 +1326,7 @@ namespace ojph {
     if (result != width[comp_num])
     {
       close();
-      OJPH_ERROR(0x03000061, "not enough data in file %s", fname);
+      OJPH_ERROR(0x030000E1, "not enough data in file %s", fname);
     }
 
     if (bytes_per_sample[comp_num] == 1)
@@ -1078,11 +1352,11 @@ namespace ojph {
                              ui32 num_downsamplings, const point *subsampling)
   {
     if (num_components != 1 && num_components !=3)
-      OJPH_ERROR(0x03000071, "yuv_in support 1 or 3 components");
+      OJPH_ERROR(0x030000F1, "yuv_in support 1 or 3 components");
     this->num_com = num_components;
 
     if (num_downsamplings < 1)
-      OJPH_ERROR(0x03000072, "one or more downsampling must be provided");
+      OJPH_ERROR(0x030000F2, "one or more downsampling must be provided");
 
     ui32 last_downsamp_idx = 0;
     for (ui32 i = 0; i < num_components; ++i)
@@ -1104,7 +1378,7 @@ namespace ojph {
   void yuv_in::set_bit_depth(ui32 num_bit_depths, ui32* bit_depth)
   {
     if (num_bit_depths < 1)
-      OJPH_ERROR(0x03000081, "one or more bit_depths must be provided");
+      OJPH_ERROR(0x03000101, "one or more bit_depths must be provided");
     ui32 last_bd_idx = 0;
     for (ui32 i = 0; i < 3; ++i)
     {
@@ -1146,7 +1420,7 @@ namespace ojph {
     assert(fh == NULL); //configure before open
     fh = fopen(filename, "wb");
     if (fh == 0)
-      OJPH_ERROR(0x03000091, "Unable to open file %s", filename);
+      OJPH_ERROR(0x03000111, "Unable to open file %s", filename);
     fname = filename;
   }
 
@@ -1189,7 +1463,7 @@ namespace ojph {
         *dp++ = (ui16)val;
       }
       if (fwrite(buffer, 2, w, fh) != w)
-        OJPH_ERROR(0x030000A1, "unable to write to file %s", fname);
+        OJPH_ERROR(0x03000121, "unable to write to file %s", fname);
     }
     else
     {
@@ -1203,7 +1477,7 @@ namespace ojph {
         *dp++ = (ui8)val;
       }
       if (fwrite(buffer, 1, w, fh) != w)
-        OJPH_ERROR(0x030000A2, "unable to write to file %s", fname);
+        OJPH_ERROR(0x03000122, "unable to write to file %s", fname);
     }
 
     return w;
@@ -1223,11 +1497,11 @@ namespace ojph {
     assert(fh == NULL);
     fh = fopen(filename, "rb");
     if (fh == NULL)
-      OJPH_ERROR(0x030000C1, "Unable to open file %s", filename);
+      OJPH_ERROR(0x03000131, "Unable to open file %s", filename);
 
     cur_line = 0;
     bytes_per_sample = (bit_depth + 7) >> 3;
-    buffer_size = width * bytes_per_sample;
+    buffer_size = (size_t)width * bytes_per_sample;
     buffer = (ui8*)malloc(buffer_size);
     fname = filename;
   }
@@ -1241,7 +1515,7 @@ namespace ojph {
     if (result != width)
     {
       close();
-      OJPH_ERROR(0x030000C2, "not enough data in file %s", fname);
+      OJPH_ERROR(0x03000132, "not enough data in file %s", fname);
     }
 
     if (bytes_per_sample > 3)
@@ -1350,7 +1624,7 @@ namespace ojph {
     assert(fh == NULL); //configure before open
     fh = fopen(filename, "wb");
     if (fh == 0)
-      OJPH_ERROR(0x03000091, "Unable to open file %s", filename);
+      OJPH_ERROR(0x03000141, "Unable to open file %s", filename);
     fname = filename;
   }
 
@@ -1363,11 +1637,11 @@ namespace ojph {
     this->width = width;
 
     if (is_signed) { 
-      upper_val = (1 << (bit_depth - 1));
-      lower_val = -(1 << (bit_depth - 1));
+      upper_val = ((si64)1 << (bit_depth - 1));
+      lower_val = -((si64)1 << (bit_depth - 1));
     } else {
-      upper_val = 1 << bit_depth;
-      lower_val = 0;
+      upper_val = (si64)1 << bit_depth;
+      lower_val = (si64)0;
     }
 
     bytes_per_sample = (bit_depth + 7) >> 3;
@@ -1382,63 +1656,127 @@ namespace ojph {
     assert(fh);
     assert(comp_num == 0);
 
-    if (bytes_per_sample > 3)
+    if (is_signed) 
     {
-      const si32* sp = line->i32;
-      ui32* dp = (ui32*)buffer;
-      for (ui32 i = width; i > 0; --i)
+      if (bytes_per_sample > 3)
       {
-        int val = *sp++;
-        val = val < upper_val ? val : upper_val;
-        val = val >= lower_val ? val : lower_val;
-        *dp++ = (ui32)val;
+        const si32* sp = line->i32;
+        si32* dp = (si32*)buffer;
+        for (ui32 i = width; i > 0; --i)
+        {
+          si64 val = *sp++;
+          val = val < upper_val ? val : upper_val;
+          val = val >= lower_val ? val : lower_val;
+          *dp++ = (si32)val;
+        }
+        if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+          OJPH_ERROR(0x03000151, "unable to write to file %s", fname);
       }
-      if (fwrite(buffer, bytes_per_sample, width, fh) != width)
-        OJPH_ERROR(0x030000B1, "unable to write to file %s", fname);
-    }
-    else if (bytes_per_sample > 2)
-    {
-      const si32* sp = line->i32;
-      ui32* dp = (ui32*)buffer;
-      for (ui32 i = width; i > 0; --i)
+      else if (bytes_per_sample > 2)
       {
-        int val = *sp++;
-        val = val < upper_val ? val : upper_val;
-        val = val >= lower_val ? val : lower_val;
-        *dp = (ui32)val;
-        // this only works for little endian architecture
-        dp = (ui32*)((ui8*)dp + 3);
+        const si32* sp = line->i32;
+        si32* dp = (si32*)buffer;
+        for (ui32 i = width; i > 0; --i)
+        {
+          si64 val = *sp++;
+          val = val < upper_val ? val : upper_val;
+          val = val >= lower_val ? val : lower_val;
+          *dp = (si32)val;
+          // this only works for little endian architecture
+          dp = (si32*)((ui8*)dp + 3);
+        }
+        if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+          OJPH_ERROR(0x03000152, "unable to write to file %s", fname);
       }
-      if (fwrite(buffer, bytes_per_sample, width, fh) != width)
-        OJPH_ERROR(0x030000B2, "unable to write to file %s", fname);
-    }
-    else if (bytes_per_sample > 1)
-    {
-      const si32* sp = line->i32;
-      ui16* dp = (ui16*)buffer;
-      for (ui32 i = width; i > 0; --i)
+      else if (bytes_per_sample > 1)
       {
-        int val = *sp++;
-        val = val < upper_val ? val : upper_val;
-        val = val >= lower_val ? val : lower_val;
-        *dp++ = (ui16)val;
+        const si32* sp = line->i32;
+        si16* dp = (si16*)buffer;
+        for (ui32 i = width; i > 0; --i)
+        {
+          si64 val = *sp++;
+          val = val < upper_val ? val : upper_val;
+          val = val >= lower_val ? val : lower_val;
+          *dp++ = (si16)val;
+        }
+        if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+          OJPH_ERROR(0x03000153, "unable to write to file %s", fname);
+      }
+      else
+      {
+        const si32* sp = line->i32;
+        si8* dp = (si8*)buffer;
+        for (ui32 i = width; i > 0; --i)
+        {
+          si64 val = *sp++;
+          val = val < upper_val ? val : upper_val;
+          val = val >= lower_val ? val : lower_val;
+          *dp++ = (si8)val;
+        }
+        if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+          OJPH_ERROR(0x03000154, "unable to write to file %s", fname);
       }
-      if (fwrite(buffer, bytes_per_sample, width, fh) != width)
-        OJPH_ERROR(0x030000B3, "unable to write to file %s", fname);
     }
-    else
+    else 
     {
-      const si32* sp = line->i32;
-      ui8* dp = (ui8*)buffer;
-      for (ui32 i = width; i > 0; --i)
+      if (bytes_per_sample > 3)
       {
-        int val = *sp++;
-        val = val < upper_val ? val : upper_val;
-        val = val >= lower_val ? val : lower_val;
-        *dp++ = (ui8)val;
+        const ui32* sp = (ui32*)line->i32;
+        ui32* dp = (ui32*)buffer;
+        for (ui32 i = width; i > 0; --i)
+        {
+          si64 val = *sp++;
+          val = val < upper_val ? val : upper_val;
+          val = val >= lower_val ? val : lower_val;
+          *dp++ = (ui32)val;
+        }
+        if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+          OJPH_ERROR(0x03000155, "unable to write to file %s", fname);
+      }
+      else if (bytes_per_sample > 2)
+      {
+        const ui32* sp = (ui32*)line->i32;
+        ui32* dp = (ui32*)buffer;
+        for (ui32 i = width; i > 0; --i)
+        {
+          si64 val = *sp++;
+          val = val < upper_val ? val : upper_val;
+          val = val >= lower_val ? val : lower_val;
+          *dp = (ui32)val;
+          // this only works for little endian architecture
+          dp = (ui32*)((ui8*)dp + 3);
+        }
+        if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+          OJPH_ERROR(0x03000156, "unable to write to file %s", fname);
+      }
+      else if (bytes_per_sample > 1)
+      {
+        const ui32* sp = (ui32*)line->i32;
+        ui16* dp = (ui16*)buffer;
+        for (ui32 i = width; i > 0; --i)
+        {
+          si64 val = *sp++;
+          val = val < upper_val ? val : upper_val;
+          val = val >= lower_val ? val : lower_val;
+          *dp++ = (ui16)val;
+        }
+        if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+          OJPH_ERROR(0x03000157, "unable to write to file %s", fname);
+      }
+      else
+      {
+        const ui32* sp = (ui32*)line->i32;
+        ui8* dp = (ui8*)buffer;
+        for (ui32 i = width; i > 0; --i)
+        {
+          si64 val = *sp++;
+          val = val < upper_val ? val : upper_val;
+          val = val >= lower_val ? val : lower_val;
+          *dp++ = (ui8)val;
+        }
+        if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+          OJPH_ERROR(0x03000158, "unable to write to file %s", fname);
       }
-      if (fwrite(buffer, bytes_per_sample, width, fh) != width)
-        OJPH_ERROR(0x030000B4, "unable to write to file %s", fname);
     }
 
     return width;
@@ -1460,7 +1798,7 @@ namespace ojph {
     assert(file_handle == 0);
     file_handle = fopen(filename, "rb");
     if (0 == file_handle)
-      OJPH_ERROR(0x0300000D1, "Unable to open file %s", filename);
+      OJPH_ERROR(0x03000161, "Unable to open file %s", filename);
     fname = filename;
 
     // read magic number
@@ -1468,7 +1806,7 @@ namespace ojph {
     if (fread(&magic_number, sizeof(ui32), 1, file_handle) != 1)
     {
       close();
-      OJPH_ERROR(0x0300000D2, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000162, "Error reading file %s", filename);
     }
 
     // check magic number
@@ -1487,7 +1825,7 @@ namespace ojph {
     else
     {
       close();
-      OJPH_ERROR(0x0300000D3, "Error reading file %s - this does not appear "
+      OJPH_ERROR(0x03000163, "Error reading file %s - this does not appear "
         "to be a valid DPX file.  It has magic number = 0x%08X.  The magic "
         "number of a DPX file is 0x%08X.", filename, magic_number, 
         dpx_magic_number);
@@ -1498,7 +1836,7 @@ namespace ojph {
         != 1)
     {
       close();
-      OJPH_ERROR(0x0300000D4, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000164, "Error reading file %s", filename);
     }
     if (is_byte_swapping_necessary)
       offset_to_image_data_in_bytes = be2le(offset_to_image_data_in_bytes);
@@ -1506,14 +1844,14 @@ namespace ojph {
     if (fread(version, sizeof(uint8_t), 8, file_handle) != 8)
     {
       close();
-      OJPH_ERROR(0x0300000D5, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000165, "Error reading file %s", filename);
     }
     // read image file size in bytes
     if (fread(&total_image_file_size_in_bytes, sizeof(ui32), 1, file_handle) 
         != 1)
     {
       close();
-      OJPH_ERROR(0x0300000D6, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000166, "Error reading file %s", filename);
     }
     if (is_byte_swapping_necessary)
       total_image_file_size_in_bytes = be2le(total_image_file_size_in_bytes);
@@ -1522,14 +1860,14 @@ namespace ojph {
     if (fseek(file_handle,768, SEEK_SET) != 0)
     {
       close();
-      OJPH_ERROR(0x0300000D7, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000167, "Error reading file %s", filename);
     }
 
     // read image_orientation
     if (fread(&image_orientation, sizeof(uint16_t), 1, file_handle) != 1)
     {
       close();
-      OJPH_ERROR(0x0300000D8, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000168, "Error reading file %s", filename);
     }
     if (is_byte_swapping_necessary)
       image_orientation = be2le(image_orientation);
@@ -1539,7 +1877,7 @@ namespace ojph {
         != 1)
     {
       close();
-      OJPH_ERROR(0x0300000D9, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000169, "Error reading file %s", filename);
     }
     if (is_byte_swapping_necessary)
       number_of_image_elements = be2le(number_of_image_elements);
@@ -1548,7 +1886,7 @@ namespace ojph {
     if (fread(&pixels_per_line, sizeof(ui32), 1, file_handle) != 1)
     {
       close();
-      OJPH_ERROR(0x0300000DA, "Error reading file %s", filename);
+      OJPH_ERROR(0x0300016A, "Error reading file %s", filename);
     }
     if (is_byte_swapping_necessary)
       pixels_per_line = be2le(pixels_per_line);
@@ -1557,7 +1895,7 @@ namespace ojph {
     if (fread(&lines_per_image_element, sizeof(ui32), 1, file_handle) != 1)
     {
       close();
-      OJPH_ERROR(0x0300000DB, "Error reading file %s", filename);
+      OJPH_ERROR(0x0300016B, "Error reading file %s", filename);
     }
     if (is_byte_swapping_necessary)
       lines_per_image_element = be2le(lines_per_image_element);
@@ -1566,7 +1904,7 @@ namespace ojph {
     if (fseek(file_handle, 780, SEEK_SET) != 0)
     {
       close();
-      OJPH_ERROR(0x0300000DC, "Error reading file %s", filename);
+      OJPH_ERROR(0x0300016C, "Error reading file %s", filename);
     }
 
     // read data sign for image element
@@ -1574,7 +1912,7 @@ namespace ojph {
         != 1)
     {
       close();
-      OJPH_ERROR(0x0300000DE, "Error reading file %s", filename);
+      OJPH_ERROR(0x0300016E, "Error reading file %s", filename);
     }
     if (is_byte_swapping_necessary)
       data_sign_for_image_element_1 = be2le(data_sign_for_image_element_1);
@@ -1583,7 +1921,7 @@ namespace ojph {
     if (fseek(file_handle, 800, SEEK_SET) != 0)
     {
       close();
-      OJPH_ERROR(0x0300000DF, "Error reading file %s", filename);
+      OJPH_ERROR(0x0300016F, "Error reading file %s", filename);
     }
 
     // read descriptor
@@ -1591,7 +1929,7 @@ namespace ojph {
         != 1)
     {
       close();
-      OJPH_ERROR(0x0300000E0, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000170, "Error reading file %s", filename);
     }
 
     // read transfer characteristic
@@ -1599,7 +1937,7 @@ namespace ojph {
               1, file_handle) != 1)
     {
       close();
-      OJPH_ERROR(0x0300000E1, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000171, "Error reading file %s", filename);
     }
 
     // read colorimetric specification
@@ -1607,7 +1945,7 @@ namespace ojph {
         1, file_handle) != 1)
     {
       close();
-      OJPH_ERROR(0x0300000E2, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000172, "Error reading file %s", filename);
     }
 
     // read bit depth
@@ -1615,7 +1953,7 @@ namespace ojph {
         != 1)
     {
       close();
-      OJPH_ERROR(0x0300000E3, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000173, "Error reading file %s", filename);
     }
 
     // read packing
@@ -1623,7 +1961,7 @@ namespace ojph {
         != 1)
     {
       close();
-      OJPH_ERROR(0x0300000E4, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000174, "Error reading file %s", filename);
     }
     if (is_byte_swapping_necessary)
       packing_for_image_element_1 = be2le(packing_for_image_element_1);
@@ -1633,7 +1971,7 @@ namespace ojph {
         != 1)
     {
       close();
-      OJPH_ERROR(0x0300000E5, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000175, "Error reading file %s", filename);
     }
     if (is_byte_swapping_necessary)
       encoding_for_image_element_1 = be2le(encoding_for_image_element_1);
@@ -1643,17 +1981,17 @@ namespace ojph {
               file_handle) != 1)
     {
       close();
-      OJPH_ERROR(0x0300000E6, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000176, "Error reading file %s", filename);
     }
     if (is_byte_swapping_necessary)
       offset_to_data_for_image_element_1 = 
         be2le(offset_to_data_for_image_element_1);
 
     // set to starting point of image data
-    if (fseek(file_handle, offset_to_image_data_in_bytes, SEEK_SET) != 0)
+    if (fseek(file_handle, (long)offset_to_image_data_in_bytes, SEEK_SET) != 0)
     {
       close();
-      OJPH_ERROR(0x0300000E7, "Error reading file %s", filename);
+      OJPH_ERROR(0x03000177, "Error reading file %s", filename);
     }
 
     // set ojph properties
@@ -1679,17 +2017,17 @@ namespace ojph {
     // allocate linebuffer to hold a line of image data from the file
     line_buffer = malloc(number_of_32_bit_words_per_line * sizeof(ui32) );
     if (NULL == line_buffer)
-      OJPH_ERROR(0x0300000E8, "Unable to allocate %d bytes for line_buffer[] "
+      OJPH_ERROR(0x03000178, "Unable to allocate %d bytes for line_buffer[] "
         "for file %s", 
         number_of_32_bit_words_per_line * sizeof(ui32), filename);
 
     // allocate line_buffer_16bit_samples to hold a line of image data in memory
     line_buffer_16bit_samples = 
-      (ui16*) malloc(width * num_comps * sizeof(ui16));
+      (ui16*) malloc((size_t)width * num_comps * sizeof(ui16));
     if (NULL == line_buffer_16bit_samples)
-      OJPH_ERROR(0x0300000E9, "Unable to allocate %d bytes for "
+      OJPH_ERROR(0x03000179, "Unable to allocate %d bytes for "
         "line_buffer_16bit_samples[] for file %s", 
-        width * num_comps * sizeof(ui16), filename);
+        (size_t)width * num_comps * sizeof(ui16), filename);
 
     cur_line = 0;
 
@@ -1709,7 +2047,7 @@ namespace ojph {
           file_handle) != number_of_32_bit_words_per_line)
       {
         close();
-        OJPH_ERROR(0x0300000F1, "Error reading file %s", fname);
+        OJPH_ERROR(0x03000181, "Error reading file %s", fname);
       }
 
       if (true == is_byte_swapping_necessary)
@@ -1763,7 +2101,7 @@ namespace ojph {
       }
       else
       {
-        OJPH_ERROR(0x0300000F2, "file %s uses DPX image formats that are not "
+        OJPH_ERROR(0x03000182, "file %s uses DPX image formats that are not "
           "yet supported by this software\n bitdepth_for_image_element_1 = "
           "%d\n num_comps=%d\npacking_for_image_element_1=%d\n "
           "descriptor_for_image_element_1=%d", fname, 
diff --git a/src/apps/others/ojph_img_io_avx2.cpp b/src/apps/others/ojph_img_io_avx2.cpp
index 69e3080e..30ba5a21 100644
--- a/src/apps/others/ojph_img_io_avx2.cpp
+++ b/src/apps/others/ojph_img_io_avx2.cpp
@@ -35,6 +35,8 @@
 // Date: 23 May 2022
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
 
 #include <cstdlib>
 #include <cstring>
@@ -212,7 +214,14 @@ namespace ojph {
       _mm_storeu_si128((__m128i*)(p + 48), _mm256_castsi256_si128(v));
       _mm_storeu_si128((__m128i*)(p + 60), _mm256_extracti128_si256(v,1));
       _mm_storeu_si128((__m128i*)(p + 72), _mm256_castsi256_si128(w));
+#ifdef OJPH_ARCH_X86_64      
       *((si64*)(p + 84)) = _mm256_extract_epi64(w, 2);
+#elif (defined OJPH_ARCH_I386)
+      *((si32*)(p + 84)) = _mm256_extract_epi32(w, 4);
+      *((si32*)(p + 88)) = _mm256_extract_epi32(w, 5);
+#else
+      #error Error unsupport compiler
+#endif
       *((si32*)(p + 92)) = _mm256_extract_epi32(w, 6);
 
       // this is an alterative slower implementation
@@ -345,3 +354,5 @@ namespace ojph {
     }    
   }
 }
+
+#endif
diff --git a/src/apps/others/ojph_img_io_sse41.cpp b/src/apps/others/ojph_img_io_sse41.cpp
index 04541905..26ee9949 100644
--- a/src/apps/others/ojph_img_io_sse41.cpp
+++ b/src/apps/others/ojph_img_io_sse41.cpp
@@ -35,6 +35,10 @@
 // Date: 23 May 2022
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) \
+  || defined(OJPH_ARCH_X86_64) \
+  || defined(OJPH_ENABLE_WASM_SIMD)
 
 #include <cstdlib>
 #include <cstring>
@@ -505,3 +509,5 @@ namespace ojph {
     }
   }
 }
+
+#endif
diff --git a/src/apps/others/ojph_sockets.cpp b/src/apps/others/ojph_sockets.cpp
new file mode 100644
index 00000000..43b21057
--- /dev/null
+++ b/src/apps/others/ojph_sockets.cpp
@@ -0,0 +1,202 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2024, Aous Naman
+// Copyright (c) 2024, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2024, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_sockets.cpp
+// Author: Aous Naman
+// Date: 17 April 2024
+//***************************************************************************/
+
+#include <cassert>
+#include <string.h>
+#include "ojph_message.h"
+#include "ojph_sockets.h"
+
+namespace ojph
+{
+  namespace net 
+  {
+
+    ///////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //
+    //
+    //
+    ///////////////////////////////////////////////////////////////////////////
+
+    ///////////////////////////////////////////////////////////////////////////
+    socket::socket(const ojph_socket& s)
+    : s(s)  // keep the handle that was passed in; nothing is opened here
+    {
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    void socket::close()
+    {
+      // nothing to release when no handle is held; this also makes a
+      // second call to close() harmless
+      if (s == OJPH_INVALID_SOCKET)
+        return;
+    #ifdef OJPH_OS_WINDOWS
+      ::closesocket(s);
+    #else
+      ::close(s);
+    #endif
+      s = OJPH_INVALID_SOCKET; // mark the handle as released
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    bool socket::set_blocking_mode(bool block)
+    {
+      // returns true when the mode was changed successfully
+    #ifdef OJPH_OS_WINDOWS
+      u_long non_blocking = block ? 0 : 1; // FIONBIO: non-zero => non-blocking
+      const int result = ioctlsocket(s, FIONBIO, &non_blocking);
+      return result == 0;
+    #else
+      const int cur_flags = fcntl(s, F_GETFL);
+      if (cur_flags == -1) // could not query the descriptor flags
+        return false;
+      const int new_flags =
+        block ? (cur_flags & ~O_NONBLOCK) : (cur_flags | O_NONBLOCK);
+      return fcntl(s, F_SETFL, new_flags) != -1;
+    #endif
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //
+    //
+    //
+    ///////////////////////////////////////////////////////////////////////////
+
+    ///////////////////////////////////////////////////////////////////////////
+    int socket_manager::ojph_socket_manager_counter = 0;
+
+    ///////////////////////////////////////////////////////////////////////////
+    socket_manager::socket_manager()
+    {
+    #ifdef OJPH_OS_WINDOWS
+      if (ojph_socket_manager_counter == 0) { // first instance starts Winsock
+        WSADATA wsa;
+        // WSAStartup returns the error code; WSAGetLastError is invalid here
+        const int result = WSAStartup(MAKEWORD(2,2), &wsa);
+        if (result != 0) {
+          std::string err = get_error_message(result);
+          OJPH_ERROR(0x00080001, "Could not create socket : %s\n", err.data());
+        }
+      }
+    #endif
+      ++ojph_socket_manager_counter; // paired with the decrement in the dtor
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    socket_manager::~socket_manager()
+    {
+      // the last instance to go away shuts Winsock down (Windows only)
+      assert(ojph_socket_manager_counter >= 1);
+    #ifdef OJPH_OS_WINDOWS
+      if (--ojph_socket_manager_counter == 0)
+        WSACleanup();
+    #else
+      --ojph_socket_manager_counter;
+    #endif
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    socket socket_manager::create_socket(int domain, int type, int protocol)
+    {
+      // no error check here: the result of ::socket() is wrapped as it is
+      return socket(::socket(domain, type, protocol));
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    int socket_manager::get_last_error() // platform's latest error code
+    {
+    #ifdef OJPH_OS_WINDOWS
+      return WSAGetLastError(); // Winsock keeps its own error code
+    #else
+      return errno; // on other platforms the code is read from errno
+    #endif
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // returns the system's text for errnum; an empty string when errnum is 0
+    std::string socket_manager::get_error_message(int errnum)
+    {
+      if (errnum == 0)
+        return std::string("");
+      const int max_buf_size = 1024;
+      char buf[max_buf_size] = { 0 }; // never read uninitialized bytes
+      char *v = buf;
+    #ifdef OJPH_OS_WINDOWS
+      // the ANSI variant is named explicitly because buf is a char array;
+      // should the call fail, the zero-initialized buf gives a valid string
+      FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                     NULL, (DWORD)errnum,
+                     MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+                     buf, max_buf_size, NULL);
+    #elif (defined __GLIBC__) && \
+      ((defined _GNU_SOURCE) || (_POSIX_C_SOURCE < 200112L))
+      // GNU strerror_r: the text may be in buf or in a static string,
+      // so the returned pointer is the one that must be used
+      v = strerror_r(errnum, buf, max_buf_size);
+    #else
+      // XSI strerror_r: the text is written to buf; non-zero means failure
+      if (strerror_r(errnum, buf, max_buf_size) != 0)
+        OJPH_ERROR(0x00080002, "Error retrieving a text message for "
+          "socket error number %d\n", errnum);
+    #endif
+      buf[max_buf_size - 1] = 0; // guarantee termination of buf
+      return std::string(v);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    std::string socket_manager::get_last_error_message()
+    {
+      // text for whatever error code get_last_error() currently reports
+      return get_error_message(get_last_error());
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    ui32 socket_manager::get_addr(const sockaddr_in& addr)
+    { // returns the address field of addr unchanged (no byte swapping here)
+    #ifdef OJPH_OS_WINDOWS
+      return addr.sin_addr.S_un.S_addr;
+    #else
+      return addr.sin_addr.s_addr;
+    #endif
+    }
+
+  } // !net namespace 
+} // !ojph namespace 
diff --git a/src/apps/others/ojph_threads.cpp b/src/apps/others/ojph_threads.cpp
new file mode 100644
index 00000000..a3268b21
--- /dev/null
+++ b/src/apps/others/ojph_threads.cpp
@@ -0,0 +1,108 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2024, Aous Naman
+// Copyright (c) 2024, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2024, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_threads.cpp
+// Author: Aous Naman
+// Date: 22 April 2024
+//***************************************************************************/
+
+#include "ojph_threads.h"
+
+namespace ojph
+{
+namespace thds
+{
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//
+//
+///////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////
+// stops the workers and joins them; tasks still queued are not executed
+thread_pool::~thread_pool()
+{
+  // set stop under the mutex so that no worker misses it and blocks forever
+  std::unique_lock<std::mutex> lock(mutex);
+  stop.store(true, std::memory_order_release);
+  lock.unlock();
+  condition.notify_all();
+  for (size_t i = 0; i < threads.size(); ++i)
+    threads[i].join();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// launches num_threads workers, growing the thread array if it is too small
+void thread_pool::init(size_t num_threads)
+{
+  if (threads.size() < num_threads)
+    threads.resize(num_threads);
+  for (size_t i = 0; i < num_threads; ++i)
+    threads[i] = std::thread(start_thread, this);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// queues a task and wakes one worker to run it
+void thread_pool::add_task(worker_thread_base* task)
+{
+  // scoped lock: the mutex is released even if push_back throws
+  std::lock_guard<std::mutex> guard(mutex);
+  tasks.push_back(task);
+  condition.notify_one();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// worker loop: runs queued tasks one at a time until stop is set
+void thread_pool::start_thread(thread_pool* tp)
+{
+  while (1)
+  {
+    std::unique_lock<std::mutex> lock(tp->mutex);
+    // block only while idle; the state is tested under the mutex first, so
+    // a notification sent while this thread was busy is never lost
+    while (!tp->stop.load(std::memory_order_acquire) && tp->tasks.empty())
+      tp->condition.wait(lock);
+    if (tp->stop.load(std::memory_order_acquire))
+      return; // tasks still queued at shutdown are not executed
+    worker_thread_base* task = tp->tasks.front();
+    tp->tasks.pop_front();
+    lock.unlock(); // do not hold the queue lock while the task runs
+    if (task)
+      task->execute();
+  }
+}
+
+} // !thds namespace 
+} // !ojph namespace
\ No newline at end of file
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
new file mode 100644
index 00000000..0a13d3b2
--- /dev/null
+++ b/src/core/CMakeLists.txt
@@ -0,0 +1,180 @@
+
+file(GLOB CODESTREAM       "codestream/*.cpp" "codestream/*.h")
+file(GLOB CODESTREAM_SSE   "codestream/*_sse.cpp")
+file(GLOB CODESTREAM_SSE2  "codestream/*_sse2.cpp")
+file(GLOB CODESTREAM_AVX   "codestream/*_avx.cpp")
+file(GLOB CODESTREAM_AVX2  "codestream/*_avx2.cpp")
+file(GLOB CODESTREAM_WASM  "codestream/*_wasm.cpp")
+file(GLOB CODING           "coding/*.cpp" "coding/*.h")
+file(GLOB CODING_SSSE3     "coding/*_ssse3.cpp")
+file(GLOB CODING_WASM      "coding/*_wasm.cpp")
+file(GLOB CODING_AVX2      "coding/*_avx2.cpp")
+file(GLOB CODING_AVX512    "coding/*_avx512.cpp")
+file(GLOB COMMON           "openjph/*.h")
+file(GLOB OTHERS           "others/*.cpp" "others/*.c")
+file(GLOB TRANSFORM        "transform/*.cpp" "transform/*.h")
+file(GLOB TRANSFORM_SSE    "transform/*_sse.cpp")
+file(GLOB TRANSFORM_SSE2   "transform/*_sse2.cpp")
+file(GLOB TRANSFORM_AVX    "transform/*_avx.cpp")
+file(GLOB TRANSFORM_AVX2   "transform/*_avx2.cpp")
+file(GLOB TRANSFORM_AVX512 "transform/*_avx512.cpp")
+file(GLOB TRANSFORM_WASM   "transform/*_wasm.cpp")
+# the globs above also match the ISA-specific files; drop them from the generic lists (they are appended to SOURCES below when enabled)
+list(REMOVE_ITEM CODESTREAM ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2} ${CODESTREAM_WASM})
+list(REMOVE_ITEM CODING ${CODING_SSSE3} ${CODING_WASM} ${CODING_AVX2} ${CODING_AVX512})
+list(REMOVE_ITEM TRANSFORM ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2} ${TRANSFORM_AVX512} ${TRANSFORM_WASM})
+list(APPEND SOURCES ${CODESTREAM} ${CODING} ${COMMON} ${OTHERS} ${TRANSFORM})
+
+source_group("codestream"        FILES ${CODESTREAM})
+source_group("coding"            FILES ${CODING})
+source_group("common"            FILES ${COMMON})
+source_group("others"            FILES ${OTHERS})
+source_group("transform"         FILES ${TRANSFORM})
+
+if(EMSCRIPTEN)
+  if (OJPH_ENABLE_WASM_SIMD)
+    list(APPEND SOURCES ${CODESTREAM_WASM} ${CODING_WASM} ${TRANSFORM_WASM})
+    source_group("codestream" FILES ${CODESTREAM_WASM})
+    source_group("coding" FILES ${CODING_WASM})
+    source_group("transform" FILES ${TRANSFORM_WASM})
+  endif()
+else()
+  if (NOT OJPH_DISABLE_SIMD)
+    if (("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_X86_64")
+      OR ("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_I386")
+      OR MULTI_GEN_X86_64)
+
+      if (NOT OJPH_DISABLE_SSE)
+        list(APPEND SOURCES ${CODESTREAM_SSE} ${TRANSFORM_SSE})
+        source_group("codestream" FILES ${CODESTREAM_SSE})
+        source_group("transform" FILES ${TRANSFORM_SSE})
+      endif()
+      if (NOT OJPH_DISABLE_SSE2)
+        list(APPEND SOURCES ${CODESTREAM_SSE2} ${TRANSFORM_SSE2})
+        source_group("codestream" FILES ${CODESTREAM_SSE2})
+        source_group("transform" FILES ${TRANSFORM_SSE2})
+      endif()
+      if (NOT OJPH_DISABLE_SSSE3)
+        list(APPEND SOURCES ${CODING_SSSE3})
+        source_group("coding" FILES ${CODING_SSSE3})
+      endif()
+      if (NOT OJPH_DISABLE_AVX)
+        list(APPEND SOURCES ${CODESTREAM_AVX} ${TRANSFORM_AVX})
+        source_group("codestream" FILES ${CODESTREAM_AVX})
+        source_group("transform" FILES ${TRANSFORM_AVX})
+      endif()
+      if (NOT OJPH_DISABLE_AVX2)
+        list(APPEND SOURCES ${CODESTREAM_AVX2} ${TRANSFORM_AVX2} ${CODING_AVX2})
+        source_group("codestream" FILES ${CODESTREAM_AVX2})
+        source_group("transform" FILES ${TRANSFORM_AVX2})
+        source_group("coding" FILES ${CODING_AVX2})
+      endif()
+      if (NOT OJPH_DISABLE_AVX512)
+        list(APPEND SOURCES ${CODING_AVX512} ${TRANSFORM_AVX512})
+        source_group("coding" FILES ${CODING_AVX512})
+        source_group("transform" FILES ${TRANSFORM_AVX512})
+      endif()
+
+      # Set compilation flags
+      if (MSVC)
+        set_source_files_properties(codestream/ojph_codestream_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX")
+        set_source_files_properties(codestream/ojph_codestream_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
+        set_source_files_properties(coding/ojph_block_decoder_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
+        set_source_files_properties(coding/ojph_block_encoder_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
+        set_source_files_properties(coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX512")
+        set_source_files_properties(transform/ojph_colour_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX")
+        set_source_files_properties(transform/ojph_colour_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
+        set_source_files_properties(transform/ojph_transform_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX")
+        set_source_files_properties(transform/ojph_transform_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
+        set_source_files_properties(transform/ojph_transform_avx512.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX512")
+      else()
+        set_source_files_properties(codestream/ojph_codestream_sse.cpp PROPERTIES COMPILE_FLAGS -msse)
+        set_source_files_properties(codestream/ojph_codestream_sse2.cpp PROPERTIES COMPILE_FLAGS -msse2)
+        set_source_files_properties(codestream/ojph_codestream_avx.cpp PROPERTIES COMPILE_FLAGS -mavx)
+        set_source_files_properties(codestream/ojph_codestream_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+        set_source_files_properties(coding/ojph_block_decoder_ssse3.cpp PROPERTIES COMPILE_FLAGS -mssse3)
+        set_source_files_properties(coding/ojph_block_decoder_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+        set_source_files_properties(coding/ojph_block_encoder_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+        set_source_files_properties(coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512cd")
+        set_source_files_properties(transform/ojph_colour_sse.cpp PROPERTIES COMPILE_FLAGS -msse)
+        set_source_files_properties(transform/ojph_colour_sse2.cpp PROPERTIES COMPILE_FLAGS -msse2)
+        set_source_files_properties(transform/ojph_colour_avx.cpp PROPERTIES COMPILE_FLAGS -mavx)
+        set_source_files_properties(transform/ojph_colour_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+        set_source_files_properties(transform/ojph_transform_sse.cpp PROPERTIES COMPILE_FLAGS -msse)
+        set_source_files_properties(transform/ojph_transform_sse2.cpp PROPERTIES COMPILE_FLAGS -msse2)
+        set_source_files_properties(transform/ojph_transform_avx.cpp PROPERTIES COMPILE_FLAGS -mavx)
+        set_source_files_properties(transform/ojph_transform_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+        set_source_files_properties(transform/ojph_transform_avx512.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512cd")
+      endif()
+    endif()
+
+    if (("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_ARM") OR MULTI_GEN_ARM64)
+      # empty branch: no ARM-specific source files are added by this script
+    endif()
+
+  endif()
+
+endif()
+
+## Set debug postfix for different platforms
+if (MSVC)
+  if (NOT DEFINED CMAKE_DEBUG_POSTFIX)
+    set(CMAKE_DEBUG_POSTFIX "d")
+  endif()
+elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+  if (NOT DEFINED CMAKE_DEBUG_POSTFIX)
+    set(CMAKE_DEBUG_POSTFIX "_d")
+  endif()
+endif()
+
+add_library(openjph ${SOURCES})
+
+## The option BUILD_SHARED_LIBS
+if (BUILD_SHARED_LIBS AND WIN32)
+  target_compile_definitions(openjph PRIVATE OJPH_BUILD_SHARED_LIBRARY)
+endif()
+
+## include library version/name
+set_target_properties(openjph PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_compile_definitions(openjph PUBLIC _FILE_OFFSET_BITS=64)
+target_include_directories(openjph PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/openjph> $<INSTALL_INTERFACE:include>)
+
+## This is to check if aligned_alloc or posix_memalign is available
+# We want the code to compile for C11 and C++11.
+# std::aligned_alloc is only available in C++17.
+# So here we try to see which API is available and adapt the code to use it
+if (NOT MSVC)
+  include(CheckSymbolExists)
+  check_symbol_exists(aligned_alloc "stdlib.h" OJPH_ALIGNED_ALLOC_EXISTS)
+  if (OJPH_ALIGNED_ALLOC_EXISTS)
+    target_compile_definitions(openjph PRIVATE OJPH_ALIGNED_ALLOC_EXISTS)
+  else()
+    check_symbol_exists(posix_memalign "stdlib.h" OJPH_POSIX_MEMALIGN_EXISTS)
+    if (OJPH_POSIX_MEMALIGN_EXISTS)
+      target_compile_definitions(openjph PRIVATE OJPH_POSIX_MEMALIGN_EXISTS)
+    endif()
+  endif()
+endif()
+
+## versioned name of the library; with MSVC it is also the output file name
+set(OJPH_LIB_NAME_STRING "openjph.${OPENJPH_VERSION_MAJOR}.${OPENJPH_VERSION_MINOR}")
+if (MSVC)
+  set_target_properties(openjph
+    PROPERTIES
+      OUTPUT_NAME "${OJPH_LIB_NAME_STRING}")
+else()
+  set_target_properties(openjph
+    PROPERTIES
+      SOVERSION "${OPENJPH_VERSION_MAJOR}.${OPENJPH_VERSION_MINOR}"
+      VERSION "${OPENJPH_VERSION}")
+endif()
+
+install(TARGETS openjph
+  EXPORT openjph-targets
+)
+
+install(DIRECTORY openjph/
+  DESTINATION include/openjph
+  FILES_MATCHING
+  PATTERN "*.h"
+)
diff --git a/src/core/codestream/ojph_bitbuffer_write.h b/src/core/codestream/ojph_bitbuffer_write.h
index d5b6bcac..ecb9dd20 100644
--- a/src/core/codestream/ojph_bitbuffer_write.h
+++ b/src/core/codestream/ojph_bitbuffer_write.h
@@ -109,33 +109,25 @@ namespace ojph {
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void bb_put_zeros(bit_write_buf *bbp, int num_zeros,
+                      mem_elastic_allocator *elastic,
+                      coded_lists*& cur_coded_list, ui32& ph_bytes)
+    { // emits num_zeros zero bits; nothing is written when num_zeros <= 0
+      for (int written = 0; written < num_zeros; ++written)
+        bb_put_bit(bbp, 0, elastic, cur_coded_list, ph_bytes);
+    }
+
     //////////////////////////////////////////////////////////////////////////
     static inline
     void bb_put_bits(bit_write_buf *bbp, ui32 data, int num_bits,
                      mem_elastic_allocator *elastic,
                      coded_lists*& cur_coded_list, ui32& ph_bytes)
     {
-//      assert(num_bits <= 32);
-      for (int i = num_bits - 1; i >= 0; --i)
+      assert(num_bits <= 32);
+      for (int i = num_bits - 1; i >= 0; --i) 
         bb_put_bit(bbp, data >> i, elastic, cur_coded_list, ph_bytes);
-//      while (num_bits) {
-//        int tx_bits = num_bits < bbp->avail_bits ? num_bits : bbp->avail_bits;
-//        bbp->tmp |= (data >> (num_bits - tx_bits)) & ((1 << tx_bits) - 1);
-//        bbp->avail_bits -= tx_bits;
-//        if (bbp->avail_bits <= 0)
-//        {
-//          bbp->avail_bits = 8 - (bbp->tmp != 0xFF ? 0 : 1);
-//          bbp->buf[bbp->buf_size - bbp->avail_size] = (ui8)(bbp->tmp & 0xFF);
-//          bbp->tmp = 0;
-//          --bbp->avail_size;
-//          if (bbp->avail_size == 0)
-//          {
-//            bb_expand_buf(bbp, elastic, cur_coded_list->next_list);
-//            cur_coded_list = cur_coded_list->next_list;
-//            ph_bytes += bit_buffer::needed;
-//          }
-//        }
-//      }
     }
 
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/codestream/ojph_codeblock.cpp b/src/core/codestream/ojph_codeblock.cpp
index a95cbef5..88e20eba 100644
--- a/src/core/codestream/ojph_codeblock.cpp
+++ b/src/core/codestream/ojph_codeblock.cpp
@@ -45,6 +45,7 @@
 #include "ojph_codestream_local.h"
 #include "ojph_codeblock.h"
 #include "ojph_subband.h"
+#include "ojph_resolution.h"
 
 namespace ojph {
 
@@ -52,15 +53,19 @@ namespace ojph {
   {
 
     //////////////////////////////////////////////////////////////////////////
-    void codeblock::pre_alloc(codestream *codestream,
-                              const size& nominal)
+    void codeblock::pre_alloc(codestream *codestream, const size& nominal, 
+                              ui32 precision)
     {
       mem_fixed_allocator* allocator = codestream->get_allocator();
 
       assert(byte_alignment / sizeof(ui32) > 1);
       const ui32 f = byte_alignment / sizeof(ui32) - 1;
       ui32 stride = (nominal.w + f) & ~f; // a multiple of 8
-      allocator->pre_alloc_data<ui32>(nominal.h * stride, 0);
+      
+      if (precision <= 32)
+        allocator->pre_alloc_data<ui32>(nominal.h * (size_t)stride, 0);
+      else
+        allocator->pre_alloc_data<ui64>(nominal.h * (size_t)stride, 0);
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -68,14 +73,23 @@ namespace ojph {
                                    subband *parent, const size& nominal,
                                    const size& cb_size,
                                    coded_cb_header* coded_cb,
-                                   ui32 K_max, int line_offset)
+                                   ui32 K_max, int line_offset,
+                                   ui32 precision, ui32 comp_idx)
     {
       mem_fixed_allocator* allocator = codestream->get_allocator();
 
       const ui32 f = byte_alignment / sizeof(ui32) - 1;
       this->stride = (nominal.w + f) & ~f; // a multiple of 8
       this->buf_size = this->stride * nominal.h;
-      this->buf = allocator->post_alloc_data<ui32>(this->buf_size, 0);
+
+      if (precision <= 32) {
+        this->precision = BUF32;
+        this->buf32 = allocator->post_alloc_data<ui32>(this->buf_size, 0);
+      }
+      else {
+        this->precision = BUF64;
+        this->buf64 = allocator->post_alloc_data<ui64>(this->buf_size, 0);
+      }
 
       this->nominal_size = nominal;
       this->cb_size = cb_size;
@@ -85,12 +99,12 @@ namespace ojph {
       this->delta = parent->get_delta();
       this->delta_inv = 1.0f / this->delta;
       this->K_max = K_max;
-      for (int i = 0; i < 8; ++i)
-        this->max_val[i] = 0;
-      ojph::param_cod cod = codestream->access_cod();
-      this->reversible = cod.is_reversible();
+      for (int i = 0; i < 4; ++i)
+        this->max_val64[i] = 0;
+      const param_cod* coc = codestream->get_coc(comp_idx);
+      this->reversible = coc->is_reversible();
       this->resilient = codestream->is_resilient();
-      this->stripe_causal = cod.get_block_vertical_causality();
+      this->stripe_causal = coc->get_block_vertical_causality();
       this->zero_block = false;
       this->coded_cb = coded_cb;
 
@@ -100,28 +114,61 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void codeblock::push(line_buf *line)
     {
-      // convert to sign and magnitude and keep max_val
-      const si32 *sp = line->i32 + line_offset;
-      ui32 *dp = buf + cur_line * stride;
-      this->codeblock_functions.tx_to_cb(sp, dp, K_max, delta_inv, cb_size.w, 
-        max_val);
-      ++cur_line;
+      // convert to sign and magnitude and keep max_val
+      if (precision == BUF32)
+      {
+        assert(line->flags & line_buf::LFT_32BIT);
+        const si32 *sp = line->i32 + line_offset;
+        ui32 *dp = buf32 + cur_line * stride;
+        this->codeblock_functions.tx_to_cb32(sp, dp, K_max, delta_inv, 
+                                             cb_size.w, max_val32);
+        ++cur_line;
+      }
+      else 
+      {
+        assert(precision == BUF64);
+        assert(line->flags & line_buf::LFT_64BIT);
+        const si64 *sp = line->i64 + line_offset;
+        ui64 *dp = buf64 + cur_line * stride;
+        this->codeblock_functions.tx_to_cb64(sp, dp, K_max, delta_inv, 
+                                             cb_size.w, max_val64);
+        ++cur_line;
+      }
     }
 
     //////////////////////////////////////////////////////////////////////////
     void codeblock::encode(mem_elastic_allocator *elastic)
     {
-      ui32 mv = this->codeblock_functions.find_max_val(max_val);
-      if (mv >= 1u<<(31 - K_max))
+      if (precision == BUF32)
       {
-        coded_cb->missing_msbs = K_max - 1;
-        assert(coded_cb->missing_msbs > 0);
-        assert(coded_cb->missing_msbs < K_max);
-        coded_cb->num_passes = 1;
-        
-        this->codeblock_functions.encode_cb(buf, K_max-1, 1,
-          cb_size.w, cb_size.h, stride, coded_cb->pass_length,
-          elastic, coded_cb->next_coded);
+        ui32 mv = this->codeblock_functions.find_max_val32(max_val32);
+        if (mv >= 1u << (31 - K_max))
+        {
+          coded_cb->missing_msbs = K_max - 1;
+          assert(coded_cb->missing_msbs > 0);
+          assert(coded_cb->missing_msbs < K_max);
+          coded_cb->num_passes = 1;
+          
+          this->codeblock_functions.encode_cb32(buf32, K_max-1, 1,
+            cb_size.w, cb_size.h, stride, coded_cb->pass_length,
+            elastic, coded_cb->next_coded);
+        }
+      }
+      else
+      {
+        assert(precision == BUF64);
+        ui64 mv = this->codeblock_functions.find_max_val64(max_val64);
+        if (mv >= 1ULL << (63 - K_max))
+        {
+          coded_cb->missing_msbs = K_max - 1;
+          assert(coded_cb->missing_msbs > 0);
+          assert(coded_cb->missing_msbs < K_max);
+          coded_cb->num_passes = 1;
+          
+          this->codeblock_functions.encode_cb64(buf64, K_max-1, 1,
+            cb_size.w, cb_size.h, stride, coded_cb->pass_length,
+            elastic, coded_cb->next_coded);
+        }
       }
     }
 
@@ -132,8 +179,8 @@ namespace ojph {
       this->cb_size = cb_size;
       this->coded_cb = coded_cb;
       this->cur_line = 0;
-      for (int i = 0; i < 8; ++i)
-        this->max_val[i] = 0;
+      for (int i = 0; i < 4; ++i)
+        this->max_val64[i] = 0;
       this->zero_block = false;
     }
 
@@ -143,19 +190,34 @@ namespace ojph {
       if (coded_cb->pass_length[0] > 0 && coded_cb->num_passes > 0 &&
           coded_cb->next_coded != NULL)
       {
-        bool result = this->codeblock_functions.decode_cb(
+        bool result;
+        if (precision == BUF32)
+        {
+          result = this->codeblock_functions.decode_cb32(
             coded_cb->next_coded->buf + coded_cb_header::prefix_buf_size,
-            buf, coded_cb->missing_msbs, coded_cb->num_passes,
+            buf32, coded_cb->missing_msbs, coded_cb->num_passes,
             coded_cb->pass_length[0], coded_cb->pass_length[1],
             cb_size.w, cb_size.h, stride, stripe_causal);
+        }
+        else 
+        {
+          assert(precision == BUF64);
+          result = this->codeblock_functions.decode_cb64(
+            coded_cb->next_coded->buf + coded_cb_header::prefix_buf_size,
+            buf64, coded_cb->missing_msbs, coded_cb->num_passes,
+            coded_cb->pass_length[0], coded_cb->pass_length[1],
+            cb_size.w, cb_size.h, stride, stripe_causal);
+        }
 
         if (result == false)
-          {
-            if (resilient == true)
-              zero_block = true;
-            else
-              OJPH_ERROR(0x000300A1, "Error decoding a codeblock\n");
+        {
+          if (resilient == true) {
+            OJPH_INFO(0x000300A1, "Error decoding a codeblock.");
+            zero_block = true;
           }
+          else
+            OJPH_ERROR(0x000300A1, "Error decoding a codeblock.");
+        }
       }
       else
         zero_block = true;
@@ -165,15 +227,35 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void codeblock::pull_line(line_buf *line)
     {
-      si32 *dp = line->i32 + line_offset;
-      if (!zero_block)
+      // copy one line from the codeblock (sign and magnitude) to the subband
+      if (precision == BUF32)
       {
-        //convert to sign and magnitude
-        const ui32 *sp = buf + cur_line * stride;
-        this->codeblock_functions.tx_from_cb(sp, dp, K_max, delta, cb_size.w);
+        assert(line->flags & line_buf::LFT_32BIT);  // line must be 32-bit
+        si32 *dp = line->i32 + line_offset;
+        if (!zero_block)
+        {
+          const ui32 *sp = buf32 + cur_line * stride;  // current source row
+          this->codeblock_functions.tx_from_cb32(sp, dp, K_max, delta,
+                                                 cb_size.w);
+        }
+        else  // zero_block: output zeros for this line
+          this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp));
       }
       else
-        this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp));
+      {
+        assert(precision == BUF64);
+        assert(line->flags & line_buf::LFT_64BIT);  // line must be 64-bit
+        si64 *dp = line->i64 + line_offset;
+        if (!zero_block)
+        {
+          const ui64 *sp = buf64 + cur_line * stride;  // current source row
+          this->codeblock_functions.tx_from_cb64(sp, dp, K_max, delta,
+                                                 cb_size.w);
+        }
+        else  // zero_block: output zeros for this line
+          this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp));
+      }
+
       ++cur_line;
       assert(cur_line <= cb_size.h);
     }
diff --git a/src/core/codestream/ojph_codeblock.h b/src/core/codestream/ojph_codeblock.h
index 2f7d8e78..4d6fbe4c 100644
--- a/src/core/codestream/ojph_codeblock.h
+++ b/src/core/codestream/ojph_codeblock.h
@@ -3,21 +3,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -48,7 +48,7 @@ namespace ojph {
 
   ////////////////////////////////////////////////////////////////////////////
   //defined elsewhere
-  struct line_buf;
+  class line_buf;
   class mem_elastic_allocator;
   class codestream;
   struct coded_lists;
@@ -65,12 +65,18 @@ namespace ojph {
     class codeblock
     {
       friend struct precinct;
+      enum : ui32 {
+        BUF32 = 4,
+        BUF64 = 8,
+      };
+
     public:
-      static void pre_alloc(codestream *codestream, const size& nominal);
+      static void pre_alloc(codestream *codestream, const size& nominal,
+                            ui32 precision);
       void finalize_alloc(codestream *codestream, subband* parent,
                           const size& nominal, const size& cb_size,
-                          coded_cb_header* coded_cb,
-                          ui32 K_max, int tbx0);
+                          coded_cb_header* coded_cb, ui32 K_max,
+                          int tbx0, ui32 precision, ui32 comp_idx);
       void push(line_buf *line);
       void encode(mem_elastic_allocator *elastic);
       void recreate(const size& cb_size, coded_cb_header* coded_cb);
@@ -79,7 +85,11 @@ namespace ojph {
       void pull_line(line_buf *line);
 
     private:
-      ui32* buf;
+      ui32 precision;
+      union {
+        ui32* buf32;
+        ui64* buf64;
+      };
       size nominal_size;
       size cb_size;
       ui32 stride;
@@ -93,7 +103,10 @@ namespace ojph {
       bool resilient;
       bool stripe_causal;
       bool zero_block; // true when the decoded block is all zero
-      ui32 max_val[8]; // supports up to 256 bits
+      union {
+        ui32 max_val32[8]; // supports up to 256 bits
+        ui64 max_val64[4]; // supports up to 256 bits
+      };
       coded_cb_header* coded_cb;
       codeblock_fun codeblock_functions;
     };
@@ -102,7 +115,7 @@ namespace ojph {
     struct coded_cb_header
     {
       ui32 pass_length[2];
-      ui32 num_passes;
+      ui32 num_passes;       // number of passes to be decoded
       ui32 Kmax;
       ui32 missing_msbs;
       coded_lists *next_coded;
diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp
index d938192f..cad2434a 100644
--- a/src/core/codestream/ojph_codeblock_fun.cpp
+++ b/src/core/codestream/ojph_codeblock_fun.cpp
@@ -63,126 +63,221 @@ namespace ojph {
     void wasm_mem_clear(void* addr, size_t count);
 
     //////////////////////////////////////////////////////////////////////////
-    ui32 gen_find_max_val(ui32* address);
-    ui32 sse2_find_max_val(ui32* address);
-    ui32 avx2_find_max_val(ui32* address);
-    ui32 wasm_find_max_val(ui32* address);
+    ui32  gen_find_max_val32(ui32* address);
+    ui32 sse2_find_max_val32(ui32* address);
+    ui32 avx2_find_max_val32(ui32* address);
+    ui32 wasm_find_max_val32(ui32* address);
+    ui64  gen_find_max_val64(ui64* address);
+    ui64 sse2_find_max_val64(ui64* address);
+    ui64 avx2_find_max_val64(ui64* address);
+    ui64 wasm_find_max_val64(ui64* address);
+
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void avx2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void gen_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void avx2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void wasm_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void wasm_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
+    void  gen_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void  gen_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void wasm_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void wasm_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+
+    void  gen_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val);
+    void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val);
+    void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val);
+    void wasm_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val);
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void avx2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void gen_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void avx2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void wasm_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void wasm_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
+    void  gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void  gen_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void wasm_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void wasm_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
 
+    void  gen_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);                               
 
     void codeblock_fun::init(bool reversible) {
 
 #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
 
       // Default path, no acceleration.  We may change this later
-      decode_cb = ojph_decode_codeblock;
-      find_max_val = gen_find_max_val;
+      decode_cb32 = ojph_decode_codeblock32;
+      find_max_val32 = gen_find_max_val32;
       mem_clear = gen_mem_clear;
       if (reversible) {
-        tx_to_cb = gen_rev_tx_to_cb;
-        tx_from_cb = gen_rev_tx_from_cb;
+        tx_to_cb32 = gen_rev_tx_to_cb32;
+        tx_from_cb32 = gen_rev_tx_from_cb32;
+      }
+      else
+      {
+        tx_to_cb32 = gen_irv_tx_to_cb32;
+        tx_from_cb32 = gen_irv_tx_from_cb32;
+      }
+      encode_cb32 = ojph_encode_codeblock32;
+
+      decode_cb64 = ojph_decode_codeblock64;
+      find_max_val64 = gen_find_max_val64;
+      if (reversible) {
+        tx_to_cb64 = gen_rev_tx_to_cb64;
+        tx_from_cb64 = gen_rev_tx_from_cb64;
       }
       else
       {
-        tx_to_cb = gen_irv_tx_to_cb;
-        tx_from_cb = gen_irv_tx_from_cb;
+        tx_to_cb64 = NULL;
+        tx_from_cb64 = NULL;
       }
-      encode_cb = ojph_encode_codeblock;
+      encode_cb64 = ojph_encode_codeblock64;
+      bool result = initialize_block_encoder_tables();
+      assert(result); ojph_unused(result);      
 
-#ifndef OJPH_DISABLE_INTEL_SIMD
+  #ifndef OJPH_DISABLE_SIMD
+
+    #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
 
       // Accelerated functions for INTEL/AMD CPUs
-      if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE)
-        mem_clear = sse_mem_clear;
-
-      if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) {
-        find_max_val = sse2_find_max_val;
-        if (reversible) {
-          tx_to_cb = sse2_rev_tx_to_cb;
-          tx_from_cb = sse2_rev_tx_from_cb;
-        }
-        else {
-          tx_to_cb = sse2_irv_tx_to_cb;
-          tx_from_cb = sse2_irv_tx_from_cb;
+      #ifndef OJPH_DISABLE_SSE
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE)
+          mem_clear = sse_mem_clear;
+      #endif // !OJPH_DISABLE_SSE
+
+      #ifndef OJPH_DISABLE_SSE2
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) {
+          find_max_val32 = sse2_find_max_val32;
+          if (reversible) {
+            tx_to_cb32 = sse2_rev_tx_to_cb32;
+            tx_from_cb32 = sse2_rev_tx_from_cb32;
+          }
+          else {
+            tx_to_cb32 = sse2_irv_tx_to_cb32;
+            tx_from_cb32 = sse2_irv_tx_from_cb32;
+          }
+          find_max_val64 = sse2_find_max_val64;
+          if (reversible) {
+            tx_to_cb64 = sse2_rev_tx_to_cb64;
+            tx_from_cb64 = sse2_rev_tx_from_cb64;
+          }
+          else
+          {
+            tx_to_cb64 = NULL;
+            tx_from_cb64 = NULL;
+          }
         }
-      }
+      #endif // !OJPH_DISABLE_SSE2
 
-      if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSSE3)
-        decode_cb = ojph_decode_codeblock_ssse3;
+      #ifndef OJPH_DISABLE_SSSE3
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSSE3)
+          decode_cb32 = ojph_decode_codeblock_ssse3;
+      #endif // !OJPH_DISABLE_SSSE3
 
+      #ifndef OJPH_DISABLE_AVX
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX)
+          mem_clear = avx_mem_clear;
+      #endif // !OJPH_DISABLE_AVX
 
-      if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX)
-        mem_clear = avx_mem_clear;
+      #ifndef OJPH_DISABLE_AVX2
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) {
+          decode_cb32 = ojph_decode_codeblock_avx2;
+          find_max_val32 = avx2_find_max_val32;
+          if (reversible) {
+            tx_to_cb32 = avx2_rev_tx_to_cb32;
+            tx_from_cb32 = avx2_rev_tx_from_cb32;
+          }
+          else {
+            tx_to_cb32 = avx2_irv_tx_to_cb32;
+            tx_from_cb32 = avx2_irv_tx_from_cb32;
+          }
+          encode_cb32 = ojph_encode_codeblock_avx2;
+          bool result = initialize_block_encoder_tables_avx2();
+          assert(result); ojph_unused(result);
 
-      if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) {
-        find_max_val = avx2_find_max_val;
-        if (reversible) {
-          tx_to_cb = avx2_rev_tx_to_cb;
-          tx_from_cb = avx2_rev_tx_from_cb;
+          find_max_val64 = avx2_find_max_val64;
+          if (reversible) {
+            tx_to_cb64 = avx2_rev_tx_to_cb64;
+            tx_from_cb64 = avx2_rev_tx_from_cb64;
+          }
+          else
+          {
+            tx_to_cb64 = NULL;
+            tx_from_cb64 = NULL;
+          }
         }
-        else {
-          tx_to_cb = avx2_irv_tx_to_cb;
-          tx_from_cb = avx2_irv_tx_from_cb;
+      #endif // !OJPH_DISABLE_AVX2
+
+      #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512))
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) {
+          encode_cb32 = ojph_encode_codeblock_avx512;
+          bool result = initialize_block_encoder_tables_avx512();
+          assert(result); ojph_unused(result);
         }
-      }
+      #endif // !OJPH_DISABLE_AVX512
 
-#ifdef OJPH_ENABLE_INTEL_AVX512
-      if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512)
-        encode_cb = ojph_encode_codeblock_avx512;
-#endif // !OJPH_ENABLE_INTEL_AVX512
+    #elif defined(OJPH_ARCH_ARM)
+    
+    #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
 
-#endif // !OJPH_DISABLE_INTEL_SIMD
+  #endif // !OJPH_DISABLE_SIMD
 
 #else // OJPH_ENABLE_WASM_SIMD
 
       // Accelerated functions for WASM SIMD.
-      decode_cb = ojph_decode_codeblock_wasm;
-      find_max_val = wasm_find_max_val;
+      decode_cb32 = ojph_decode_codeblock_wasm;
+      find_max_val32 = wasm_find_max_val32;
       mem_clear = wasm_mem_clear;
       if (reversible) {
-        tx_to_cb = wasm_rev_tx_to_cb;
-        tx_from_cb = wasm_rev_tx_from_cb;
+        tx_to_cb32 = wasm_rev_tx_to_cb32;
+        tx_from_cb32 = wasm_rev_tx_from_cb32;
       }
       else {
-        tx_to_cb = wasm_irv_tx_to_cb;
-        tx_from_cb = wasm_irv_tx_from_cb;
+        tx_to_cb32 = wasm_irv_tx_to_cb32;
+        tx_from_cb32 = wasm_irv_tx_from_cb32;
+      }
+      encode_cb32 = ojph_encode_codeblock32;
+
+      decode_cb64 = ojph_decode_codeblock64;
+      find_max_val64 = wasm_find_max_val64;
+      if (reversible) {
+        tx_to_cb64 = wasm_rev_tx_to_cb64;
+        tx_from_cb64 = wasm_rev_tx_from_cb64;
+      }
+      else
+      {
+        tx_to_cb64 = NULL;
+        tx_from_cb64 = NULL;
       }
-      encode_cb = ojph_encode_codeblock;
+      encode_cb64 = ojph_encode_codeblock64;
+      bool result = initialize_block_encoder_tables();
+      assert(result); ojph_unused(result);      
 
 #endif // !OJPH_ENABLE_WASM_SIMD
 
diff --git a/src/core/codestream/ojph_codeblock_fun.h b/src/core/codestream/ojph_codeblock_fun.h
index 679b2d34..67fbc2b7 100644
--- a/src/core/codestream/ojph_codeblock_fun.h
+++ b/src/core/codestream/ojph_codeblock_fun.h
@@ -51,23 +51,40 @@ namespace ojph {
     typedef void (*mem_clear_fun)(void* addr, size_t count);
 
     // define function signature for max value finding
-    typedef ui32 (*find_max_val_fun)(ui32* addr);
+    typedef ui32 (*find_max_val_fun32)(ui32* addr);
+
+    typedef ui64 (*find_max_val_fun64)(ui64* addr);
 
     // define line transfer function signature from subbands to codeblocks
-    typedef void (*tx_to_cb_fun)(const void *sp, ui32 *dp, ui32 K_max,
+    typedef void (*tx_to_cb_fun32)(const void *sp, ui32 *dp, ui32 K_max,
                                    float delta_inv, ui32 count, ui32* max_val);
 
+    typedef void (*tx_to_cb_fun64)(const void *sp, ui64 *dp, ui32 K_max,
+                                   float delta_inv, ui32 count, ui64* max_val);
+
     // define line transfer function signature from codeblock to subband
-    typedef void (*tx_from_cb_fun)(const ui32 *sp, void *dp, ui32 K_max,
+    typedef void (*tx_from_cb_fun32)(const ui32 *sp, void *dp, ui32 K_max,
+                                     float delta, ui32 count);
+
+    typedef void (*tx_from_cb_fun64)(const ui64 *sp, void *dp, ui32 K_max,
                                      float delta, ui32 count);
 
     // define the block decoder function signature
-    typedef bool (*cb_decoder_fun)(ui8* coded_data, ui32* decoded_data,
+    typedef bool (*cb_decoder_fun32)(ui8* coded_data, ui32* decoded_data,
+      ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2,
+      ui32 width, ui32 height, ui32 stride, bool stripe_causal);
+
+    typedef bool (*cb_decoder_fun64)(ui8* coded_data, ui64* decoded_data,
       ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2,
       ui32 width, ui32 height, ui32 stride, bool stripe_causal);
 
     // define the block encoder function signature
-    typedef void (*cb_encoder_fun)(ui32* buf, ui32 missing_msbs, 
+    typedef void (*cb_encoder_fun32)(ui32* buf, ui32 missing_msbs, 
+      ui32 num_passes, ui32 width, ui32 height, ui32 stride,
+      ui32* lengths, ojph::mem_elastic_allocator* elastic,
+      ojph::coded_lists*& coded);
+
+    typedef void (*cb_encoder_fun64)(ui64* buf, ui32 missing_msbs, 
       ui32 num_passes, ui32 width, ui32 height, ui32 stride,
       ui32* lengths, ojph::mem_elastic_allocator* elastic,
       ojph::coded_lists*& coded);
@@ -81,19 +98,24 @@ namespace ojph {
       mem_clear_fun mem_clear;
      
       // a pointer to the max value finding function
-      find_max_val_fun find_max_val;
+      find_max_val_fun32 find_max_val32;
+      find_max_val_fun64 find_max_val64;
      
       // a pointer to function transferring samples from subbands to codeblocks
-      tx_to_cb_fun tx_to_cb;
+      tx_to_cb_fun32 tx_to_cb32;
+      tx_to_cb_fun64 tx_to_cb64;
      
       // a pointer to function transferring samples from codeblocks to subbands
-      tx_from_cb_fun tx_from_cb;
+      tx_from_cb_fun32 tx_from_cb32;
+      tx_from_cb_fun64 tx_from_cb64;
      
       // a pointer to the decoder function
-      cb_decoder_fun decode_cb;
+      cb_decoder_fun32 decode_cb32;
+      cb_decoder_fun64 decode_cb64;
 
       // a pointer to the encoder function
-      cb_encoder_fun encode_cb;
+      cb_encoder_fun32 encode_cb32;
+      cb_encoder_fun64 encode_cb64;
     };
     
   }
diff --git a/src/core/codestream/ojph_codestream.cpp b/src/core/codestream/ojph_codestream.cpp
index 7036085c..8fc3cb56 100644
--- a/src/core/codestream/ojph_codestream.cpp
+++ b/src/core/codestream/ojph_codestream.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -57,7 +57,9 @@ namespace ojph {
   ////////////////////////////////////////////////////////////////////////////
   codestream::~codestream()
   {
-    if (state) delete state;
+    if (state)
+      delete state;
+    state = NULL;
   }
 
   ////////////////////////////////////////////////////////////////////////////
@@ -66,6 +68,13 @@ namespace ojph {
     state = new local::codestream;
   }
 
+  ////////////////////////////////////////////////////////////////////////////
+  void codestream::restart()
+  {
+    assert(state != NULL);  // the implementation object must exist
+    state->restart();       // forward to the local::codestream object
+  }
+
   ////////////////////////////////////////////////////////////////////////////
   param_siz codestream::access_siz()
   {
@@ -84,6 +93,12 @@ namespace ojph {
     return param_qcd(&state->qcd);
   }
 
+  ////////////////////////////////////////////////////////////////////////////
+  param_nlt codestream::access_nlt()
+  {
+    return param_nlt(&state->nlt);  // accessor wrapping state's nlt member
+  }
+
   ////////////////////////////////////////////////////////////////////////////
   void codestream::set_planar(bool planar)
   {
@@ -97,7 +112,7 @@ namespace ojph {
   }
 
   ////////////////////////////////////////////////////////////////////////////
-  void codestream::set_tilepart_divisions(bool at_resolutions, 
+  void codestream::set_tilepart_divisions(bool at_resolutions,
                                           bool at_components)
   {
     ui32 value = 0;
@@ -108,12 +123,32 @@ namespace ojph {
     state->set_tilepart_divisions(value);
   }
 
+  ////////////////////////////////////////////////////////////////////////////
+  bool codestream::is_tilepart_division_at_resolutions()
+  {
+    // test the OJPH_TILEPART_RESOLUTIONS bit of the tile-part division flags
+    return (state->get_tilepart_div() & OJPH_TILEPART_RESOLUTIONS) != 0;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  bool codestream::is_tilepart_division_at_components()
+  {
+    // test the OJPH_TILEPART_COMPONENTS bit of the tile-part division flags
+    return (state->get_tilepart_div() & OJPH_TILEPART_COMPONENTS) != 0;
+  }
+
   ////////////////////////////////////////////////////////////////////////////
   void codestream::request_tlm_marker(bool needed)
   {
     state->request_tlm_marker(needed);
   }
 
+  ////////////////////////////////////////////////////////////////////////////
+  bool codestream::is_tlm_requested()
+  {
+    return state->is_tlm_needed();  // forwards the query to the state object
+  }
+
   ////////////////////////////////////////////////////////////////////////////
   bool codestream::is_planar() const
   {
@@ -121,7 +156,7 @@ namespace ojph {
   }
 
   ////////////////////////////////////////////////////////////////////////////
-  void codestream::write_headers(outfile_base *file, 
+  void codestream::write_headers(outfile_base *file,
                                  const comment_exchange* comments,
                                  ui32 num_comments)
   {
diff --git a/src/core/codestream/ojph_codestream_avx.cpp b/src/core/codestream/ojph_codestream_avx.cpp
index 4c6d678d..19c5a188 100644
--- a/src/core/codestream/ojph_codestream_avx.cpp
+++ b/src/core/codestream/ojph_codestream_avx.cpp
@@ -35,6 +35,8 @@
 // Date: 15 May 2022
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
 #include <immintrin.h>
 #include "ojph_defs.h"
 
@@ -51,4 +53,6 @@ namespace ojph {
     }
     
  }
-}
\ No newline at end of file
+}
+
+#endif
diff --git a/src/core/codestream/ojph_codestream_avx2.cpp b/src/core/codestream/ojph_codestream_avx2.cpp
index 04a81ed0..ca6c1b46 100644
--- a/src/core/codestream/ojph_codestream_avx2.cpp
+++ b/src/core/codestream/ojph_codestream_avx2.cpp
@@ -35,14 +35,19 @@
 // Date: 15 May 2022
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
+
+#include <climits>
 #include <immintrin.h>
 #include "ojph_defs.h"
+#include "ojph_arch.h"
 
 namespace ojph {
   namespace local {
 
     //////////////////////////////////////////////////////////////////////////
-    ui32 avx2_find_max_val(ui32* address)
+    ui32 avx2_find_max_val32(ui32* address)
     {
       __m128i x0 = _mm_loadu_si128((__m128i*)address);
       __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
@@ -56,17 +61,37 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, 
-                           float delta_inv, ui32 count, ui32* max_val)
+    ui64 avx2_find_max_val64(ui64* address)
+    { // returns the bitwise OR of the four ui64 values stored at address
+      __m128i x0 = _mm_loadu_si128((__m128i*)address);     // values 0, 1
+      __m128i x1 = _mm_loadu_si128((__m128i*)address + 1); // values 2, 3
+      x0 = _mm_or_si128(x0, x1);
+      x1 = _mm_shuffle_epi32(x0, 0xEE);   // x1 = x0[2,3,2,3]
+      x0 = _mm_or_si128(x0, x1);          // low 64 bits now hold the OR
+      ui64 t;
+#ifdef OJPH_ARCH_X86_64
+      t = (ui64)_mm_extract_epi64(x0, 0);
+#elif (defined OJPH_ARCH_I386)
+      t = (ui64)(ui32)_mm_extract_epi32(x0, 0); // no 64-bit extract here,
+      t |= (ui64)(ui32)_mm_extract_epi32(x0, 1) << 32; // so join two halves
+#else
+      #error Unsupported architecture (need x86-64 or i386)
+#endif
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, 
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(delta_inv);
 
       // convert to sign and magnitude and keep max_val      
       ui32 shift = 31 - K_max;
-      __m256i m0 = _mm256_set1_epi32((int)0x80000000);
+      __m256i m0 = _mm256_set1_epi32(INT_MIN);
       __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
       __m256i *p = (__m256i*)sp;
-      for (ui32 i = 0; i < count; i += 8, p += 1, dp += 8)
+      for ( ; count >= 8; count -= 8, p += 1, dp += 8)
       {
         __m256i v = _mm256_loadu_si256(p);
         __m256i sign = _mm256_and_si256(v, m0);
@@ -76,22 +101,38 @@ namespace ojph {
         val = _mm256_or_si256(val, sign);
         _mm256_storeu_si256((__m256i*)dp, val);
       }
+      if (count)
+      {
+        __m256i v = _mm256_loadu_si256(p);
+        __m256i sign = _mm256_and_si256(v, m0);
+        __m256i val = _mm256_abs_epi32(v);
+        val = _mm256_slli_epi32(val, (int)shift);
+
+        __m256i c = _mm256_set1_epi32((si32)count);
+        __m256i idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+        __m256i mask = _mm256_cmpgt_epi32(c, idx);
+        c = _mm256_and_si256(val, mask);
+        tmax = _mm256_or_si256(tmax, c);
+
+        val = _mm256_or_si256(val, sign);
+        _mm256_storeu_si256((__m256i*)dp, val);
+      }
       _mm256_storeu_si256((__m256i*)max_val, tmax);
     }
-                           
+
     //////////////////////////////////////////////////////////////////////////
-    void avx2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(K_max);
 
       //quantize and convert to sign and magnitude and keep max_val
       __m256 d = _mm256_set1_ps(delta_inv);
-      __m256i m0 = _mm256_set1_epi32((int)0x80000000);
+      __m256i m0 = _mm256_set1_epi32(INT_MIN);
       __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
       float *p = (float*)sp;
       
-      for (ui32 i = 0; i < count; i += 8, p += 8, dp += 8)
+      for ( ; count >= 8; count -= 8, p += 8, dp += 8)
       {
         __m256 vf = _mm256_loadu_ps(p);
         vf = _mm256_mul_ps(vf, d);                // multiply
@@ -102,33 +143,50 @@ namespace ojph {
         val = _mm256_or_si256(val, sign);
         _mm256_storeu_si256((__m256i*)dp, val);
       }
+      if (count)
+      {
+        __m256 vf = _mm256_loadu_ps(p);
+        vf = _mm256_mul_ps(vf, d);                // multiply
+        __m256i val = _mm256_cvtps_epi32(vf);     // convert to int
+        __m256i sign = _mm256_and_si256(val, m0); // get sign
+        val = _mm256_abs_epi32(val);
+
+        __m256i c = _mm256_set1_epi32((si32)count);
+        __m256i idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+        __m256i mask = _mm256_cmpgt_epi32(c, idx);
+        c = _mm256_and_si256(val, mask);
+        tmax = _mm256_or_si256(tmax, c);
+
+        val = _mm256_or_si256(val, sign);
+        _mm256_storeu_si256((__m256i*)dp, val);
+      }
       _mm256_storeu_si256((__m256i*)max_val, tmax);
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, 
-                             float delta, ui32 count)
+    void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, 
+                               float delta, ui32 count) // sign-mag -> si32
     {
       ojph_unused(delta);
       ui32 shift = 31 - K_max;
-      __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF);
+      __m256i m1 = _mm256_set1_epi32(INT_MAX); // every bit except bit 31
       si32 *p = (si32*)dp;
       for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
       {
-          __m256i v = _mm256_load_si256((__m256i*)sp);
-          __m256i val = _mm256_and_si256(v, m1);
-          val = _mm256_srli_epi32(val, (int)shift);
-          val = _mm256_sign_epi32(val, v);
-          _mm256_storeu_si256((__m256i*)p, val);
+        __m256i v = _mm256_load_si256((__m256i*)sp); // aligned load, 8 words
+        __m256i val = _mm256_and_si256(v, m1);       // drop the sign bit
+        val = _mm256_srli_epi32(val, (int)shift);    // shift magnitude down
+        val = _mm256_sign_epi32(val, v);             // negate where v < 0
+        _mm256_storeu_si256((__m256i*)p, val);       // write 8 si32 results
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, 
-                             float delta, ui32 count)
+    void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, 
+                               float delta, ui32 count)
     {
       ojph_unused(K_max);
-      __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF);
+      __m256i m1 = _mm256_set1_epi32(INT_MAX);
       __m256 d = _mm256_set1_ps(delta);
       float *p = (float*)dp;
       for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
@@ -142,5 +200,79 @@ namespace ojph {
         _mm256_storeu_ps(p, valf);
       }
     }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val)
+    {
+      // Reversible path for 64-bit samples. Every si64 read from sp is
+      // rewritten in dp as a sign bit (bit 63) plus the magnitude shifted
+      // left by 63 - K_max. The shifted magnitudes are also ORed, lane by
+      // lane, into the four ui64 accumulators at max_val.
+      ojph_unused(delta_inv);
+
+      const int up = (int)(63 - K_max);
+      const __m256i msb = _mm256_set1_epi64x(LLONG_MIN);
+      const __m256i zero = _mm256_setzero_si256();
+      __m256i acc = _mm256_loadu_si256((__m256i*)max_val);
+      const __m256i *src = (const __m256i*)sp;
+
+      // whole groups of four samples
+      for ( ; count >= 4; count -= 4, ++src, dp += 4)
+      {
+        __m256i x = _mm256_loadu_si256(src);
+        // neg is all ones in negative lanes, so (x ^ neg) - neg is |x|
+        __m256i neg = _mm256_cmpgt_epi64(zero, x);
+        __m256i mag = _mm256_sub_epi64(_mm256_xor_si256(x, neg), neg);
+        mag = _mm256_slli_epi64(mag, up);
+        acc = _mm256_or_si256(acc, mag);
+        mag = _mm256_or_si256(mag, _mm256_and_si256(neg, msb));
+        _mm256_storeu_si256((__m256i*)dp, mag);
+      }
+
+      // Leftover 1 to 3 samples: a full vector is still loaded and stored,
+      // but only the first count lanes may reach the accumulator.
+      if (count)
+      {
+        __m256i x = _mm256_loadu_si256(src);
+        __m256i neg = _mm256_cmpgt_epi64(zero, x);
+        __m256i mag = _mm256_sub_epi64(_mm256_xor_si256(x, neg), neg);
+        mag = _mm256_slli_epi64(mag, up);
+
+        __m256i lane = _mm256_set_epi64x(3, 2, 1, 0);
+        __m256i live = _mm256_cmpgt_epi64(_mm256_set1_epi64x(count), lane);
+        acc = _mm256_or_si256(acc, _mm256_and_si256(mag, live));
+
+        mag = _mm256_or_si256(mag, _mm256_and_si256(neg, msb));
+        _mm256_storeu_si256((__m256i*)dp, mag);
+      }
+      _mm256_storeu_si256((__m256i*)max_val, acc);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
+    {
+      // Reversible path for 64-bit samples: each sign-magnitude word in sp
+      // (sign in bit 63) becomes a two's complement si64 in dp after its
+      // magnitude is shifted right by 63 - K_max. Four words per pass.
+      ojph_unused(delta);
+      const int down = (int)(63 - K_max);
+      const __m256i mag_bits = _mm256_set1_epi64x(LLONG_MAX);
+      const __m256i zero = _mm256_setzero_si256();
+      si64 *out = (si64*)dp;
+      for (ui32 done = 0; done < count; done += 4, sp += 4, out += 4)
+      {
+        __m256i w = _mm256_load_si256((const __m256i*)sp); // aligned load
+        __m256i neg = _mm256_cmpgt_epi64(zero, w); // all ones if bit 63 set
+        __m256i mag = _mm256_and_si256(w, mag_bits);
+        mag = _mm256_srli_epi64(mag, down);
+        // (mag ^ neg) - neg negates exactly the lanes flagged in neg
+        mag = _mm256_sub_epi64(_mm256_xor_si256(mag, neg), neg);
+        _mm256_storeu_si256((__m256i*)out, mag);
+      }
+    }
   }
-}
\ No newline at end of file
+}
+
+#endif
diff --git a/src/core/codestream/ojph_codestream_gen.cpp b/src/core/codestream/ojph_codestream_gen.cpp
index 466f4835..cdc72c6e 100644
--- a/src/core/codestream/ojph_codestream_gen.cpp
+++ b/src/core/codestream/ojph_codestream_gen.cpp
@@ -44,18 +44,21 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void gen_mem_clear(void* addr, size_t count)
     {
-      ui32* p = (ui32*)addr;
-      for (size_t i = 0; i < count; i += 4, p += 1)
-        *p = 0;
+      si64* p = (si64*)addr;                // one si64 zeroed per store
+      for (size_t i = 0; i < count; i += 8) // presumably count is in bytes:
+        *p++ = 0;                           // a partial last word is zeroed whole
     }
 
     //////////////////////////////////////////////////////////////////////////
-    ui32 gen_find_max_val(ui32* addr) { return addr[0]; }
+    ui32 gen_find_max_val32(ui32* addr) { return addr[0]; }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, 
-                                     float delta_inv, ui32 count, 
-                                     ui32* max_val)
+    ui64 gen_find_max_val64(ui64* addr) { return addr[0]; }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, 
+                            float delta_inv, ui32 count, 
+                            ui32* max_val)
     {
       ojph_unused(delta_inv);
       ui32 shift = 31 - K_max;
@@ -65,7 +68,7 @@ namespace ojph {
       for (ui32 i = count; i > 0; --i)
       {
         si32 v = *p++;
-        ui32 sign = v >= 0 ? 0 : 0x80000000;
+        ui32 sign = v >= 0 ? 0U : 0x80000000U;
         ui32 val = (ui32)(v >= 0 ? v : -v);
         val <<= shift;
         *dp++ = sign | val;
@@ -75,9 +78,31 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                                     float delta_inv, ui32 count, 
-                                     ui32* max_val)
+    void gen_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                            float delta_inv, ui32 count, ui64* max_val)
+    {
+      // Reversible 64-bit path: writes each si64 from sp to dp as a sign bit
+      // (bit 63) plus magnitude << (63 - K_max), ORing these into *max_val.
+      ojph_unused(delta_inv);
+      ui32 shift = 63 - K_max;
+      ui64 tmax = *max_val;
+      const si64 *p = (const si64*)sp;
+      for (ui32 i = count; i > 0; --i)
+      {
+        si64 v = *p++;
+        ui64 sign = v >= 0 ? 0ULL : 0x8000000000000000ULL;
+        // negate as unsigned: -v is undefined for the most negative si64
+        ui64 val = (v >= 0 ? (ui64)v : (ui64)0 - (ui64)v) << shift;
+        *dp++ = sign | val;
+        tmax |= val; // it is more efficient to use or than max
+      }
+      *max_val = tmax;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                            float delta_inv, ui32 count, 
+                            ui32* max_val)
     {
       ojph_unused(K_max);
       //quantize and convert to sign and magnitude and keep max_val
@@ -87,7 +112,7 @@ namespace ojph {
       {
         float v = *p++;
         si32 t = ojph_trunc(v * delta_inv);
-        ui32 sign = t >= 0 ? 0 : 0x80000000;
+        ui32 sign = t >= 0 ? 0U : 0x80000000U;
         ui32 val = (ui32)(t >= 0 ? t : -t);
         *dp++ = sign | val;
         tmax |= val; // it is more efficient to use or than max
@@ -96,8 +121,8 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                                       float delta, ui32 count)
+    void gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count)
     {
       ojph_unused(delta);
       ui32 shift = 31 - K_max;
@@ -106,14 +131,30 @@ namespace ojph {
       for (ui32 i = count; i > 0; --i)
       {
         ui32 v = *sp++;
-        si32 val = (v & 0x7FFFFFFF) >> shift;
-        *p++ = (v & 0x80000000) ? -val : val;
+        si32 val = (v & 0x7FFFFFFFU) >> shift;
+        *p++ = (v & 0x80000000U) ? -val : val;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count)
+    {
+      // sign-magnitude words in sp -> two's complement si64 values in dp
+      ojph_unused(delta);
+      const ui32 down = 63 - K_max;
+      si64 *out = (si64*)dp;
+      for (ui32 k = 0; k < count; ++k)
+      {
+        const ui64 word = sp[k];
+        const si64 mag = (word & 0x7FFFFFFFFFFFFFFFULL) >> down;
+        out[k] = (word >> 63) ? -mag : mag; // bit 63 carries the sign
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                                       float delta, ui32 count)
+    void gen_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count)
     {
       ojph_unused(K_max);
       //convert to sign and magnitude
@@ -121,8 +162,8 @@ namespace ojph {
       for (ui32 i = count; i > 0; --i)
       {
         ui32 v = *sp++;
-        float val = (float)(v & 0x7FFFFFFF) * delta;
-        *p++ = (v & 0x80000000) ? -val : val;
+        float val = (float)(v & 0x7FFFFFFFU) * delta;
+        *p++ = (v & 0x80000000U) ? -val : val;
       }
     }
     
diff --git a/src/core/codestream/ojph_codestream_local.cpp b/src/core/codestream/ojph_codestream_local.cpp
index cc074298..066d184c 100644
--- a/src/core/codestream/ojph_codestream_local.cpp
+++ b/src/core/codestream/ojph_codestream_local.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -52,15 +52,35 @@ namespace ojph {
   namespace local
   {
 
-    ////////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////////
     codestream::codestream()
     : precinct_scratch(NULL), allocator(NULL), elastic_alloc(NULL)
+    { // one-time setup; the remaining members are reset in restart()
+      allocator = new mem_fixed_allocator;
+      elastic_alloc = new mem_elastic_allocator(1048576); // 1 megabyte
+
+      init_colour_transform_functions();
+      init_wavelet_transform_functions();
+
+      restart(); // must follow the allocations: it calls their restart()
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    codestream::~codestream()
+    {
+      // Releases the two allocators created by the constructor. delete on
+      // a null pointer does nothing, so no explicit null guards are needed.
+      delete allocator;
+      delete elastic_alloc;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void codestream::restart()
     {
       tiles = NULL;
       lines = NULL;
       comp_size = NULL;
       recon_comp_size = NULL;
-      allocator = NULL;
       outfile = NULL;
       infile = NULL;
 
@@ -68,7 +88,7 @@ namespace ojph {
       employ_color_transform = false;
       planar = -1;
       profile = OJPH_PN_UNDEFINED;
-      tilepart_div = OJPH_TILEPART_NODIVSIONS;
+      tilepart_div = OJPH_TILEPART_NO_DIVISIONS;
       need_tlm = false;
 
       cur_comp = 0;
@@ -79,25 +99,14 @@ namespace ojph {
 
       precinct_scratch_needed_bytes = 0;
 
-      used_qcc_fields = 0;
-      qcc = qcc_store;
-
-      allocator = new mem_fixed_allocator;
-      elastic_alloc = new mem_elastic_allocator(1048576); //1 megabyte
-
-      init_colour_transform_functions();
-      init_wavelet_transform_functions();
-    }
+      cod.restart();
+      qcd.restart();
+      nlt.restart();
+      dfs.restart();
+      atk.restart();
 
-    ////////////////////////////////////////////////////////////////////////////
-    codestream::~codestream()
-    {
-      if (qcc_store != qcc)
-        delete[] qcc;
-      if (allocator)
-        delete allocator;
-      if (elastic_alloc)
-        delete elastic_alloc;
+      allocator->restart();
+      elastic_alloc->restart();
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -109,10 +118,12 @@ namespace ojph {
       num_tiles.h = sz.get_image_extent().y - sz.get_tile_offset().y;
       num_tiles.h = ojph_div_ceil(num_tiles.h, sz.get_tile_size().h);
       if (num_tiles.area() > 65535)
-        OJPH_ERROR(0x00030011, "number of tiles cannot exceed 65535");
+        OJPH_ERROR(0x00030011, "the number of tiles cannot exceed 65535");
+      if (num_tiles.area() == 0)
+        OJPH_ERROR(0x00030012, "the number of tiles cannot be 0");
 
       //allocate tiles
-      allocator->pre_alloc_obj<tile>(num_tiles.area());
+      allocator->pre_alloc_obj<tile>((size_t)num_tiles.area());
 
       ui32 num_tileparts = 0;
       point index;
@@ -125,10 +136,10 @@ namespace ojph {
         ui32 y1 = y0 + sz.get_tile_size().h; //end of tile
 
         tile_rect.org.y = ojph_max(y0, sz.get_image_offset().y);
-        tile_rect.siz.h = 
+        tile_rect.siz.h =
           ojph_min(y1, sz.get_image_extent().y) - tile_rect.org.y;
 
-        recon_tile_rect.org.y = ojph_max(ojph_div_ceil(y0, ds), 
+        recon_tile_rect.org.y = ojph_max(ojph_div_ceil(y0, ds),
           ojph_div_ceil(sz.get_image_offset().y, ds));
         recon_tile_rect.siz.h = ojph_min(ojph_div_ceil(y1, ds),
           ojph_div_ceil(sz.get_image_extent().y, ds))
@@ -141,7 +152,7 @@ namespace ojph {
           ui32 x1 = x0 + sz.get_tile_size().w;
 
           tile_rect.org.x = ojph_max(x0, sz.get_image_offset().x);
-          tile_rect.siz.w = 
+          tile_rect.siz.w =
             ojph_min(x1, sz.get_image_extent().x) - tile_rect.org.x;
 
           recon_tile_rect.org.x = ojph_max(ojph_div_ceil(x0, ds),
@@ -178,8 +189,6 @@ namespace ojph {
       for (ui32 r = 0; r <= num_decomps; ++r)
       {
         size log_PP = cod.get_log_precinct_size(r);
-        log_PP.w -= (r ? 1 : 0);
-        log_PP.h -= (r ? 1 : 0);
         ratio.w = ojph_max(ratio.w, log_PP.w - ojph_min(log_cb.w, log_PP.w));
         ratio.h = ojph_max(ratio.h, log_PP.h - ojph_min(log_cb.h, log_PP.h));
       }
@@ -190,9 +199,9 @@ namespace ojph {
       // (rounding up leaves one extra entry).
       // This exta entry is necessary
       // We need 4 such tables. These tables store
-      // 1. missing msbs and 2. their flags, 
+      // 1. missing msbs and 2. their flags,
       // 3. number of layers and 4. their flags
-      precinct_scratch_needed_bytes = 
+      precinct_scratch_needed_bytes =
         4 * ((max_ratio * max_ratio * 4 + 2) / 3);
 
       allocator->pre_alloc_obj<ui8>(precinct_scratch_needed_bytes);
@@ -204,17 +213,16 @@ namespace ojph {
       allocator->alloc();
 
       //precinct scratch buffer
-      precinct_scratch = 
+      precinct_scratch =
         allocator->post_alloc_obj<ui8>(precinct_scratch_needed_bytes);
 
       //get tiles
-      tiles = this->allocator->post_alloc_obj<tile>(num_tiles.area());
+      tiles = this->allocator->post_alloc_obj<tile>((size_t)num_tiles.area());
 
       ui32 num_tileparts = 0;
       point index;
-      rect tile_rect, recon_tile_rect;
+      rect tile_rect;
       ojph::param_siz sz = access_siz();
-      ui32 ds = 1 << skipped_res_for_recon;
       for (index.y = 0; index.y < num_tiles.h; ++index.y)
       {
         ui32 y0 = sz.get_tile_offset().y
@@ -222,15 +230,9 @@ namespace ojph {
         ui32 y1 = y0 + sz.get_tile_size().h; //end of tile
 
         tile_rect.org.y = ojph_max(y0, sz.get_image_offset().y);
-        tile_rect.siz.h = 
+        tile_rect.siz.h =
           ojph_min(y1, sz.get_image_extent().y) - tile_rect.org.y;
 
-        recon_tile_rect.org.y = ojph_max(ojph_div_ceil(y0, ds), 
-          ojph_div_ceil(sz.get_image_offset().y, ds));
-        recon_tile_rect.siz.h = ojph_min(ojph_div_ceil(y1, ds),
-          ojph_div_ceil(sz.get_image_extent().y, ds))
-          - recon_tile_rect.org.y;
-
         ui32 offset = 0;
         for (index.x = 0; index.x < num_tiles.w; ++index.x)
         {
@@ -239,20 +241,12 @@ namespace ojph {
           ui32 x1 = x0 + sz.get_tile_size().w;
 
           tile_rect.org.x = ojph_max(x0, sz.get_image_offset().x);
-          tile_rect.siz.w = 
+          tile_rect.siz.w =
             ojph_min(x1, sz.get_image_extent().x) - tile_rect.org.x;
 
-          recon_tile_rect.org.x = ojph_max(ojph_div_ceil(x0, ds),
-            ojph_div_ceil(sz.get_image_offset().x, ds));
-          recon_tile_rect.siz.w = ojph_min(ojph_div_ceil(x1, ds),
-            ojph_div_ceil(sz.get_image_extent().x, ds))
-            - recon_tile_rect.org.x;
-
           ui32 tps = 0; // number of tileparts for this tile
           ui32 idx = index.y * num_tiles.w + index.x;
-          tiles[idx].finalize_alloc(this, tile_rect, recon_tile_rect,
-            idx, offset, tps);
-          offset += recon_tile_rect.siz.w;
+          tiles[idx].finalize_alloc(this, tile_rect, idx, offset, tps);
           num_tileparts += tps;
         }
       }
@@ -272,7 +266,7 @@ namespace ojph {
         ui32 cw = siz.get_recon_width(i);
         recon_comp_size[i].w = cw;
         recon_comp_size[i].h = siz.get_recon_height(i);
-        lines[i].wrap(allocator->post_alloc_data<si32>(cw, 0), cw, 0);        
+        lines[i].wrap(allocator->post_alloc_data<si32>(cw, 0), cw, 0);
       }
 
       cur_comp = 0;
@@ -541,34 +535,46 @@ namespace ojph {
       if (tilepart_div != OJPH_TILEPART_COMPONENTS)
       {
         tilepart_div = OJPH_TILEPART_COMPONENTS;
-        OJPH_WARN(0x000300B1, 
+        OJPH_WARN(0x000300B1,
           "In BROADCAST profile, tile part divisions at the component level "
           "must be employed, while at the resolution level is not allowed. "
           "This has been corrected.");
-      }    
+      }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void codestream::write_headers(outfile_base *file, 
+    void codestream::write_headers(outfile_base *file,
                                    const comment_exchange* comments,
                                    ui32 num_comments)
     {
       //finalize
+      siz.set_cod(cod);
+      // set the tile size if it was not set by the user
+      size tile_size = siz.get_tile_size();
+      if (tile_size.h == 0 && tile_size.w == 0)
+      {
+        point img_offset = siz.get_image_offset();
+        point img_extent = siz.get_image_extent();
+        size t(img_extent.x + img_offset.x, img_extent.y + img_offset.y);
+        siz.set_tile_size(t);
+      }
       siz.check_validity();
       cod.check_validity(siz);
+      cod.update_atk(&atk);
       qcd.check_validity(siz, cod);
       cap.check_validity(cod, qcd);
+      nlt.check_validity(siz);
       if (profile == OJPH_PN_IMF)
         check_imf_validity();
       else if (profile == OJPH_PN_BROADCAST)
         check_broadcast_validity();
 
       int po = ojph::param_cod(&cod).get_progression_order();
-      if ((po == OJPH_PO_LRCP || po == OJPH_PO_RLCP) && 
+      if ((po == OJPH_PO_LRCP || po == OJPH_PO_RLCP) &&
            tilepart_div == OJPH_TILEPART_COMPONENTS)
       {
         tilepart_div |= OJPH_TILEPART_RESOLUTIONS;
-        OJPH_INFO(0x00030021, 
+        OJPH_INFO(0x00030021,
           "For LRCP and RLCP progression orders, tilepart divisions at the "
           "component level, means that we have a tilepart for every "
           "resolution and component.\n");
@@ -637,9 +643,18 @@ namespace ojph {
       if (!cod.write(file))
         OJPH_ERROR(0x00030025, "Error writing to file");
 
+      if (!cod.write_coc(file, num_comps))
+        OJPH_ERROR(0x0003002E, "Error writing to file");
+
       if (!qcd.write(file))
         OJPH_ERROR(0x00030026, "Error writing to file");
 
+      if (!qcd.write_qcc(file, num_comps))
+        OJPH_ERROR(0x0003002D, "Error writing to file");
+
+      if (!nlt.write(file))
+        OJPH_ERROR(0x00030027, "Error writing to file");
+
       char buf[] = "      OpenJPH Ver "
         OJPH_INT_TO_STRING(OPENJPH_VERSION_MAJOR) "."
         OJPH_INT_TO_STRING(OPENJPH_VERSION_MINOR) "."
@@ -648,25 +663,25 @@ namespace ojph {
       *(ui16*)buf = swap_byte(JP2K_MARKER::COM);
       *(ui16*)(buf + 2) = swap_byte((ui16)(len - 2));
       //1 for General use (IS 8859-15:1999 (Latin) values)
-      *(ui16*)(buf + 4) = swap_byte((ui16)(1)); 
+      *(ui16*)(buf + 4) = swap_byte((ui16)(1));
       if (file->write(buf, len) != len)
-        OJPH_ERROR(0x00030027, "Error writing to file");
+        OJPH_ERROR(0x00030028, "Error writing to file");
 
       if (comments != NULL) {
         for (ui32 i = 0; i < num_comments; ++i)
         {
           t = swap_byte(JP2K_MARKER::COM);
           if (file->write(&t, 2) != 2)
-            OJPH_ERROR(0x00030028, "Error writing to file");
+            OJPH_ERROR(0x00030029, "Error writing to file");
           t = swap_byte((ui16)(comments[i].len + 4));
           if (file->write(&t, 2) != 2)
-            OJPH_ERROR(0x00030029, "Error writing to file");
+            OJPH_ERROR(0x0003002A, "Error writing to file");
           //1 for General use (IS 8859-15:1999 (Latin) values)
           t = swap_byte(comments[i].Rcom);
           if (file->write(&t, 2) != 2)
-            OJPH_ERROR(0x0003002A, "Error writing to file");
-          if (file->write(comments[i].data, comments[i].len)!=comments[i].len)
             OJPH_ERROR(0x0003002B, "Error writing to file");
+          if (file->write(comments[i].data, comments[i].len)!=comments[i].len)
+            OJPH_ERROR(0x0003002C, "Error writing to file");
         }
       }
     }
@@ -713,21 +728,21 @@ namespace ojph {
       }
       com_len = swap_byte(com_len);
       file->seek(com_len - 2, infile_base::OJPH_SEEK_CUR);
-      if (msg != NULL && msg_level != OJPH_MSG_LEVEL::NO_MSG)
+      if (msg != NULL && msg_level != OJPH_MSG_NO_MSG)
       {
-        if (msg_level == OJPH_MSG_LEVEL::INFO)
+        if (msg_level == OJPH_MSG_INFO)
         {
-          OJPH_INFO(0x00030001, "%s\n", msg);
+          OJPH_INFO(0x00030001, "%s", msg);
         }
-        else if (msg_level == OJPH_MSG_LEVEL::WARN)
+        else if (msg_level == OJPH_MSG_WARN)
         {
-          OJPH_WARN(0x00030001, "%s\n", msg);
+          OJPH_WARN(0x00030001, "%s", msg);
         }
-        else if (msg_level == OJPH_MSG_LEVEL::ERROR)
+        else if (msg_level == OJPH_MSG_ERROR)
         {
-          OJPH_ERROR(0x00030001, "%s\n", msg);
+          OJPH_ERROR(0x00030001, "%s", msg);
         }
-        else
+        else // there is the option of ALL_MSG but it should not be used here
           assert(0);
       }
       return 0;
@@ -736,8 +751,8 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void codestream::read_headers(infile_base *file)
     {
-      ui16 marker_list[17] = { SOC, SIZ, CAP, PRF, CPF, COD, COC, QCD, QCC,
-        RGN, POC, PPM, TLM, PLM, CRG, COM, SOT };
+      ui16 marker_list[20] = { SOC, SIZ, CAP, PRF, CPF, COD, COC, QCD, QCC,
+        RGN, POC, PPM, TLM, PLM, CRG, COM, DFS, ATK, NLT, SOT };
       find_marker(file, marker_list, 1); //find SOC
       find_marker(file, marker_list + 1, 1); //find SIZ
       siz.read(file);
@@ -745,18 +760,19 @@ namespace ojph {
       int received_markers = 0; //check that COD, & QCD received
       while (true)
       {
-        marker_idx = find_marker(file, marker_list + 2, 15);
+        marker_idx = find_marker(file, marker_list + 2, 18);
         if (marker_idx == 0)
           cap.read(file);
         else if (marker_idx == 1)
           //Skipping PRF marker segment; this should not cause any issues
-          skip_marker(file, "PRF", NULL, OJPH_MSG_LEVEL::NO_MSG, false);
+          skip_marker(file, "PRF", NULL, OJPH_MSG_NO_MSG, false);
         else if (marker_idx == 2)
           //Skipping CPF marker segment; this should not cause any issues
-          skip_marker(file, "CPF", NULL, OJPH_MSG_LEVEL::NO_MSG, false);
+          skip_marker(file, "CPF", NULL, OJPH_MSG_NO_MSG, false);
         else if (marker_idx == 3)
-        { 
-          cod.read(file); received_markers |= 1; 
+        {
+          cod.read(file);
+          received_markers |= 1;
           ojph::param_cod c(&cod);
           int num_qlayers = c.get_num_layers();
           if (num_qlayers != 1)
@@ -765,51 +781,80 @@ namespace ojph {
               num_qlayers);
         }
         else if (marker_idx == 4)
-          skip_marker(file, "COC", "COC is not supported yet",
-            OJPH_MSG_LEVEL::WARN, false);
+        { // COC marker segment: read it, then validate its component index
+          param_cod* p = cod.add_coc_object(param_cod::OJPH_COD_UNKNOWN);
+          p->read_coc(file, siz.get_num_components(), &cod);
+          if (p->get_comp_idx() >= siz.get_num_components()) // out of range
+            OJPH_INFO(0x00030056, "The codestream carries a COC marker "
+              "segment for a component indexed by %d, which is more than the "
+              "allowed index number, since the codestream has %d components",
+              p->get_comp_idx(), siz.get_num_components());
+          param_cod *q = cod.get_coc(p->get_comp_idx());
+          if (p != q && p->get_comp_idx() == q->get_comp_idx()) // duplicate
+            OJPH_ERROR(0x00030057, "The codestream has two COC marker "
+              "segments for one component of index %d",  p->get_comp_idx());
+        }
         else if (marker_idx == 5)
-        { qcd.read(file); received_markers |= 2; }
+        {
+          qcd.read(file);
+          received_markers |= 2;
+        }
         else if (marker_idx == 6)
-          {
-            ui32 num_comps = siz.get_num_components();
-            if (qcc == qcc_store && 
-                num_comps * sizeof(param_qcc) > sizeof(qcc_store))
-            {
-              qcc = new param_qcc[num_comps];
-            }
-            qcc[used_qcc_fields++].read(file, num_comps);
-          }
+        { // QCC marker segment: read it, then validate its component index
+          param_qcd* p = qcd.add_qcc_object(param_qcd::OJPH_QCD_UNKNOWN);
+          p->read_qcc(file, siz.get_num_components());
+          if (p->get_comp_idx() >= siz.get_num_components()) // out of range
+            OJPH_ERROR(0x00030054, "The codestream carries a QCC marker "
+              "segment for a component indexed by %d, which is more than the "
+              "allowed index number, since the codestream has %d components",
+              p->get_comp_idx(), siz.get_num_components());
+          param_qcd *q = qcd.get_qcc(p->get_comp_idx());
+          if (p != q && p->get_comp_idx() == q->get_comp_idx()) // duplicate
+            OJPH_ERROR(0x00030055, "The codestream has two QCC marker "
+              "segments for one component of index %d", p->get_comp_idx());
+        }
         else if (marker_idx == 7)
           skip_marker(file, "RGN", "RGN is not supported yet",
-            OJPH_MSG_LEVEL::WARN, false);
+            OJPH_MSG_WARN, false);
         else if (marker_idx == 8)
           skip_marker(file, "POC", "POC is not supported yet",
-            OJPH_MSG_LEVEL::WARN, false);
+            OJPH_MSG_WARN, false);
         else if (marker_idx == 9)
           skip_marker(file, "PPM", "PPM is not supported yet",
-            OJPH_MSG_LEVEL::WARN, false);
+            OJPH_MSG_WARN, false);
         else if (marker_idx == 10)
           //Skipping TLM marker segment; this should not cause any issues
-          skip_marker(file, "TLM", NULL, OJPH_MSG_LEVEL::NO_MSG, false);
+          skip_marker(file, "TLM", NULL, OJPH_MSG_NO_MSG, false);
         else if (marker_idx == 11)
           //Skipping PLM marker segment; this should not cause any issues
-          skip_marker(file, "PLM", NULL, OJPH_MSG_LEVEL::NO_MSG, false);
+          skip_marker(file, "PLM", NULL, OJPH_MSG_NO_MSG, false);
         else if (marker_idx == 12)
           //Skipping CRG marker segment;
           skip_marker(file, "CRG", "CRG has been ignored; CRG is related to"
             " where the Cb and Cr colour components are co-sited or located"
             " with respect to the Y' luma component. Perhaps, it is better"
-            " to get the indivdual components and assemble the samples"
+            " to get the individual components and assemble the samples"
             " according to your needs",
-            OJPH_MSG_LEVEL::INFO, false);
+            OJPH_MSG_INFO, false);
         else if (marker_idx == 13)
-          skip_marker(file, "COM", NULL, OJPH_MSG_LEVEL::NO_MSG, false);
+          skip_marker(file, "COM", NULL, OJPH_MSG_NO_MSG, false);
         else if (marker_idx == 14)
+          dfs.read(file);
+        else if (marker_idx == 15)
+          atk.read(file);
+        else if (marker_idx == 16)
+          nlt.read(file);
+        else if (marker_idx == 17)
           break;
         else
           OJPH_ERROR(0x00030051, "File ended before finding a tile segment");
       }
 
+      cod.update_atk(&atk);
+      siz.link(&cod);
+      if (dfs.exists())
+        siz.link(&dfs);
+
       if (received_markers != 3)
         OJPH_ERROR(0x00030052, "markers error, COD and QCD are required");
 
@@ -824,7 +869,7 @@ namespace ojph {
       if (skipped_res_for_read < skipped_res_for_recon)
         OJPH_ERROR(0x000300A1,
           "skipped_resolution for data %d must be equal or smaller than "
-          " skipped_resolution for reconstruction %d\n", 
+          " skipped_resolution for reconstruction %d\n",
           skipped_res_for_read, skipped_res_for_recon);
       if (skipped_res_for_read > cod.get_num_decompositions())
         OJPH_ERROR(0x000300A2,
@@ -858,166 +903,181 @@ namespace ojph {
         if (sot.read(infile, resilient))
         {
           ui64 tile_start_location = (ui64)infile->tell();
+          bool skip_tile = false;
 
-          if (sot.get_tile_index() > (int)num_tiles.area())
+          if (sot.get_tile_index() >= (int)num_tiles.area())
           {
-            if (resilient)
+            if (resilient) {
               OJPH_INFO(0x00030061, "wrong tile index")
+              skip_tile = true; // skip the faulty tile
+            }
             else
               OJPH_ERROR(0x00030061, "wrong tile index")
           }
 
-          if (sot.get_tile_part_index())
-          { //tile part
-            if (sot.get_num_tile_parts() &&
-              sot.get_tile_part_index() >= sot.get_num_tile_parts())
-            {
-              if (resilient)
-                OJPH_INFO(0x00030062,
-                  "error in tile part number, should be smaller than total"
-                  " number of tile parts")
-              else
-                OJPH_ERROR(0x00030062,
-                  "error in tile part number, should be smaller than total"
-                  " number of tile parts")
-            }
-
-            bool sod_found = false;
-            ui16 other_tile_part_markers[6] = { SOT, POC, PPT, PLT, COM, SOD };
-            while (true)
-            {
-              int marker_idx = 0;
-              int result = 0;
-              marker_idx = find_marker(infile, other_tile_part_markers + 1, 5);
-              if (marker_idx == 0)
-                result = skip_marker(infile, "POC",
-                  "POC in a tile is not supported yet",
-                  OJPH_MSG_LEVEL::WARN, resilient);
-              else if (marker_idx == 1)
-                result = skip_marker(infile, "PPT",
-                  "PPT in a tile is not supported yet",
-                  OJPH_MSG_LEVEL::WARN, resilient);
-              else if (marker_idx == 2)
-                //Skipping PLT marker segment;this should not cause any issues
-                result = skip_marker(infile, "PLT", NULL,
-                  OJPH_MSG_LEVEL::NO_MSG, resilient);
-              else if (marker_idx == 3)
-                result = skip_marker(infile, "COM", NULL,
-                  OJPH_MSG_LEVEL::NO_MSG, resilient);
-              else if (marker_idx == 4)
-              {
-                sod_found = true;
-                break;
-              }
-
-              if (marker_idx == -1) //marker not found
-              {
-                if (resilient)
-                  OJPH_INFO(0x00030063,
-                    "File terminated early before start of data is found"
-                    " for tile indexed %d and tile part %d",
-                    sot.get_tile_index(), sot.get_tile_part_index())
-                else
-                  OJPH_ERROR(0x00030063,
-                    "File terminated early before start of data is found"
-                    " for tile indexed %d and tile part %d",
-                    sot.get_tile_index(), sot.get_tile_part_index())
-                break;
-              }
-              if (result == -1) //file terminated during marker seg. skipping
+          if (!skip_tile)
+          {
+            if (sot.get_tile_part_index())
+            { //tile part
+              if (sot.get_num_tile_parts() &&
+                sot.get_tile_part_index() >= sot.get_num_tile_parts())
               {
                 if (resilient)
-                  OJPH_INFO(0x00030064,
-                    "File terminated during marker segment skipping")
+                  OJPH_INFO(0x00030062,
+                    "error in tile part number, should be smaller than total"
+                    " number of tile parts")
                 else
-                  OJPH_ERROR(0x00030064,
-                    "File terminated during marker segment skipping")
-                break;
-              }
-            }
-            if (sod_found)
-              tiles[sot.get_tile_index()].parse_tile_header(sot, infile,
-                tile_start_location);
-          }
-          else
-          { //first tile part
-            bool sod_found = false;
-            ui16 first_tile_part_markers[11] = { SOT, COD, COC, QCD, QCC, RGN,
-              POC, PPT, PLT, COM, SOD };
-            while (true)
-            {
-              int marker_idx = 0;
-              int result = 0;
-              marker_idx = find_marker(infile, first_tile_part_markers+1, 10);
-              if (marker_idx == 0)
-                result = skip_marker(infile, "COD",
-                  "COD in a tile is not supported yet",
-                  OJPH_MSG_LEVEL::WARN, resilient);
-              else if (marker_idx == 1)
-                result = skip_marker(infile, "COC",
-                  "COC in a tile is not supported yet",
-                  OJPH_MSG_LEVEL::WARN, resilient);
-              else if (marker_idx == 2)
-                result = skip_marker(infile, "QCD",
-                  "QCD in a tile is not supported yet",
-                  OJPH_MSG_LEVEL::WARN, resilient);
-              else if (marker_idx == 3)
-                result = skip_marker(infile, "QCC",
-                  "QCC in a tile is not supported yet",
-                  OJPH_MSG_LEVEL::WARN, resilient);
-              else if (marker_idx == 4)
-                result = skip_marker(infile, "RGN",
-                  "RGN in a tile is not supported yet",
-                  OJPH_MSG_LEVEL::WARN, resilient);
-              else if (marker_idx == 5)
-                result = skip_marker(infile, "POC",
-                  "POC in a tile is not supported yet",
-                  OJPH_MSG_LEVEL::WARN, resilient);
-              else if (marker_idx == 6)
-                result = skip_marker(infile, "PPT",
-                  "PPT in a tile is not supported yet",
-                  OJPH_MSG_LEVEL::WARN, resilient);
-              else if (marker_idx == 7)
-                //Skipping PLT marker segment;this should not cause any issues
-                result = skip_marker(infile, "PLT", NULL,
-                  OJPH_MSG_LEVEL::NO_MSG, resilient);
-              else if (marker_idx == 8)
-                result = skip_marker(infile, "COM", NULL,
-                  OJPH_MSG_LEVEL::NO_MSG, resilient);
-              else if (marker_idx == 9)
-              {
-                sod_found = true;
-                break;
+                  OJPH_ERROR(0x00030062,
+                    "error in tile part number, should be smaller than total"
+                    " number of tile parts")
               }
 
-              if (marker_idx == -1) //marker not found
+              bool sod_found = false;
+              ui16 other_tile_part_markers[7] = { SOT, POC, PPT, PLT, COM,
+                NLT, SOD };
+              while (true)
               {
-                if (resilient)
-                  OJPH_INFO(0x00030065,
-                    "File terminated early before start of data is found"
-                    " for tile indexed %d and tile part %d",
-                    sot.get_tile_index(), sot.get_tile_part_index())
-                else
-                  OJPH_ERROR(0x00030065,
-                    "File terminated early before start of data is found"
-                    " for tile indexed %d and tile part %d",
-                    sot.get_tile_index(), sot.get_tile_part_index())
-                break;
+                int marker_idx = 0;
+                int result = 0;
+                marker_idx = find_marker(infile, other_tile_part_markers+1, 6);
+                if (marker_idx == 0)
+                  result = skip_marker(infile, "POC",
+                    "POC marker segment in a tile is not supported yet",
+                    OJPH_MSG_WARN, resilient);
+                else if (marker_idx == 1)
+                  result = skip_marker(infile, "PPT",
+                    "PPT marker segment in a tile is not supported yet",
+                    OJPH_MSG_WARN, resilient);
+                else if (marker_idx == 2)
+                  //Skipping PLT marker segment;this should not cause any issues
+                  result = skip_marker(infile, "PLT", NULL,
+                    OJPH_MSG_NO_MSG, resilient);
+                else if (marker_idx == 3)
+                  result = skip_marker(infile, "COM", NULL,
+                    OJPH_MSG_NO_MSG, resilient);
+                else if (marker_idx == 4)
+                  result = skip_marker(infile, "NLT",
+                    "NLT marker in tile is not supported yet",
+                    OJPH_MSG_WARN, resilient);
+                else if (marker_idx == 5)
+                {
+                  sod_found = true;
+                  break;
+                }
+
+                if (marker_idx == -1) //marker not found
+                {
+                  if (resilient)
+                    OJPH_INFO(0x00030063,
+                      "File terminated early before start of data is found"
+                      " for tile indexed %d and tile part %d",
+                      sot.get_tile_index(), sot.get_tile_part_index())
+                  else
+                    OJPH_ERROR(0x00030063,
+                      "File terminated early before start of data is found"
+                      " for tile indexed %d and tile part %d",
+                      sot.get_tile_index(), sot.get_tile_part_index())
+                  break;
+                }
+                if (result == -1) //file terminated during marker seg. skipping
+                {
+                  if (resilient)
+                    OJPH_INFO(0x00030064,
+                      "File terminated during marker segment skipping")
+                  else
+                    OJPH_ERROR(0x00030064,
+                      "File terminated during marker segment skipping")
+                  break;
+                }
               }
-              if (result == -1) //file terminated during marker seg. skipping
+              if (sod_found)
+                tiles[sot.get_tile_index()].parse_tile_header(sot, infile,
+                  tile_start_location);
+            }
+            else
+            { //first tile part
+              bool sod_found = false;
+              ui16 first_tile_part_markers[12] = { SOT, COD, COC, QCD, QCC, RGN,
+                POC, PPT, PLT, COM, NLT, SOD };
+              while (true)
               {
-                if (resilient)
-                  OJPH_INFO(0x00030066,
-                    "File terminated during marker segment skipping")
-                else
-                  OJPH_ERROR(0x00030066,
-                    "File terminated during marker segment skipping")
-                break;
+                int marker_idx = 0;
+                int result = 0;
+                marker_idx = find_marker(infile, first_tile_part_markers+1, 11);
+                if (marker_idx == 0)
+                  result = skip_marker(infile, "COD",
+                    "COD marker segment in a tile is not supported yet",
+                    OJPH_MSG_WARN, resilient);
+                else if (marker_idx == 1)
+                  result = skip_marker(infile, "COC",
+                    "COC marker segment in a tile is not supported yet",
+                    OJPH_MSG_WARN, resilient);
+                else if (marker_idx == 2)
+                  result = skip_marker(infile, "QCD",
+                    "QCD marker segment in a tile is not supported yet",
+                    OJPH_MSG_WARN, resilient);
+                else if (marker_idx == 3)
+                  result = skip_marker(infile, "QCC",
+                    "QCC marker segment in a tile is not supported yet",
+                    OJPH_MSG_WARN, resilient);
+                else if (marker_idx == 4)
+                  result = skip_marker(infile, "RGN",
+                    "RGN marker segment in a tile is not supported yet",
+                    OJPH_MSG_WARN, resilient);
+                else if (marker_idx == 5)
+                  result = skip_marker(infile, "POC",
+                    "POC marker segment in a tile is not supported yet",
+                    OJPH_MSG_WARN, resilient);
+                else if (marker_idx == 6)
+                  result = skip_marker(infile, "PPT",
+                    "PPT marker segment in a tile is not supported yet",
+                    OJPH_MSG_WARN, resilient);
+                else if (marker_idx == 7)
+                  //Skipping PLT marker segment;this should not cause any issues
+                  result = skip_marker(infile, "PLT", NULL,
+                    OJPH_MSG_NO_MSG, resilient);
+                else if (marker_idx == 8)
+                  result = skip_marker(infile, "COM", NULL,
+                    OJPH_MSG_NO_MSG, resilient);
+                else if (marker_idx == 9) // NLT: skipped with a warning
+                  result = skip_marker(infile, "NLT",
+                    "NLT marker segment in a tile is not supported yet",
+                    OJPH_MSG_WARN, resilient);
+                else if (marker_idx == 10)
+                {
+                  sod_found = true;
+                  break;
+                }
+
+                if (marker_idx == -1) //marker not found
+                {
+                  if (resilient)
+                    OJPH_INFO(0x00030065,
+                      "File terminated early before start of data is found"
+                      " for tile indexed %d and tile part %d",
+                      sot.get_tile_index(), sot.get_tile_part_index())
+                  else
+                    OJPH_ERROR(0x00030065,
+                      "File terminated early before start of data is found"
+                      " for tile indexed %d and tile part %d",
+                      sot.get_tile_index(), sot.get_tile_part_index())
+                  break;
+                }
+                if (result == -1) //file terminated during marker seg. skipping
+                {
+                  if (resilient)
+                    OJPH_INFO(0x00030066,
+                      "File terminated during marker segment skipping")
+                  else
+                    OJPH_ERROR(0x00030066,
+                      "File terminated during marker segment skipping")
+                  break;
+                }
               }
+              if (sod_found)
+                tiles[sot.get_tile_index()].parse_tile_header(sot, infile,
+                  tile_start_location);
             }
-            if (sod_found)
-              tiles[sot.get_tile_index()].parse_tile_header(sot, infile,
-                tile_start_location);
           }
         }
 
@@ -1058,7 +1118,7 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void codestream::set_tilepart_divisions(ui32 value)
     {
-      tilepart_div = value;      
+      tilepart_div = value;
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -1195,4 +1255,4 @@ namespace ojph {
     }
 
   }
-}
\ No newline at end of file
+}
diff --git a/src/core/codestream/ojph_codestream_local.h b/src/core/codestream/ojph_codestream_local.h
index 5e0bbfaf..2776f602 100644
--- a/src/core/codestream/ojph_codestream_local.h
+++ b/src/core/codestream/ojph_codestream_local.h
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -46,7 +46,7 @@ namespace ojph {
 
   ////////////////////////////////////////////////////////////////////////////
   //defined elsewhere
-  struct line_buf;
+  class line_buf;
   class mem_fixed_allocator;
   class mem_elastic_allocator;
   class codestream;
@@ -68,30 +68,32 @@ namespace ojph {
     class codestream
     {
       friend ::ojph::codestream;
-      
+
     public:
       codestream();
       ~codestream();
 
+      void restart();
+
       void pre_alloc();
       void finalize_alloc();
 
-      ojph::param_siz access_siz()            //return externally wrapped siz
+      ojph::param_siz access_siz()            // returns externally wrapped siz
       { return ojph::param_siz(&siz); }
-      const param_siz* get_siz() //return internal siz
+      const param_siz* get_siz()              // returns internal siz
       { return &siz; }
-      ojph::param_cod access_cod()            //return externally wrapped cod
+      ojph::param_cod access_cod()            // returns externally wrapped cod
       { return ojph::param_cod(&cod); }
-      const param_cod* get_cod() //return internal code
+      const param_cod* get_cod()              // returns internal cod
       { return &cod; }
-      param_qcd* access_qcd(ui32 comp_num)
-      { 
-        if (used_qcc_fields > 0)
-          for (int v = 0; v < used_qcc_fields; ++v)
-            if (qcc[v].get_comp_num() == comp_num)
-              return qcc + v;
-        return &qcd; 
-      }
+      const param_cod* get_coc(ui32 comp_num) // returns internal cod
+      { return cod.get_coc(comp_num); }
+      const param_qcd* access_qcd()
+      { return &qcd; }
+      const param_dfs* access_dfs()
+      { if (dfs.exists()) return &dfs; else return NULL; }
+      const param_nlt* get_nlt()
+      { return &nlt; }
       mem_fixed_allocator* get_allocator() { return allocator; }
       mem_elastic_allocator* get_elastic_alloc() { return elastic_alloc; }
       outfile_base* get_file() { return outfile; }
@@ -148,20 +150,20 @@ namespace ojph {
       bool employ_color_transform;
       int planar;
       int profile;
-      ui32 tilepart_div;    // tilepart division value
-      bool need_tlm;       // true if tlm markers are needed
-      
+      ui32 tilepart_div;     // tilepart division value
+      bool need_tlm;         // true if tlm markers are needed
+
     private:
-      param_siz siz;
-      param_cod cod;
-      param_cap cap;
-      param_qcd qcd;
-      param_tlm tlm;
-
-    private: // this is to handle qcc
-      int used_qcc_fields;
-      param_qcc qcc_store[4], *qcc; // we allocate 4, 
-                                    // if not enough, we allocate more
+      param_siz siz;         // image and tile size
+      param_cod cod;         // coding style default
+      param_cap cap;         // extended capabilities
+      param_qcd qcd;         // quantization default
+      param_tlm tlm;         // tile-part lengths
+      param_nlt nlt;         // non-linearity point transformation
+
+    private:  // these are from Part 2 of the standard
+      param_dfs dfs;         // downsampling factor styles
+      param_atk atk;         // wavelet structure and coefficients
 
     private:
       mem_fixed_allocator *allocator;
diff --git a/src/core/codestream/ojph_codestream_sse.cpp b/src/core/codestream/ojph_codestream_sse.cpp
index 7c64ad93..a4b8d783 100644
--- a/src/core/codestream/ojph_codestream_sse.cpp
+++ b/src/core/codestream/ojph_codestream_sse.cpp
@@ -35,6 +35,9 @@
 // Date: 15 May 2022
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
+
 #include <immintrin.h>
 #include "ojph_defs.h"
 
@@ -49,6 +52,7 @@ namespace ojph {
       for (size_t i = 0; i < count; i += 16, p += 4)
         _mm_storeu_ps(p, zero);
     }
-
   }
-}
\ No newline at end of file
+}
+
+#endif
diff --git a/src/core/codestream/ojph_codestream_sse2.cpp b/src/core/codestream/ojph_codestream_sse2.cpp
index 9bb06434..f60a9aaf 100644
--- a/src/core/codestream/ojph_codestream_sse2.cpp
+++ b/src/core/codestream/ojph_codestream_sse2.cpp
@@ -35,6 +35,10 @@
 // Date: 15 May 2022
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
+
+#include <climits>
 #include <immintrin.h>
 #include "ojph_defs.h"
 
@@ -42,7 +46,7 @@ namespace ojph {
   namespace local {
 
     //////////////////////////////////////////////////////////////////////////
-    ui32 sse2_find_max_val(ui32* address)
+    ui32 sse2_find_max_val32(ui32* address)
     {
       __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
       x1 = _mm_shuffle_epi32(x0, 0xEE);   // x1 = x0[2,3,2,3]
@@ -59,19 +63,34 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, 
-                           float delta_inv, ui32 count, ui32* max_val)
+    ui64 sse2_find_max_val64(ui64* address) // ORs the two ui64 at address
+    {
+      __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
+      x1 = _mm_shuffle_epi32(x0, 0xEE);   // x1 = x0[2,3,2,3] (32-bit lanes)
+      x0 = _mm_or_si128(x0, x1);          // low 64 bits = lane0 | lane1
+      _mm_storeu_si128((__m128i*)address, x0); // overwrites the input
+      return *address;
+      // Despite the name, the result is the bitwise OR of the two 64-bit
+      // values that were at address, not their arithmetic maximum; it is
+      // read back through memory after the store above.
+      // NOTE(review): presumably a register move is avoided to stay
+      // buildable on 32-bit x86 (see the OJPH_ARCH_I386 guard) -- confirm.
+    }    
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, 
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(delta_inv);
 
       // convert to sign and magnitude and keep max_val      
       ui32 shift = 31 - K_max;
-      __m128i m0 = _mm_set1_epi32((int)0x80000000);
+      __m128i m0 = _mm_set1_epi32(INT_MIN);
       __m128i zero = _mm_setzero_si128();
       __m128i one = _mm_set1_epi32(1);
       __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
       __m128i *p = (__m128i*)sp;
-      for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
+      for ( ; count >= 4; count -= 4, p += 1, dp += 4)
       {
         __m128i v = _mm_loadu_si128(p);
         __m128i sign = _mm_cmplt_epi32(v, zero);
@@ -84,12 +103,31 @@ namespace ojph {
         val = _mm_or_si128(val, sign);
         _mm_storeu_si128((__m128i*)dp, val);
       }
+      if (count)
+      {
+        __m128i v = _mm_loadu_si128(p);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        __m128i val = _mm_xor_si128(v, sign); // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi32(val, ones);        // 2's complement
+        sign = _mm_and_si128(sign, m0);
+        val = _mm_slli_epi32(val, (int)shift);
+
+        __m128i c = _mm_set1_epi32((si32)count);
+        __m128i idx = _mm_set_epi32(3, 2, 1, 0);
+        __m128i mask = _mm_cmpgt_epi32(c, idx);
+        c = _mm_and_si128(val, mask);
+        tmax = _mm_or_si128(tmax, c);
+
+        val = _mm_or_si128(val, sign);
+        _mm_storeu_si128((__m128i*)dp, val);
+      }
       _mm_storeu_si128((__m128i*)max_val, tmax);
     }
                            
     //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(K_max);
 
@@ -100,7 +138,7 @@ namespace ojph {
       __m128i one = _mm_set1_epi32(1);
       __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
       float *p = (float*)sp;
-      for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4)
+      for ( ; count >= 4; count -= 4, p += 4, dp += 4)
       {
         __m128 vf = _mm_loadu_ps(p);
         vf = _mm_mul_ps(vf, d);                    // multiply
@@ -114,38 +152,58 @@ namespace ojph {
         val = _mm_or_si128(val, sign);
         _mm_storeu_si128((__m128i*)dp, val);
       }
+      if (count)
+      {
+        __m128 vf = _mm_loadu_ps(p);
+        vf = _mm_mul_ps(vf, d);                    // multiply
+        __m128i val = _mm_cvtps_epi32(vf);         // convert to int
+        __m128i sign = _mm_cmplt_epi32(val, zero); // get sign
+        val = _mm_xor_si128(val, sign);            // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi32(val, ones);            // 2's complement
+
+        __m128i c = _mm_set1_epi32((si32)count);
+        __m128i idx = _mm_set_epi32(3, 2, 1, 0);
+        __m128i mask = _mm_cmpgt_epi32(c, idx);
+        c = _mm_and_si128(val, mask);
+        tmax = _mm_or_si128(tmax, c);
+
+        sign = _mm_slli_epi32(sign, 31);
+        val = _mm_or_si128(val, sign);
+        _mm_storeu_si128((__m128i*)dp, val);
+      }
       _mm_storeu_si128((__m128i*)max_val, tmax);
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, 
-                             float delta, ui32 count)
+    void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, 
+                               float delta, ui32 count)
     {
       ojph_unused(delta);
       ui32 shift = 31 - K_max;
-      __m128i m1 = _mm_set1_epi32(0x7FFFFFFF);
+      __m128i m1 = _mm_set1_epi32(INT_MAX);
       __m128i zero = _mm_setzero_si128();
       __m128i one = _mm_set1_epi32(1);
       si32 *p = (si32*)dp;
       for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
       {
-          __m128i v = _mm_load_si128((__m128i*)sp);
-          __m128i val = _mm_and_si128(v, m1);
-          val = _mm_srli_epi32(val, (int)shift);
-          __m128i sign = _mm_cmplt_epi32(v, zero);
-          val = _mm_xor_si128(val, sign); // negate 1's complement
-          __m128i ones = _mm_and_si128(sign, one);
-          val = _mm_add_epi32(val, ones); // 2's complement
-          _mm_storeu_si128((__m128i*)p, val);
+        __m128i v = _mm_load_si128((__m128i*)sp);
+        __m128i val = _mm_and_si128(v, m1);
+        val = _mm_srli_epi32(val, (int)shift);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        val = _mm_xor_si128(val, sign); // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi32(val, ones); // 2's complement
+        _mm_storeu_si128((__m128i*)p, val);
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, 
-                             float delta, ui32 count)
+    void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, 
+                               float delta, ui32 count)
     {
       ojph_unused(K_max);
-      __m128i m1 = _mm_set1_epi32(0x7FFFFFFF);
+      __m128i m1 = _mm_set1_epi32(INT_MAX);
       __m128 d = _mm_set1_ps(delta);
       float *p = (float*)dp;
       for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
@@ -159,5 +217,79 @@ namespace ojph {
         _mm_storeu_ps(p, valf);
       }
     }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, 
+                             float delta_inv, ui32 count, ui64* max_val)
+    { // two 64-bit samples per vector -> sign-magnitude in dp
+      ojph_unused(delta_inv);
+
+      // convert to sign and magnitude and keep max_val
+      ui32 shift = 63 - K_max;
+      __m128i m0 = _mm_set1_epi64x(LLONG_MIN);  // bit 63 only
+      __m128i zero = _mm_setzero_si128();
+      __m128i one = _mm_set1_epi64x(1);
+      __m128i tmax = _mm_loadu_si128((__m128i*)max_val);  // reads 16 bytes
+      __m128i *p = (__m128i*)sp;
+      for ( ; count >= 2; count -= 2, p += 1, dp += 2)
+      {
+        __m128i v = _mm_loadu_si128(p);
+        __m128i sign = _mm_cmplt_epi32(v, zero); // SSE2 has no 64-bit compare
+        sign = _mm_shuffle_epi32(sign, 0xF5);  // sign = sign[1,1,3,3];
+        __m128i val = _mm_xor_si128(v, sign);  // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);  // 1 in negative lanes
+        val = _mm_add_epi64(val, ones);        // 2's complement
+        sign = _mm_and_si128(sign, m0);        // keep bit 63 of the mask
+        val = _mm_slli_epi64(val, (int)shift); // << (63 - K_max)
+        tmax = _mm_or_si128(tmax, val);        // OR magnitudes into the max
+        val = _mm_or_si128(val, sign);         // add the sign bit
+        _mm_storeu_si128((__m128i*)dp, val);
+      }
+      if (count)  // one sample left; a full vector is still loaded and stored
+      {
+        __m128i v = _mm_loadu_si128(p);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        sign = _mm_shuffle_epi32(sign, 0xF5);  // sign = sign[1,1,3,3];
+        __m128i val = _mm_xor_si128(v, sign);  // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi64(val, ones);        // 2's complement
+        sign = _mm_and_si128(sign, m0);
+        val = _mm_slli_epi64(val, (int)shift);
+
+        __m128i c = _mm_set_epi32(0, 0, (si32)0xFFFFFFFF, (si32)0xFFFFFFFF);
+        c = _mm_and_si128(val, c);             // low 64-bit lane only
+        tmax = _mm_or_si128(tmax, c);
+
+        val = _mm_or_si128(val, sign);
+        _mm_storeu_si128((__m128i*)dp, val);
+      }
+      _mm_storeu_si128((__m128i*)max_val, tmax);  // writes 16 bytes back
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, 
+                               float delta, ui32 count)
+    { // sign-magnitude ui64 in sp -> two's complement si64 in dp
+      ojph_unused(delta);
+      ui32 shift = 63 - K_max;
+      __m128i m1 = _mm_set1_epi64x(LLONG_MAX);  // every bit except bit 63
+      __m128i zero = _mm_setzero_si128();
+      __m128i one = _mm_set1_epi64x(1);
+      si64 *p = (si64*)dp;
+      for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
+      {
+        __m128i v = _mm_load_si128((__m128i*)sp);  // aligned load, 2 x ui64
+        __m128i val = _mm_and_si128(v, m1);        // clear the sign bit
+        val = _mm_srli_epi64(val, (int)shift);     // logical >> (63 - K_max)
+        __m128i sign = _mm_cmplt_epi32(v, zero);   // compared per 32-bit half
+        sign = _mm_shuffle_epi32(sign, 0xF5);      // sign = sign[1,1,3,3];
+        val = _mm_xor_si128(val, sign);            // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);   // 1 in negative lanes
+        val = _mm_add_epi64(val, ones);            // 2's complement
+        _mm_storeu_si128((__m128i*)p, val);        // unaligned store to dp
+      }
+    }
   }
-}
\ No newline at end of file
+}
+
+#endif
diff --git a/src/core/codestream/ojph_codestream_wasm.cpp b/src/core/codestream/ojph_codestream_wasm.cpp
index 19e47aa3..b65e35e9 100644
--- a/src/core/codestream/ojph_codestream_wasm.cpp
+++ b/src/core/codestream/ojph_codestream_wasm.cpp
@@ -35,6 +35,7 @@
 // Date: 15 May 2022
 //***************************************************************************/
 
+#include <climits>
 #include <cstddef> 
 #include <wasm_simd128.h>
 
@@ -43,20 +44,17 @@
 namespace ojph {
   namespace local {
 
-    //////////////////////////////////////////////////////////////////////////
-  #define REPEAT(a) a,a,a,a
-
     //////////////////////////////////////////////////////////////////////////
     void wasm_mem_clear(void* addr, size_t count)
     {
       float* p = (float*)addr;
-      v128_t zero = wasm_i32x4_const(REPEAT(0));
+      v128_t zero = wasm_i32x4_splat(0);  // 16 zero bytes, stored once per pass
       for (size_t i = 0; i < count; i += 16, p += 4)
         wasm_v128_store(p, zero);
     }
 
     //////////////////////////////////////////////////////////////////////////
-    ui32 wasm_find_max_val(ui32* address)
+    ui32 wasm_find_max_val32(ui32* address)
     {
       v128_t x1, x0 = wasm_v128_load(address);
       x1 = wasm_i32x4_shuffle(x0, x0, 2, 3, 2, 3);   // x1 = x0[2,3,2,3]
@@ -68,19 +66,29 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, 
-                           float delta_inv, ui32 count, ui32* max_val)
+    ui64 wasm_find_max_val64(ui64* address)
+    { // returns the bitwise OR of the two 64-bit lanes stored at address
+      v128_t x1, x0 = wasm_v128_load(address);
+      x1 = wasm_i64x2_shuffle(x0, x0, 1, 1);   // x1 = x0[1,1]
+      x0 = wasm_v128_or(x0, x1);
+      ui64 t = (ui64)wasm_i64x2_extract_lane(x0, 0);
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, 
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(delta_inv);
 
       // convert to sign and magnitude and keep max_val      
       ui32 shift = 31 - K_max;
-      v128_t m0 = wasm_i32x4_const(REPEAT((int)0x80000000));
-      v128_t zero = wasm_i32x4_const(REPEAT(0));
-      v128_t one = wasm_i32x4_const(REPEAT(1));
+      v128_t m0 = wasm_i32x4_splat(INT_MIN);
+      v128_t zero = wasm_i32x4_splat(0);
+      v128_t one = wasm_i32x4_splat(1);
       v128_t tmax = wasm_v128_load(max_val);
-      v128_t *p = (v128_t*)sp;
-      for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
+      si32 *p = (si32*)sp;
+      for ( ; count >= 4; count -= 4, p += 4, dp += 4)
       {
         v128_t v = wasm_v128_load(p);
         v128_t sign = wasm_i32x4_lt(v, zero);
@@ -93,23 +101,42 @@ namespace ojph {
         val = wasm_v128_or(val, sign);
         wasm_v128_store(dp, val);
       }
+      if (count)
+      {
+        v128_t v = wasm_v128_load(p);
+        v128_t sign = wasm_i32x4_lt(v, zero);
+        v128_t val = wasm_v128_xor(v, sign); // negate 1's complement
+        v128_t ones = wasm_v128_and(sign, one);
+        val = wasm_i32x4_add(val, ones);     // 2's complement
+        sign = wasm_v128_and(sign, m0);
+        val = wasm_i32x4_shl(val, shift);
+
+        v128_t c = wasm_i32x4_splat((si32)count);
+        v128_t idx = wasm_i32x4_make(0, 1, 2, 3);
+        v128_t mask = wasm_i32x4_gt(c, idx);
+        c = wasm_v128_and(val, mask);
+        tmax = wasm_v128_or(tmax, c);
+
+        val = wasm_v128_or(val, sign);
+        wasm_v128_store(dp, val);
+      }
       wasm_v128_store(max_val, tmax);
     }
                            
     //////////////////////////////////////////////////////////////////////////
-    void wasm_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    void wasm_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(K_max);
 
       //quantize and convert to sign and magnitude and keep max_val
 
       v128_t d = wasm_f32x4_splat(delta_inv);
-      v128_t zero = wasm_i32x4_const(REPEAT(0));
-      v128_t one = wasm_i32x4_const(REPEAT(1));
+      v128_t zero = wasm_i32x4_splat(0);
+      v128_t one = wasm_i32x4_splat(1);
       v128_t tmax = wasm_v128_load(max_val);
       float *p = (float*)sp;
-      for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4)
+      for ( ; count >= 4; count -= 4, p += 4, dp += 4)
       {
         v128_t vf = wasm_v128_load(p);
         vf = wasm_f32x4_mul(vf, d);                   // multiply
@@ -123,18 +150,38 @@ namespace ojph {
         val = wasm_v128_or(val, sign);
         wasm_v128_store(dp, val);
       }
+      if (count)
+      {
+        v128_t vf = wasm_v128_load(p);
+        vf = wasm_f32x4_mul(vf, d);                   // multiply
+        v128_t val = wasm_i32x4_trunc_sat_f32x4(vf);  // convert to signed int
+        v128_t sign = wasm_i32x4_lt(val, zero);       // get sign
+        val = wasm_v128_xor(val, sign);               // negate 1's complement
+        v128_t ones = wasm_v128_and(sign, one);
+        val = wasm_i32x4_add(val, ones);              // 2's complement
+
+        v128_t c = wasm_i32x4_splat((si32)count);
+        v128_t idx = wasm_i32x4_make(0, 1, 2, 3);
+        v128_t mask = wasm_i32x4_gt(c, idx);
+        c = wasm_v128_and(val, mask);
+        tmax = wasm_v128_or(tmax, c);
+
+        sign = wasm_i32x4_shl(sign, 31);
+        val = wasm_v128_or(val, sign);
+        wasm_v128_store(dp, val);
+      }
       wasm_v128_store(max_val, tmax);
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, 
-                             float delta, ui32 count)
+    void wasm_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, 
+                               float delta, ui32 count)
     {
       ojph_unused(delta);
       ui32 shift = 31 - K_max;
-      v128_t m1 = wasm_i32x4_const(REPEAT(0x7FFFFFFF));
-      v128_t zero = wasm_i32x4_const(REPEAT(0));
-      v128_t one = wasm_i32x4_const(REPEAT(1));
+      v128_t m1 = wasm_i32x4_splat(INT_MAX);
+      v128_t zero = wasm_i32x4_splat(0);
+      v128_t one = wasm_i32x4_splat(1);
       si32 *p = (si32*)dp;
       for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
       {
@@ -150,11 +197,11 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, 
-                             float delta, ui32 count)
+    void wasm_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, 
+                               float delta, ui32 count)
     {
       ojph_unused(K_max);
-      v128_t m1 = wasm_i32x4_const(REPEAT(0x7FFFFFFF));
+      v128_t m1 = wasm_i32x4_splat(INT_MAX);
       v128_t d = wasm_f32x4_splat(delta);
       float *p = (float*)dp;
       for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
@@ -167,6 +214,76 @@ namespace ojph {
         valf = wasm_v128_or(valf, sign);
         wasm_v128_store(p, valf);
       }
-    }  
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, 
+                             float delta_inv, ui32 count, ui64* max_val)
+    { // two si64 samples per vector -> sign-magnitude in dp
+      ojph_unused(delta_inv);
+
+      // convert to sign and magnitude and keep max_val
+      ui32 shift = 63 - K_max;
+      v128_t m0 = wasm_i64x2_splat(LLONG_MIN);  // bit 63 only
+      v128_t zero = wasm_i64x2_splat(0);
+      v128_t one = wasm_i64x2_splat(1);
+      v128_t tmax = wasm_v128_load(max_val);    // reads 16 bytes
+      si64 *p = (si64*)sp;
+      for ( ; count >= 2; count -= 2, p += 2, dp += 2)
+      {
+        v128_t v = wasm_v128_load(p);
+        v128_t sign = wasm_i64x2_lt(v, zero);   // all ones where v < 0
+        v128_t val = wasm_v128_xor(v, sign); // negate 1's complement
+        v128_t ones = wasm_v128_and(sign, one); // 1 in negative lanes
+        val = wasm_i64x2_add(val, ones);     // 2's complement
+        sign = wasm_v128_and(sign, m0);      // keep bit 63 of the mask
+        val = wasm_i64x2_shl(val, shift);    // << (63 - K_max)
+        tmax = wasm_v128_or(tmax, val);      // OR magnitudes into the max
+        val = wasm_v128_or(val, sign);       // add the sign bit
+        wasm_v128_store(dp, val);
+      }
+      if (count)  // one sample left; a full vector is still loaded and stored
+      {
+        v128_t v = wasm_v128_load(p);
+        v128_t sign = wasm_i64x2_lt(v, zero);
+        v128_t val = wasm_v128_xor(v, sign); // negate 1's complement
+        v128_t ones = wasm_v128_and(sign, one);
+        val = wasm_i64x2_add(val, ones);     // 2's complement
+        sign = wasm_v128_and(sign, m0);
+        val = wasm_i64x2_shl(val, shift);
+
+        v128_t c = wasm_i32x4_make((si32)0xFFFFFFFF, (si32)0xFFFFFFFF, 0, 0);
+        c = wasm_v128_and(val, c);           // low 64-bit lane only
+        tmax = wasm_v128_or(tmax, c);
+
+        val = wasm_v128_or(val, sign);
+        wasm_v128_store(dp, val);
+      }
+
+      wasm_v128_store(max_val, tmax);        // writes 16 bytes back
+    }   
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, 
+                               float delta, ui32 count)
+    { // sign-magnitude ui64 samples in sp -> two's complement si64 in dp
+      ojph_unused(delta);
+      const ui32 down_shift = 63 - K_max;
+      const v128_t magnitude_mask = wasm_i64x2_splat(LLONG_MAX);
+      const v128_t all_zero = wasm_i64x2_splat(0);
+      const v128_t lane_one = wasm_i64x2_splat(1);
+      si64 *out = (si64*)dp;
+      for (ui32 done = 0; done < count; done += 2, sp += 2, out += 2)
+      {
+        const v128_t coded = wasm_v128_load((v128_t*)sp);
+        const v128_t is_neg = wasm_i64x2_lt(coded, all_zero); // -1 if negative
+        v128_t mag = wasm_v128_and(coded, magnitude_mask);
+        mag = wasm_i64x2_shr(mag, down_shift);
+        // -x == (x ^ -1) + 1, applied only in lanes where is_neg is all ones
+        mag = wasm_v128_xor(mag, is_neg);
+        mag = wasm_i64x2_add(mag, wasm_v128_and(is_neg, lane_one));
+        wasm_v128_store(out, mag);
+      }
+    }
   }
 }
\ No newline at end of file
diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp
index 6fe5e567..fc841c44 100644
--- a/src/core/codestream/ojph_params.cpp
+++ b/src/core/codestream/ojph_params.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -58,29 +58,25 @@ namespace ojph {
   ////////////////////////////////////////////////////////////////////////////
   void param_siz::set_image_extent(point dims)
   {
-    state->Xsiz = dims.x;
-    state->Ysiz = dims.y;
+    state->set_image_extent(dims);  // forwarded; fields not written here
   }
 
   ////////////////////////////////////////////////////////////////////////////
   void param_siz::set_tile_size(size s)
   {
-    state->XTsiz = s.w;
-    state->YTsiz = s.h;
+    state->set_tile_size(s);  // forwarded; fields not written here
   }
 
   ////////////////////////////////////////////////////////////////////////////
   void param_siz::set_image_offset(point offset)
-  { // WARNING need to check if these are valid
-    state->XOsiz = offset.x;
-    state->YOsiz = offset.y;
+  { // NOTE(review): presumably validity is now checked in state -- confirm
+    state->set_image_offset(offset);
   }
 
   ////////////////////////////////////////////////////////////////////////////
   void param_siz::set_tile_offset(point offset)
-  { // WARNING need to check if these are valid
-    state->XTOsiz = offset.x;
-    state->YTOsiz = offset.y;
+  { // NOTE(review): presumably validity is now checked in state -- confirm
+    state->set_tile_offset(offset);
   }
 
   ////////////////////////////////////////////////////////////////////////////
@@ -254,6 +250,15 @@ namespace ojph {
     state->set_reversible(reversible);
   }
 
+  ////////////////////////////////////////////////////////////////////////////
+  param_coc param_cod::get_coc(ui32 component_idx)
+  { // returns a param_coc for component_idx, adding a COC object if absent
+    local::param_cod *p = state->get_coc(component_idx);
+    if (p == state) // no COC segment marker for this component
+      p = state->add_coc_object(component_idx);
+    return param_coc(p);
+  }
+
   ////////////////////////////////////////////////////////////////////////////
   ui32 param_cod::get_num_decompositions() const
   {
@@ -341,9 +346,61 @@ namespace ojph {
   ////////////////////////////////////////////////////////////////////////////
   bool param_cod::get_block_vertical_causality() const
   {
-    return (state->SPcod.block_style & local::param_cod::VERT_CAUSAL_MODE)!=0;
+    return state->get_block_vertical_causality();  // forwarded to state
   }
 
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  // param_coc
+  //
+  // Each member below wraps the same state pointer in a temporary
+  // ojph::param_cod and calls the member of the same name on it.
+  ////////////////////////////////////////////////////////////////////////////
+
+  ////////////////////////////////////////////////////////////////////////////
+  void param_coc::set_num_decomposition(ui32 num_decompositions)
+  { ojph::param_cod(state).set_num_decomposition(num_decompositions); }
+
+  ////////////////////////////////////////////////////////////////////////////
+  void param_coc::set_block_dims(ui32 width, ui32 height)
+  { ojph::param_cod(state).set_block_dims(width, height); }
+
+  ////////////////////////////////////////////////////////////////////////////
+  void param_coc::set_precinct_size(int num_levels, size* precinct_size)
+  { ojph::param_cod(state).set_precinct_size(num_levels, precinct_size); }
+
+  ////////////////////////////////////////////////////////////////////////////
+  void param_coc::set_reversible(bool reversible)
+  { ojph::param_cod(state).set_reversible(reversible); }
+
+  ////////////////////////////////////////////////////////////////////////////
+  ui32 param_coc::get_num_decompositions() const
+  { return ojph::param_cod(state).get_num_decompositions(); }
+
+  ////////////////////////////////////////////////////////////////////////////
+  size param_coc::get_block_dims() const
+  { return ojph::param_cod(state).get_block_dims(); }
+
+  ////////////////////////////////////////////////////////////////////////////
+  size param_coc::get_log_block_dims() const
+  { return ojph::param_cod(state).get_log_block_dims(); }
+
+  ////////////////////////////////////////////////////////////////////////////
+  bool param_coc::is_reversible() const
+  { return ojph::param_cod(state).is_reversible(); }
+
+  ////////////////////////////////////////////////////////////////////////////
+  size param_coc::get_precinct_size(ui32 level_num) const
+  { return ojph::param_cod(state).get_precinct_size(level_num); }
+
+  ////////////////////////////////////////////////////////////////////////////
+  size param_coc::get_log_precinct_size(ui32 level_num) const
+  { return ojph::param_cod(state).get_log_precinct_size(level_num); }
+
+  ////////////////////////////////////////////////////////////////////////////
+  bool param_coc::get_block_vertical_causality() const
+  { return ojph::param_cod(state).get_block_vertical_causality(); }
+
 
   ////////////////////////////////////////////////////////////////////////////
   //
@@ -359,6 +416,34 @@ namespace ojph {
     state->set_delta(delta);
   }
 
+  //////////////////////////////////////////////////////////////////////////
+  void param_qcd::set_irrev_quant(ui32 comp_idx, float delta)
+  { // passes comp_idx and delta to the state object's set_delta
+    state->set_delta(comp_idx, delta);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  //
+  //
+  //
+  //
+  ////////////////////////////////////////////////////////////////////////////
+
+  ////////////////////////////////////////////////////////////////////////////
+  void param_nlt::set_nonlinear_transform(ui32 comp_num, ui8 nl_type)
+  { // passes both arguments unchanged to the state object
+    state->set_nonlinear_transform(comp_num, nl_type);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  bool param_nlt::get_nonlinear_transform(ui32 comp_num, ui8& bit_depth,
+                                          bool& is_signed, ui8& nl_type) const
+  { // returns the state object's result; it receives the three references
+    return state->get_nonlinear_transform(comp_num, bit_depth, is_signed,
+                                          nl_type);
+  }
+
   ////////////////////////////////////////////////////////////////////////////
   //
   //
@@ -368,26 +453,26 @@ namespace ojph {
   ////////////////////////////////////////////////////////////////////////////
 
   //////////////////////////////////////////////////////////////////////////
-  void comment_exchange::set_string(char* str)
-  { 
+  void comment_exchange::set_string(const char* str)
+  { // keeps the caller's pointer (no copy), records strlen(str), Rcom = 1
     size_t t = strlen(str);
-    if (len > 65531)
-      OJPH_ERROR(0x000500C1, 
+    if (t > 65531)  // test the new length; len still holds the old value
+      OJPH_ERROR(0x000500C1,
         "COM marker string length cannot be larger than 65531");
-    this->data = str; 
+    this->data = str;
     this->len = (ui16)t;
     this->Rcom = 1;
   }
 
   //////////////////////////////////////////////////////////////////////////
-  void comment_exchange::set_data(char* data, ui16 len)
-  { 
+  void comment_exchange::set_data(const char* data, ui16 len)
+  { // keeps the caller's pointer (no copy), records len, Rcom = 0
     if (len > 65531)
-      OJPH_ERROR(0x000500C2, 
+      OJPH_ERROR(0x000500C2,
         "COM marker string length cannot be larger than 65531");
     this->data = data;
-    this->len = len; 
-    this->Rcom = 0; 
+    this->len = len;
+    this->Rcom = 0;  // set_string stores 1 here instead
   }
 
   //////////////////////////////////////////////////////////////////////////
@@ -417,6 +502,16 @@ namespace ojph {
       return u;
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    ui64 swap_byte(ui64 t)
+    { // byte-swap each 32-bit half, then exchange the two halves
+      const ui64 swapped_lo = swap_byte((ui32)(t & 0xFFFFFFFFu));
+      const ui64 swapped_hi = swap_byte((ui32)(t >> 32));
+      const ui64 result = (swapped_lo << 32) | swapped_hi;
+      return result;
+    }
+
     //////////////////////////////////////////////////////////////////////////
     //
     //
@@ -594,8 +689,9 @@ namespace ojph {
         OJPH_ERROR(0x00050043, "error reading SIZ marker");
       Rsiz = swap_byte(Rsiz);
       if ((Rsiz & 0x4000) == 0)
-        OJPH_ERROR(0x00050044, "Rsiz bit 14 not set (this is not a JPH file)");
-      if (Rsiz & 0xBFFF)
+        OJPH_ERROR(0x00050044,
+          "Rsiz bit 14 is not set (this is not a JPH file)");
+      if ((Rsiz & 0x8000) != 0 && (Rsiz & 0xD5F) != 0)
         OJPH_WARN(0x00050001, "Rsiz in SIZ has unimplemented fields");
       if (file->read(&Xsiz, 4) != 4)
         OJPH_ERROR(0x00050045, "error reading SIZ marker");
@@ -603,36 +699,30 @@ namespace ojph {
       if (file->read(&Ysiz, 4) != 4)
         OJPH_ERROR(0x00050046, "error reading SIZ marker");
       Ysiz = swap_byte(Ysiz);
-      if (file->read(&XOsiz, 4) != 4)
+      ui32 t_XOsiz, t_YOsiz;
+      if (file->read(&t_XOsiz, 4) != 4)
         OJPH_ERROR(0x00050047, "error reading SIZ marker");
-      XOsiz = swap_byte(XOsiz);
-      if (file->read(&YOsiz, 4) != 4)
+      if (file->read(&t_YOsiz, 4) != 4)
         OJPH_ERROR(0x00050048, "error reading SIZ marker");
-      YOsiz = swap_byte(YOsiz);
-      if (file->read(&XTsiz, 4) != 4)
+      set_image_offset(point(swap_byte(t_XOsiz), swap_byte(t_YOsiz)));
+      ui32 t_XTsiz, t_YTsiz;
+      if (file->read(&t_XTsiz, 4) != 4)
         OJPH_ERROR(0x00050049, "error reading SIZ marker");
-      XTsiz = swap_byte(XTsiz);
-      if (file->read(&YTsiz, 4) != 4)
+      if (file->read(&t_YTsiz, 4) != 4)
         OJPH_ERROR(0x0005004A, "error reading SIZ marker");
-      YTsiz = swap_byte(YTsiz);
-      if (file->read(&XTOsiz, 4) != 4)
+      set_tile_size(size(swap_byte(t_XTsiz), swap_byte(t_YTsiz)));
+      ui32 t_XTOsiz, t_YTOsiz;
+      if (file->read(&t_XTOsiz, 4) != 4)
         OJPH_ERROR(0x0005004B, "error reading SIZ marker");
-      XTOsiz = swap_byte(XTOsiz);
-      if (file->read(&YTOsiz, 4) != 4)
+      if (file->read(&t_YTOsiz, 4) != 4)
         OJPH_ERROR(0x0005004C, "error reading SIZ marker");
-      YTOsiz = swap_byte(YTOsiz);
+      set_tile_offset(point(swap_byte(t_XTOsiz), swap_byte(t_YTOsiz)));
       if (file->read(&Csiz, 2) != 2)
         OJPH_ERROR(0x0005004D, "error reading SIZ marker");
       Csiz = swap_byte(Csiz);
       if (Csiz != num_comps)
         OJPH_ERROR(0x0005004E, "Csiz does not match the SIZ marker size");
-      if (Csiz > old_Csiz)
-      {
-        if (cptr != store)
-          delete[] cptr;
-        cptr = new siz_comp_info[(ui32)num_comps];
-        old_Csiz = Csiz;
-      }
+      set_num_components(Csiz);
       for (int c = 0; c < Csiz; ++c)
       {
         if (file->read(&cptr[c].SSiz, 1) != 1)
@@ -641,9 +731,49 @@ namespace ojph {
           OJPH_ERROR(0x00050052, "error reading SIZ marker");
         if (file->read(&cptr[c].YRsiz, 1) != 1)
           OJPH_ERROR(0x00050053, "error reading SIZ marker");
+        if ((cptr[c].SSiz & 0x7F) > 37)
+          OJPH_ERROR(0x00050054, "Wrong SIZ-SSiz value of %d", cptr[c].SSiz);
+        if (cptr[c].XRsiz == 0)
+          OJPH_ERROR(0x00050055, "Wrong SIZ-XRsiz value of %d", cptr[c].XRsiz);
+        if (cptr[c].YRsiz == 0)
+          OJPH_ERROR(0x00050056, "Wrong SIZ-YRsiz value of %d", cptr[c].YRsiz);
+      }
+
+      ws_kern_support_needed = (Rsiz & 0x20) != 0;
+      dfs_support_needed = (Rsiz & 0x80) != 0;
+
+      check_validity();
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    point param_siz::get_recon_downsampling(ui32 comp_num) const
+    { // x/y downsampling factor of comp_num for skipped_resolutions
+      assert(comp_num < get_num_components());
+
+      point factor(1u << skipped_resolutions, 1u << skipped_resolutions);
+      const param_cod* cdp = cod->get_coc(comp_num);
+      if (dfs && cdp && cdp->is_dfs_defined()) { // DFS replaces the default
+        const param_dfs* d = dfs->get_dfs(cdp->get_dfs_index());
+        factor = d->get_res_downsamp(skipped_resolutions);
+      }
+      factor.x *= (ui32)cptr[comp_num].XRsiz;  // scale by the SIZ XRsiz
+      factor.y *= (ui32)cptr[comp_num].YRsiz;  // scale by the SIZ YRsiz
+      return factor;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    point param_siz::get_recon_size(ui32 comp_num) const
+    { // per axis: ceil(siz / f) - ceil(offset / f), f = recon downsampling
+      assert(comp_num < get_num_components());
+
+      point factor = get_recon_downsampling(comp_num);
+      point r;
+      r.x = ojph_div_ceil(Xsiz, factor.x) - ojph_div_ceil(XOsiz, factor.x);
+      r.y = ojph_div_ceil(Ysiz, factor.y) - ojph_div_ceil(YOsiz, factor.y);
+      return r;
+    }
 
+
     //////////////////////////////////////////////////////////////////////////
     //
     //
@@ -707,9 +837,22 @@ namespace ojph {
     //
     //////////////////////////////////////////////////////////////////////////
 
+    //////////////////////////////////////////////////////////////////////////
+    bool param_cod::is_reversible() const
+    { // wavelet_trans 0 or 1: compare the kernel id; above 1: ask atk
+      if (SPcod.wavelet_trans <= 1)
+        return get_wavelet_kern() == local::param_cod::DWT_REV53;
+      else {
+        assert(atk != NULL);  // an atk object must be attached in this case
+        return atk->is_reversible();
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     bool param_cod::write(outfile_base *file)
     {
+      assert(type == COD_MAIN);
+
       //marker size excluding header
       Lcod = 12;
       Lcod = (ui16)(Lcod + (Scod & 1 ? 1 + SPcod.num_decomp : 0));
@@ -747,38 +890,238 @@ namespace ojph {
       return result;
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    bool param_cod::write_coc(outfile_base *file, ui32 num_comps)
+    { // walks the chain after this object, writing each in-range COC
+      assert(type == COD_MAIN);
+      bool all_ok = true;
+      for (param_cod *coc = this->next; coc != NULL; coc = coc->next)
+      {
+        // objects whose component index is not below num_comps are skipped
+        if (coc->comp_idx >= num_comps)
+          continue;
+        all_ok &= coc->internal_write_coc(file, num_comps);
+      }
+      return all_ok;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_cod::internal_write_coc(outfile_base *file, ui32 num_comps)
+    { // writes this object as one COC segment; false if any write fell short
+      assert(type == COC_MAIN);
+
+      //marker size excluding header
+      Lcod = num_comps < 257 ? 9 : 10;  // component index is 1 or 2 bytes
+      Lcod = (ui16)(Lcod + (Scod & 1 ? 1 + SPcod.num_decomp : 0));
+
+      ui8 buf[4];
+      bool result = true;
+
+      *(ui16*)buf = JP2K_MARKER::COC;
+      *(ui16*)buf = swap_byte(*(ui16*)buf);
+      result &= file->write(&buf, 2) == 2;
+      *(ui16*)buf = swap_byte(Lcod);
+      result &= file->write(&buf, 2) == 2;
+      if (num_comps < 257)
+      {
+        *(ui8*)buf = (ui8)comp_idx;
+        result &= file->write(&buf, 1) == 1;
+      }
+      else
+      {
+        *(ui16*)buf = swap_byte(comp_idx);
+        result &= file->write(&buf, 2) == 2;
+      }
+      *(ui8*)buf = Scod;
+      result &= file->write(&buf, 1) == 1;
+      buf[0] = SPcod.num_decomp;
+      buf[1] = SPcod.block_width;
+      buf[2] = SPcod.block_height;
+      buf[3] = SPcod.block_style;
+      result &= file->write(&buf, 4) == 4;
+      *(ui8*)buf = SPcod.wavelet_trans;
+      result &= file->write(&buf, 1) == 1;
+      if (Scod & 1)  // bit 0 set: num_decomp + 1 precinct-size bytes follow
+        for (int i = 0; i <= SPcod.num_decomp; ++i)
+        {
+          *(ui8*)buf = SPcod.precinct_size[i];
+          result &= file->write(&buf, 1) == 1;
+        }
+
+      return result;
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void param_cod::read(infile_base *file)
     {
+      assert(type == COD_MAIN);  // COC objects are parsed by read_coc
+
       if (file->read(&Lcod, 2) != 2)
-        OJPH_ERROR(0x00050071, "error reading COD marker");
+        OJPH_ERROR(0x00050071, "error reading COD segment");
       Lcod = swap_byte(Lcod);
       if (file->read(&Scod, 1) != 1)
-        OJPH_ERROR(0x00050072, "error reading COD marker");
+        OJPH_ERROR(0x00050072, "error reading COD segment");
       if (file->read(&SGCod.prog_order, 1) != 1)
-        OJPH_ERROR(0x00050073, "error reading COD marker");
+        OJPH_ERROR(0x00050073, "error reading COD segment");
       if (file->read(&SGCod.num_layers, 2) != 2)
-      { OJPH_ERROR(0x00050074, "error reading COD marker"); }
+      { OJPH_ERROR(0x00050074, "error reading COD segment"); }
       else
         SGCod.num_layers = swap_byte(SGCod.num_layers);
       if (file->read(&SGCod.mc_trans, 1) != 1)
-        OJPH_ERROR(0x00050075, "error reading COD marker");
+        OJPH_ERROR(0x00050075, "error reading COD segment");
       if (file->read(&SPcod.num_decomp, 1) != 1)
-        OJPH_ERROR(0x00050076, "error reading COD marker");
+        OJPH_ERROR(0x00050076, "error reading COD segment");
       if (file->read(&SPcod.block_width, 1) != 1)
-        OJPH_ERROR(0x00050077, "error reading COD marker");
+        OJPH_ERROR(0x00050077, "error reading COD segment");
       if (file->read(&SPcod.block_height, 1) != 1)
-        OJPH_ERROR(0x00050078, "error reading COD marker");
+        OJPH_ERROR(0x00050078, "error reading COD segment");
       if (file->read(&SPcod.block_style, 1) != 1)
-        OJPH_ERROR(0x00050079, "error reading COD marker");
+        OJPH_ERROR(0x00050079, "error reading COD segment");
       if (file->read(&SPcod.wavelet_trans, 1) != 1)
-        OJPH_ERROR(0x0005007A, "error reading COD marker");
+        OJPH_ERROR(0x0005007A, "error reading COD segment");
+
+      // Range limits only; the block-style bits are tested separately below
+      // so that an unsupported style is reported as 0x0005007E.
+      if (get_num_decompositions() > 32
+        || SPcod.block_width > 8
+        || SPcod.block_height > 8
+        || SPcod.block_width + SPcod.block_height > 8)
+        OJPH_ERROR(0x0005007D, "wrong settings in a COD-SPcod parameter");
+      if ((SPcod.block_style & 0x40) != 0x40
+        || (SPcod.block_style & 0xB7) != 0x00)
+        OJPH_ERROR(0x0005007E, "unsupported settings in a COD-SPcod parameter");
+
+      ui8 num_decompositions =  get_num_decompositions();
       if (Scod & 1)
-        for (int i = 0; i <= SPcod.num_decomp; ++i)
+        for (int i = 0; i <= num_decompositions; ++i)
           if (file->read(&SPcod.precinct_size[i], 1) != 1)
-            OJPH_ERROR(0x0005007B, "error reading COD marker");
+            OJPH_ERROR(0x0005007B, "error reading COD segment");
       if (Lcod != 12 + ((Scod & 1) ? 1 + SPcod.num_decomp : 0))
-        OJPH_ERROR(0x0005007C, "error in COD marker length");
+        OJPH_ERROR(0x0005007C, "error in COD segment length");
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_cod::read_coc(infile_base* file, ui32 num_comps,
+                             param_cod *top_cod)
+    {
+      assert(type == COC_MAIN);
+      assert(top_cod != NULL);
+
+      this->SGCod = top_cod->SGCod; // SGCod is copied from the given top COD
+      this->top_cod = top_cod;
+      if (file->read(&Lcod, 2) != 2)
+        OJPH_ERROR(0x00050121, "error reading COC segment");
+      Lcod = swap_byte(Lcod);
+      if (num_comps < 257) { // component index is stored in one byte
+        ui8 t;
+        if (file->read(&t, 1) != 1)
+          OJPH_ERROR(0x00050122, "error reading COC segment");
+        comp_idx = t;
+      }
+      else { // component index is stored in two bytes
+        if (file->read(&comp_idx, 2) != 2)
+          OJPH_ERROR(0x00050123, "error reading COC segment");
+        comp_idx = swap_byte(comp_idx);
+      }
+      if (file->read(&Scod, 1) != 1)
+        OJPH_ERROR(0x00050124, "error reading COC segment");
+      if (Scod & 0xF8)
+        OJPH_WARN(0x00050011,
+          "Unsupported options in Scoc field of the COC segment");
+      if (file->read(&SPcod.num_decomp, 1) != 1)
+        OJPH_ERROR(0x00050125, "error reading COC segment");
+      if (file->read(&SPcod.block_width, 1) != 1)
+        OJPH_ERROR(0x00050126, "error reading COC segment");
+      if (file->read(&SPcod.block_height, 1) != 1)
+        OJPH_ERROR(0x00050127, "error reading COC segment");
+      if (file->read(&SPcod.block_style, 1) != 1)
+        OJPH_ERROR(0x00050128, "error reading COC segment");
+      if (file->read(&SPcod.wavelet_trans, 1) != 1)
+        OJPH_ERROR(0x00050129, "error reading COC segment");
+
+      // reject out-of-range decomposition counts and code block dimensions
+      if (get_num_decompositions() > 32
+        || SPcod.block_width > 8
+        || SPcod.block_height > 8
+        || SPcod.block_width + SPcod.block_height > 8)
+        OJPH_ERROR(0x0005012C, "wrong settings in a COC-SPcoc parameter");
+      // block style must have bit 0x40 set and none of the bits in 0xB7
+      if ((SPcod.block_style & 0x40) != 0x40
+        || (SPcod.block_style & 0xB7) != 0x00)
+        OJPH_ERROR(0x0005012D, "unsupported settings in a COC-SPcoc parameter");
+
+      ui8 num_decompositions =  get_num_decompositions();
+      if (Scod & 1) // one precinct size byte follows per resolution
+        for (int i = 0; i <= num_decompositions; ++i)
+          if (file->read(&SPcod.precinct_size[i], 1) != 1)
+            OJPH_ERROR(0x0005012A, "error reading COC segment");
+      ui32 t = 9; // expected Lcoc when the component index takes one byte
+      t += num_comps < 257 ? 0 : 1;
+      t += (Scod & 1) ? 1 + num_decompositions : 0;
+      if (Lcod != t)
+        OJPH_ERROR(0x0005012B, "error in COC segment length");
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_cod::update_atk(param_atk* atk)
+    {
+      assert(type == COD_MAIN);
+      this->atk = atk->get_atk(SPcod.wavelet_trans); // kernel of the COD
+      if (this->atk == NULL)
+        OJPH_ERROR(0x00050131, "A COD segment employs the DWT kernel "
+          "atk = %d, but a corresponding ATK segment cannot be found.",
+          SPcod.wavelet_trans);
+      param_cod *p = next; // then the kernel of every object chained after it
+      while (p)
+      {
+        p->atk = atk->get_atk(p->SPcod.wavelet_trans);
+        if (p->atk == NULL)
+          OJPH_ERROR(0x00050132, "A COC segment employs the DWT kernel "
+            "atk = %d, but a corresponding ATK segment cannot be found",
+            p->SPcod.wavelet_trans); // the COC's own index, not the COD's
+        p = p->next;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    const param_cod* param_cod::get_coc(ui32 comp_idx) const
+    {
+      // the search always starts from the main COD, whichever object is asked
+      assert(this->type == COD_MAIN || this->top_cod->type == COD_MAIN);
+      const param_cod *head = this->type == COD_MAIN ? this : this->top_cod;
+      // walk the chain for an object whose component index matches
+      for (const param_cod *node = head; node; node = node->next)
+        if (node->comp_idx == comp_idx)
+          return node;
+      // nothing matched; the main COD is returned instead
+      return head;
+    }
+
+    ////////////////////////////////////////
+    param_cod* param_cod::get_coc(ui32 comp_idx)
+    {
+      // look up through the const overload, then drop constness of the result
+      const param_cod& self = *this;
+      const param_cod* found = self.get_coc(comp_idx);
+      return const_cast<param_cod*>(found);
+    }
+
+    ////////////////////////////////////////
+    param_cod* param_cod::add_coc_object(ui32 comp_idx)
+    {
+      assert(type == COD_MAIN);
+      param_cod *tail = this; // find the last object in the chain
+      for (; tail->next != NULL; tail = tail->next) {}
+      if (avail == NULL) // nothing to reuse; allocate a new object
+        tail->next = new param_cod(this, (ui16)comp_idx);
+      else
+      {
+        param_cod *reused = avail; // take the first object off avail
+        tail->next = reused;
+        avail = reused->next;
+        reused->init(this, (ui16)comp_idx);
+      }
+      return tail->next;
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -790,32 +1133,195 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    void param_qcd::set_rev_quant(ui32 bit_depth,
+    void param_qcd::check_validity(const param_siz& siz, const param_cod& cod)
+    {
+      ui32 num_comps = siz.get_num_components();
+      trim_non_existing_components(num_comps);
+
+      // first check that all the components captured by QCD have the same
+      // decomposition count, bit depth, signedness, and wavelet kernel
+      bool all_same = true;
+      bool other_comps_exist = false;
+      ui32 first_comp = 0xFFFF; // an impossible component
+      {
+        ui32 num_decompositions = 0;
+        ui32 bit_depth = 0;
+        bool is_signed = false;
+        ui32 wavelet_kern = param_cod::DWT_IRV97;
+
+        for (ui32 c = 0; c < num_comps; ++c)
+        {
+          if (get_qcc(c) == this) // no qcc defined for component c
+          {
+            const param_cod *p = cod.get_coc(c);
+            if (bit_depth == 0) // first component captured by QCD
+            {
+              num_decompositions = p->get_num_decompositions();
+              bit_depth = siz.get_bit_depth(c);
+              is_signed = siz.is_signed(c);
+              wavelet_kern = p->get_wavelet_kern();
+              first_comp = c;
+            }
+            else
+            {
+              all_same = all_same
+                && (num_decompositions == p->get_num_decompositions())
+                && (bit_depth == siz.get_bit_depth(c))
+                && (is_signed == siz.is_signed(c))
+                && (wavelet_kern == p->get_wavelet_kern());
+            }
+          }
+          else
+            other_comps_exist = true;
+        }
+      }
+
+      // configure QCD according to COD
+      ui32 qcd_num_decompositions;
+      ui32 qcd_bit_depth;
+      bool qcd_is_signed;
+      ui32 qcd_wavelet_kern;
+      {
+        ui32 qcd_component = first_comp != 0xFFFF ? first_comp : 0;
+        bool employing_color_transform = cod.is_employing_color_transform();
+        qcd_num_decompositions = cod.get_num_decompositions();
+        qcd_bit_depth = siz.get_bit_depth(qcd_component);
+        qcd_is_signed = siz.is_signed(qcd_component);
+        qcd_wavelet_kern = cod.get_wavelet_kern();
+        this->num_subbands = 1 + 3 * qcd_num_decompositions;
+        if (qcd_wavelet_kern == param_cod::DWT_REV53)
+          set_rev_quant(qcd_num_decompositions, qcd_bit_depth,
+            qcd_component < 3 ? employing_color_transform : false);
+        else if (qcd_wavelet_kern == param_cod::DWT_IRV97)
+        { // 64-bit shift: 1 << n overflows int once n reaches 31
+          if (this->base_delta == -1.0f)
+            this->base_delta = 1.0f / (float)(1ull << qcd_bit_depth);
+          set_irrev_quant(qcd_num_decompositions);
+        }
+        else
+          assert(0);
+      }
+
+      // if not all the same and captured by QCD, then create QCC for them
+      if (!all_same)
+      {
+        bool employing_color_transform = cod.is_employing_color_transform();
+        for (ui32 c = 0; c < num_comps; ++c)
+        {
+          const param_cod *cp = cod.get_coc(c);
+          if (qcd_num_decompositions == cp->get_num_decompositions()
+              && qcd_bit_depth == siz.get_bit_depth(c)
+              && qcd_is_signed == siz.is_signed(c)
+              && qcd_wavelet_kern == cp->get_wavelet_kern())
+            continue; // captured by QCD
+
+          // Does not match QCD, must have QCC
+          param_qcd *qp = get_qcc(c);
+          if (qp == this) // no QCC was defined, create QCC
+            qp = this->add_qcc_object(c);
+
+          ui32 num_decompositions = cp->get_num_decompositions();
+          qp->num_subbands = 1 + 3 * num_decompositions;
+          ui32 bit_depth = siz.get_bit_depth(c);
+          if (cp->get_wavelet_kern() == param_cod::DWT_REV53)
+            qp->set_rev_quant(num_decompositions, bit_depth,
+              c < 3 ? employing_color_transform : false);
+          else if (cp->get_wavelet_kern() == param_cod::DWT_IRV97)
+          { // 64-bit shift: 1 << n overflows int once n reaches 31
+            if (qp->base_delta == -1.0f)
+              qp->base_delta = 1.0f / (float)(1ull << bit_depth);
+            qp->set_irrev_quant(num_decompositions);
+          }
+          else
+            assert(0);
+        }
+      }
+      else if (other_comps_exist) // Some are captured by QCD
+      {
+        bool employing_color_transform = cod.is_employing_color_transform();
+        for (ui32 c = 0; c < num_comps; ++c)
+        {
+          param_qcd *qp = get_qcc(c);
+          if (qp == this) // if captured by QCD continue
+            continue;
+          const param_cod *cp = cod.get_coc(c);
+          ui32 num_decompositions = cp->get_num_decompositions();
+          qp->num_subbands = 1 + 3 * num_decompositions;
+          ui32 bit_depth = siz.get_bit_depth(c);
+          if (cp->get_wavelet_kern() == param_cod::DWT_REV53)
+            qp->set_rev_quant(num_decompositions, bit_depth,
+              c < 3 ? employing_color_transform : false);
+          else if (cp->get_wavelet_kern() == param_cod::DWT_IRV97)
+          { // 64-bit shift: 1 << n overflows int once n reaches 31
+            if (qp->base_delta == -1.0f)
+              qp->base_delta = 1.0f / (float)(1ull << bit_depth);
+            qp->set_irrev_quant(num_decompositions);
+          }
+          else
+            assert(0);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_qcd::set_delta(ui32 comp_idx, float delta)
+    {
+      assert(type == QCD_MAIN);
+      param_qcd *p = get_qcc(comp_idx); // yields this QCD when no QCC exists
+      if (p == NULL || p->comp_idx != comp_idx) // no QCC for comp_idx yet
+        p = add_qcc_object(comp_idx);
+      p->set_delta(delta);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_qcd::set_rev_quant(ui32 num_decomps, ui32 bit_depth,
                                   bool is_employing_color_transform)
     {
-      int guard_bits = 1;
-      Sqcd = (ui8)(guard_bits << 5); //one guard bit, and no quantization
       ui32 B = bit_depth;
       B += is_employing_color_transform ? 1 : 0; //1 bit for RCT
       int s = 0;
-      float bibo_l = bibo_gains::get_bibo_gain_l(num_decomps, true);
-      //we leave some leeway for numerical error by multiplying by 1.1f
-      ui32 X = (ui32) ceil(log(bibo_l * bibo_l * 1.1f) / M_LN2);
-      u8_SPqcd[s++] = (ui8)((B + X) << 3);
+      double bibo_l = bibo_gains::get_bibo_gain_l(num_decomps, true);
+      ui32 X = (ui32) ceil(log(bibo_l * bibo_l) / M_LN2);
+      SPqcd.u8[s++] = (ui8)(B + X);
+      ui32 max_B_plus_X = (ui32)(B + X);
+      for (ui32 d = num_decomps; d > 0; --d)
+      {
+        double bibo_l = bibo_gains::get_bibo_gain_l(d, true);
+        double bibo_h = bibo_gains::get_bibo_gain_h(d - 1, true);
+        X = (ui32) ceil(log(bibo_h * bibo_l) / M_LN2);
+        SPqcd.u8[s++] = (ui8)(B + X);
+        max_B_plus_X = ojph_max(max_B_plus_X, B + X);
+        SPqcd.u8[s++] = (ui8)(B + X);
+        max_B_plus_X = ojph_max(max_B_plus_X, B + X);
+        X = (ui32) ceil(log(bibo_h * bibo_h) / M_LN2);
+        SPqcd.u8[s++] = (ui8)(B + X);
+        max_B_plus_X = ojph_max(max_B_plus_X, B + X);
+      }
+
+      if (max_B_plus_X > 38)
+        OJPH_ERROR(0x00050151, "The specified combination of bit_depth, "
+         "colour transform, and type of wavelet transform requires more than "
+         "38 bits; it requires %d bits. This is beyond what is allowed in "
+         "the JPEG2000 image coding format.", max_B_plus_X);
+
+      int guard_bits = ojph_max(1, (si32)max_B_plus_X - 31);
+      Sqcd = (ui8)(guard_bits << 5);
+      s = 0;
+      SPqcd.u8[s] = encode_SPqcd((ui8)(SPqcd.u8[s] - guard_bits));
+      s++;
       for (ui32 d = num_decomps; d > 0; --d)
       {
-        float bibo_l = bibo_gains::get_bibo_gain_l(d, true);
-        float bibo_h = bibo_gains::get_bibo_gain_h(d - 1, true);
-        X = (ui32) ceil(log(bibo_h * bibo_l * 1.1f) / M_LN2);
-        u8_SPqcd[s++] = (ui8)((B + X) << 3);
-        u8_SPqcd[s++] = (ui8)((B + X) << 3);
-        X = (ui32) ceil(log(bibo_h * bibo_h * 1.1f) / M_LN2);
-        u8_SPqcd[s++] = (ui8)((B + X) << 3);
+        SPqcd.u8[s] = encode_SPqcd((ui8)(SPqcd.u8[s] - guard_bits));
+        s++;
+        SPqcd.u8[s] = encode_SPqcd((ui8)(SPqcd.u8[s] - guard_bits));
+        s++;
+        SPqcd.u8[s] = encode_SPqcd((ui8)(SPqcd.u8[s] - guard_bits));
+        s++;
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void param_qcd::set_irrev_quant()
+    void param_qcd::set_irrev_quant(ui32 num_decomps)
     {
       int guard_bits = 1;
       Sqcd = (ui8)((guard_bits<<5)|0x2);//one guard bit, scalar quantization
@@ -829,7 +1335,7 @@ namespace ojph {
       // but that should not happen in reality
       mantissa = (int)round(delta_b * (float)(1<<11)) - (1<<11);
       mantissa = mantissa < (1<<11) ? mantissa : 0x7FF;
-      u16_SPqcd[s++] = (ui16)((exp << 11) | mantissa);
+      SPqcd.u16[s++] = (ui16)((exp << 11) | mantissa);
       for (ui32 d = num_decomps; d > 0; --d)
       {
         float gain_l = sqrt_energy_gains::get_gain_l(d, false);
@@ -842,8 +1348,8 @@ namespace ojph {
         { exp++; delta_b *= 2.0f; }
         mantissa = (int)round(delta_b * (float)(1<<11)) - (1<<11);
         mantissa = mantissa < (1<<11) ? mantissa : 0x7FF;
-        u16_SPqcd[s++] = (ui16)((exp << 11) | mantissa);
-        u16_SPqcd[s++] = (ui16)((exp << 11) | mantissa);
+        SPqcd.u16[s++] = (ui16)((exp << 11) | mantissa);
+        SPqcd.u16[s++] = (ui16)((exp << 11) | mantissa);
 
         delta_b = base_delta / (gain_h * gain_h);
 
@@ -852,47 +1358,100 @@ namespace ojph {
         { exp++; delta_b *= 2.0f; }
         mantissa = (int)round(delta_b * (float)(1<<11)) - (1<<11);
         mantissa = mantissa < (1<<11) ? mantissa : 0x7FF;
-        u16_SPqcd[s++] = (ui16)((exp << 11) | mantissa);
+        SPqcd.u16[s++] = (ui16)((exp << 11) | mantissa);
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    ui32 param_qcd::get_MAGBp() const
-    { //this can be written better, but it is only executed once
+    ui32 param_qcd::get_MAGB() const
+    {
       ui32 B = 0;
-      int irrev = Sqcd & 0x1F;
-      if (irrev == 0) //reversible
-        for (ui32 i = 0; i < 3 * num_decomps + 1; ++i)
-          B = ojph_max(B, (u8_SPqcd[i] >> 3) + get_num_guard_bits() - 1u);
-      else if (irrev == 2) //scalar expounded
-        for (ui32 i = 0; i < 3 * num_decomps + 1; ++i)
-        {
-          ui32 nb = num_decomps - (i ? (i - 1) / 3 : 0); //decompsition level
-          B = ojph_max(B, (u16_SPqcd[i] >> 11) + get_num_guard_bits() - nb);
-        }
-      else
-        assert(0);
+
+      const param_qcd *p = this;
+      while (p)
+      {
+        //this can be written better, but it is only executed once
+        // this assumes a bi-directional wavelet (conventional DWT)
+        ui32 num_decomps = (p->num_subbands - 1) / 3;
+
+        int irrev = p->Sqcd & 0x1F;
+        if (irrev == 0) //reversible
+          for (ui32 i = 0; i < p->num_subbands; ++i) {
+            ui32 t = p->decode_SPqcd(p->SPqcd.u8[i]);
+            t += p->get_num_guard_bits() - 1u;
+            B = ojph_max(B, t);
+          }
+        else if (irrev == 2) //scalar expounded
+          for (ui32 i = 0; i < p->num_subbands; ++i)
+          {
+            ui32 nb = num_decomps - (i ? (i - 1) / 3 : 0); //decompsition level
+            ui32 t = (p->SPqcd.u16[i] >> 11) + p->get_num_guard_bits() - nb;
+            B = ojph_max(B, t);
+          }
+        else
+          assert(0);
+
+        p = p->next;
+      }
 
       return B;
     }
 
     //////////////////////////////////////////////////////////////////////////
-    float param_qcd::irrev_get_delta(ui32 resolution, ui32 subband) const
+    float param_qcd::get_irrev_delta(const param_dfs* dfs,
+                                     ui32 num_decompositions,
+                                     ui32 resolution, ui32 subband) const
     {
-      assert((resolution == 0 && subband == 0) ||
-             (resolution <= num_decomps && subband > 0 && subband<4));
-      assert((Sqcd & 0x1F) == 2);
       float arr[] = { 1.0f, 2.0f, 2.0f, 4.0f };
+      assert((Sqcd & 0x1F) == 2);
 
-      ui32 idx = resolution == 0 ? 0 : (resolution - 1) * 3 + subband;
-      int eps = u16_SPqcd[idx] >> 11;
+      ui32 idx;
+      if (dfs != NULL && dfs->exists())
+        idx = dfs->get_subband_idx(num_decompositions, resolution, subband);
+      else
+        idx = resolution ? (resolution - 1) * 3 + subband : 0;
+      if (idx >= num_subbands) {
+        OJPH_INFO(0x00050101, "Trying to access quantization step size for "
+          "subband %d when the QCD/QCC marker segment specifies "
+          "quantization step sizes for %d subbands only.  To continue "
+          "decoding, we are using the step size for subband %d, which can "
+          "produce incorrect results",
+          idx + 1, num_subbands, num_subbands - 1);
+        idx = num_subbands - 1;
+      }
+      int eps = SPqcd.u16[idx] >> 11;
       float mantissa;
-      mantissa = (float)((u16_SPqcd[idx] & 0x7FF) | 0x800) * arr[subband];
+      mantissa = (float)((SPqcd.u16[idx] & 0x7FF) | 0x800) * arr[subband];
       mantissa /= (float)(1 << 11);
       mantissa /= (float)(1u << eps);
       return mantissa;
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    ui32 param_qcd::propose_precision(const param_cod* cod) const
+    {
+      // the colour transform is queried on the default (main) COD object
+      const ui32 c = cod->get_comp_idx();
+      const param_cod *top = cod->get_coc(param_cod::OJPH_COD_DEFAULT);
+      const bool uses_ct = top->is_employing_color_transform() && c < 3;
+
+      // largest K_max (sign bit excluded); with a colour transform, it is
+      // taken over the quantization objects of the first three components
+      ui32 k_max = 0;
+      if (!uses_ct)
+        k_max = get_largest_Kmax();
+      else
+        for (ui32 i = 0; i < 3; ++i) {
+          ui32 t = this->get_qcc(i)->get_largest_Kmax();
+          k_max = ojph_max(k_max, t);
+        }
+
+      // + 1 for the sign bit
+      // + 1 because the block decoder/encoder supports up to 30 bits
+      //     (not 31), so we bump it by one more bit.
+      return k_max + 1 + 1;
+    }
+
     //////////////////////////////////////////////////////////////////////////
     ui32 param_qcd::get_num_guard_bits() const
     {
@@ -900,33 +1459,72 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    ui32 param_qcd::get_Kmax(ui32 resolution, ui32 subband) const
+    ui32 param_qcd::get_Kmax(const param_dfs* dfs, ui32 num_decompositions,
+                             ui32 resolution, ui32 subband) const
     {
-      assert((resolution == 0 && subband == 0) ||
-             (resolution <= num_decomps && subband > 0 && subband<4));
-      ui32 num_bits = get_num_guard_bits();
-      ui32 idx = resolution == 0 ? 0 : (resolution - 1) * 3 + subband;
+      ui32 idx;
+      if (dfs != NULL && dfs->exists())
+        idx = dfs->get_subband_idx(num_decompositions, resolution, subband);
+      else
+        idx = resolution ? (resolution - 1) * 3 + subband : 0;
+      if (idx >= num_subbands) {
+        OJPH_INFO(0x00050111, "Trying to access quantization step size for "
+          "subband %d when the QCD/QCC marker segment specifies "
+          "quantization step sizes for %d subbands only.  To continue "
+          "decoding, we are using the step size for subband %d, which can "
+          "produce incorrect results",
+          idx + 1, num_subbands, num_subbands - 1);
+        idx = num_subbands - 1;
+      }
+
       int irrev = Sqcd & 0x1F;
-      if (irrev == 0) //reversible; this is (10.22) from the J2K book
+      ui32 num_bits = 0;
+      if (irrev == 0) // reversible; this is (10.22) from the J2K book
       {
-        num_bits += u8_SPqcd[idx] >> 3;
+        num_bits = decode_SPqcd(SPqcd.u8[idx]);
         num_bits = num_bits == 0 ? 0 : num_bits - 1;
       }
       else if (irrev == 1)
         assert(0);
       else if (irrev == 2) //scalar expounded
-        num_bits += (u16_SPqcd[idx] >> 11) - 1;
+        num_bits = (SPqcd.u16[idx] >> 11) - 1;
+      else
+        assert(0);
+
+      return num_bits + get_num_guard_bits();
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    ui32 param_qcd::get_largest_Kmax() const
+    {
+      int irrev = Sqcd & 0x1F;
+      ui32 num_bits = 0;
+      if (irrev == 0) // reversible; this is (10.22) from the J2K book
+      {
+        for (ui32 i = 0; i < num_subbands; ++i) {
+          ui32 t = decode_SPqcd(SPqcd.u8[i]);
+          num_bits = ojph_max(num_bits, t == 0 ? 0 : t - 1);
+        }
+      }
+      else if (irrev == 1)
+        assert(0);
+      else if (irrev == 2) //scalar expounded
+      {
+        for (ui32 i = 0; i < num_subbands; ++i) {
+          ui32 t = (SPqcd.u16[i] >> 11) - 1;
+          num_bits = ojph_max(num_bits, t);
+        }
+      }
       else
         assert(0);
 
-      return num_bits;
+      return num_bits + get_num_guard_bits();
     }
 
     //////////////////////////////////////////////////////////////////////////
     bool param_qcd::write(outfile_base *file)
     {
       int irrev = Sqcd & 0x1F;
-      ui32 num_subbands = 1 + 3 * num_decomps;
 
       //marker size excluding header
       Lqcd = 3;
@@ -951,21 +1549,101 @@ namespace ojph {
       if (irrev == 0)
         for (ui32 i = 0; i < num_subbands; ++i)
         {
-          *(ui8*)buf = u8_SPqcd[i];
+          *(ui8*)buf = SPqcd.u8[i];
           result &= file->write(&buf, 1) == 1;
         }
       else if (irrev == 2)
         for (ui32 i = 0; i < num_subbands; ++i)
         {
-          *(ui16*)buf = swap_byte(u16_SPqcd[i]);
+          *(ui16*)buf = swap_byte(SPqcd.u16[i]);
           result &= file->write(&buf, 2) == 2;
         }
       else
         assert(0);
 
-
       return result;
     }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_qcd::write_qcc(outfile_base *file, ui32 num_comps)
+    {
+      // write every enabled QCC chained after this QCD; keep going on failure
+      assert(type == QCD_MAIN);
+      bool all_ok = true;
+      for (param_qcd *qcc = this->next; qcc != NULL; qcc = qcc->next)
+      {
+        if (!qcc->enabled)
+          continue;
+        all_ok &= qcc->internal_write_qcc(file, num_comps);
+      }
+      return all_ok;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_qcd::internal_write_qcc(outfile_base *file, ui32 num_comps)
+    {
+      int irrev = Sqcd & 0x1F; // quantization style; only 0 and 2 are written
+
+      // Lqcd counts itself, the component index, Sqcd, and the SPqcd entries
+      Lqcd = (ui16)(4 + (num_comps < 257 ? 0 : 1));
+      if (irrev == 0) // one byte per subband
+        Lqcd = (ui16)(Lqcd + num_subbands);
+      else if (irrev == 2) // two bytes per subband
+        Lqcd = (ui16)(Lqcd + 2 * num_subbands);
+      else
+        assert(0);
+
+      char buf[4]; // staging buffer; each field is byte-swapped into it
+      bool result = true; // cleared by any short write below
+
+      *(ui16*)buf = JP2K_MARKER::QCC;
+      *(ui16*)buf = swap_byte(*(ui16*)buf);
+      result &= file->write(&buf, 2) == 2;
+      *(ui16*)buf = swap_byte(Lqcd);
+      result &= file->write(&buf, 2) == 2;
+      if (num_comps < 257) // component index written as one byte
+      {
+        *(ui8*)buf = (ui8)comp_idx;
+        result &= file->write(&buf, 1) == 1;
+      }
+      else // component index written as two bytes
+      {
+        *(ui16*)buf = swap_byte(comp_idx);
+        result &= file->write(&buf, 2) == 2;
+      }
+      *(ui8*)buf = Sqcd;
+      result &= file->write(&buf, 1) == 1;
+      if (irrev == 0) // SPqcd entries are single bytes
+        for (ui32 i = 0; i < num_subbands; ++i)
+        {
+          *(ui8*)buf = SPqcd.u8[i];
+          result &= file->write(&buf, 1) == 1;
+        }
+      else if (irrev == 2) // SPqcd entries are byte-swapped 16-bit values
+        for (ui32 i = 0; i < num_subbands; ++i)
+        {
+          *(ui16*)buf = swap_byte(SPqcd.u16[i]);
+          result &= file->write(&buf, 2) == 2;
+        }
+      else
+        assert(0);
+
+      return result; // true only if every write produced the full byte count
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_qcd::trim_non_existing_components(ui32 num_comps)
+    {
+      // a QCC stays enabled only if its component index is below num_comps
+      assert(type == QCD_MAIN && comp_idx == OJPH_QCD_DEFAULT);
+      for (param_qcd *qcc = this->next; qcc != NULL; qcc = qcc->next)
+      {
+        assert(qcc->type == QCC_MAIN);
+        const bool exists = qcc->comp_idx < num_comps;
+        qcc->enabled = exists;
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void param_qcd::read(infile_base *file)
     {
@@ -976,31 +1654,37 @@ namespace ojph {
         OJPH_ERROR(0x00050082, "error reading QCD marker");
       if ((Sqcd & 0x1F) == 0)
       {
-        num_decomps = (Lqcd - 4) / 3;
-        if (Lqcd != 4 + 3 * num_decomps)
-          OJPH_ERROR(0x00050083, "wrong Lqcd value in QCD marker");
-        for (ui32 i = 0; i < 1 + 3 * num_decomps; ++i)
-          if (file->read(&u8_SPqcd[i], 1) != 1)
+        num_subbands = (Lqcd - 3);
+        if (num_subbands == 0)
+          OJPH_ERROR(0x0005008A, "QCD marker segment that specifies no "
+            "quantization informtion");
+        if (num_subbands > 97 || Lqcd != 3 + num_subbands)
+          OJPH_ERROR(0x00050083, "wrong Lqcd value of %d in QCD marker", Lqcd);
+        for (ui32 i = 0; i < num_subbands; ++i)
+          if (file->read(&SPqcd.u8[i], 1) != 1)
             OJPH_ERROR(0x00050084, "error reading QCD marker");
       }
       else if ((Sqcd & 0x1F) == 1)
       {
-        num_decomps = 0;
-        OJPH_ERROR(0x00050089, 
+        num_subbands = 0;
+        OJPH_ERROR(0x00050089,
           "Scalar derived quantization is not supported yet in QCD marker");
         if (Lqcd != 5)
           OJPH_ERROR(0x00050085, "wrong Lqcd value in QCD marker");
       }
       else if ((Sqcd & 0x1F) == 2)
       {
-        num_decomps = (Lqcd - 5) / 6;
-        if (Lqcd != 5 + 6 * num_decomps)
-          OJPH_ERROR(0x00050086, "wrong Lqcd value in QCD marker");
-        for (ui32 i = 0; i < 1 + 3 * num_decomps; ++i)
+        num_subbands = (Lqcd - 3) / 2;
+        if (num_subbands == 0)
+          OJPH_ERROR(0x0005008B, "QCD marker segment that specifies no "
+            "quantization informtion");
+        if (num_subbands > 97 || Lqcd != 3 + 2 * num_subbands)
+          OJPH_ERROR(0x00050086, "wrong Lqcd value of %d in QCD marker", Lqcd);
+        for (ui32 i = 0; i < num_subbands; ++i)
         {
-          if (file->read(&u16_SPqcd[i], 2) != 2)
+          if (file->read(&SPqcd.u16[i], 2) != 2)
             OJPH_ERROR(0x00050087, "error reading QCD marker");
-          u16_SPqcd[i] = swap_byte(u16_SPqcd[i]);
+          SPqcd.u16[i] = swap_byte(SPqcd.u16[i]);
         }
       }
       else
@@ -1008,15 +1692,7 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    //
-    //
-    //
-    //
-    //
-    //////////////////////////////////////////////////////////////////////////
-
-    //////////////////////////////////////////////////////////////////////////
-    void param_qcc::read(infile_base *file, ui32 num_comps)
+    void param_qcd::read_qcc(infile_base *file, ui32 num_comps)
     {
       if (file->read(&Lqcd, 2) != 2)
         OJPH_ERROR(0x000500A1, "error reading QCC marker");
@@ -1036,42 +1712,334 @@ namespace ojph {
       }
       if (file->read(&Sqcd, 1) != 1)
         OJPH_ERROR(0x000500A4, "error reading QCC marker");
+      ui32 offset = num_comps < 257 ? 4 : 5;
       if ((Sqcd & 0x1F) == 0)
       {
-        ui32 offset = num_comps < 257 ? 5 : 6;
-        num_decomps = (Lqcd - offset) / 3;
-        if (Lqcd != offset + 3 * num_decomps)
-          OJPH_ERROR(0x000500A5, "wrong Lqcd value in QCC marker");
-        for (ui32 i = 0; i < 1 + 3 * num_decomps; ++i)
-          if (file->read(&u8_SPqcd[i], 1) != 1)
+        num_subbands = (Lqcd - offset);
+        if (num_subbands == 0)
+          OJPH_ERROR(0x000500AC, "QCC marker segment that specifies no "
+            "quantization informtion");
+        if (num_subbands > 97 || Lqcd != offset + num_subbands)
+          OJPH_ERROR(0x000500A5, "wrong Lqcd value of %d in QCC marker", Lqcd);
+        for (ui32 i = 0; i < num_subbands; ++i)
+          if (file->read(&SPqcd.u8[i], 1) != 1)
             OJPH_ERROR(0x000500A6, "error reading QCC marker");
       }
       else if ((Sqcd & 0x1F) == 1)
       {
-        ui32 offset = num_comps < 257 ? 6 : 7;
-        num_decomps = 0;
-        OJPH_ERROR(0x000500AB, 
+        num_subbands = 0;
+        OJPH_ERROR(0x000500AB,
           "Scalar derived quantization is not supported yet in QCC marker");
         if (Lqcd != offset)
           OJPH_ERROR(0x000500A7, "wrong Lqcc value in QCC marker");
       }
       else if ((Sqcd & 0x1F) == 2)
       {
-        ui32 offset = num_comps < 257 ? 6 : 7;
-        num_decomps = (Lqcd - offset) / 6;
-        if (Lqcd != offset + 6 * num_decomps)
-          OJPH_ERROR(0x000500A8, "wrong Lqcc value in QCC marker");
-        for (ui32 i = 0; i < 1 + 3 * num_decomps; ++i)
+        num_subbands = (Lqcd - offset) / 2;
+        if (num_subbands == 0)
+          OJPH_ERROR(0x000500AD, "QCC marker segment that specifies no "
+            "quantization informtion");
+        if (num_subbands > 97 || Lqcd != offset + 2 * num_subbands)
+          OJPH_ERROR(0x000500A8, "wrong Lqcc value of %d in QCC marker", Lqcd);
+        for (ui32 i = 0; i < num_subbands; ++i)
         {
-          if (file->read(&u16_SPqcd[i], 2) != 2)
+          if (file->read(&SPqcd.u16[i], 2) != 2)
             OJPH_ERROR(0x000500A9, "error reading QCC marker");
-          u16_SPqcd[i] = swap_byte(u16_SPqcd[i]);
+          SPqcd.u16[i] = swap_byte(SPqcd.u16[i]);
         }
       }
       else
         OJPH_ERROR(0x000500AA, "wrong Sqcc value in QCC marker");
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    param_qcd* param_qcd::get_qcc(ui32 comp_idx)
+    {
+      // cast object to constant
+      const param_qcd* const_p = const_cast<const param_qcd*>(this);
+      // call using the constant object, then cast to non-const
+      return const_cast<param_qcd*>(const_p->get_qcc(comp_idx));
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    const param_qcd* param_qcd::get_qcc(ui32 comp_idx) const
+    { // returns the list entry for comp_idx, or the main object if none
+      assert(this->type == QCD_MAIN || this->top_qcd->type == QCD_MAIN);
+      const param_qcd *p, *q;
+      if (this->type == QCD_MAIN)
+        q = p = this;              // search starts at this object
+      else
+        q = p = this->top_qcd;     // otherwise search starts at top_qcd
+      while (p && p->comp_idx != comp_idx)
+        p = p->next;               // advance until comp_idx matches
+      return p ? p : q;            // no match: return the starting object
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    param_qcd* param_qcd::add_qcc_object(ui32 comp_idx)
+    { // appends an entry for comp_idx at the end of the list, returns it
+      assert(type == QCD_MAIN);    // must be called on a QCD_MAIN object
+      param_qcd *p = this;
+      while (p->next != NULL)
+        p = p->next;               // p ends up at the last node
+      if (avail)
+      { // reuse the first node of the avail list
+        p->next = avail;
+        avail = avail->next;
+        p->next->init(this, (ui16)comp_idx);
+      }
+      else
+        p->next = new param_qcd(this, (ui16)comp_idx);
+      return p->next;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //
+    //
+    //
+    //////////////////////////////////////////////////////////////////////////
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_nlt::check_validity(param_siz& siz)
+    {
+      if (is_any_enabled() == false)
+        return;
+
+      if (this->enabled && this->Tnlt == nonlinearity::OJPH_NLT_NO_NLT)
+        this->enabled = false;
+
+      if (this->enabled &&
+          this->Tnlt == nonlinearity::OJPH_NLT_BINARY_COMPLEMENT_NLT)
+      {
+        bool all_same = true;
+        ui32 num_comps = siz.get_num_components();
+
+        // first stage; find out if all components captured by the default
+        // entry (ALL_COMPS) has the same bit_depth/signedness,
+        // while doing this, set the BDnlt for components not captured by the
+        // default entry (ALL_COMPS)
+        ui32 bit_depth = 0;      // unknown yet
+        bool is_signed = false;  // unknown yet
+        for (ui32 c = 0; c < num_comps; ++c)
+        { // captured by ALL_COMPS
+          param_nlt* p = get_nlt_object(c);
+          if (p == NULL || !p->enabled)
+          {
+            if (bit_depth != 0)
+            {
+              // we have seen an undefined component previously
+              all_same = all_same && (bit_depth == siz.get_bit_depth(c));
+              all_same = all_same && (is_signed == siz.is_signed(c));
+            }
+            else
+            {
+              // this is the first component which has not type 3 nlt definition
+              bit_depth = siz.get_bit_depth(c);
+              is_signed = siz.is_signed(c);
+            }
+          }
+          else
+          { // can be type 0 or type 3
+            p->BDnlt = (ui8)(siz.get_bit_depth(c) - 1);
+            p->BDnlt = (ui8)(p->BDnlt | (siz.is_signed(c) ? 0x80 : 0));
+          }
+        }
+
+        if (all_same && bit_depth != 0)
+        { // all the same, and some components are captured by ALL_COMPS
+          this->BDnlt = (ui8)(bit_depth - 1);
+          this->BDnlt = (ui8)(this->BDnlt | (is_signed ? 0x80 : 0));
+        }
+        else if (!all_same)
+        { // have different settings or no component is captured by ALL_COMPS
+          this->enabled = false;
+          for (ui32 c = 0; c < num_comps; ++c)
+          {
+            param_nlt* p = get_nlt_object(c);
+            if (p == NULL || !p->enabled)
+            { // captured by ALL_COMPS
+              if (p == NULL)
+                p = add_object(c);
+              p->enabled = true;
+              p->Tnlt = nonlinearity::OJPH_NLT_BINARY_COMPLEMENT_NLT;
+              p->BDnlt = (ui8)(siz.get_bit_depth(c) - 1);
+              p->BDnlt = (ui8)(p->BDnlt | (siz.is_signed(c) ? 0x80 : 0));
+            }
+          }
+        }
+      }
+      else {
+        // fill NLT segment markers with correct information
+        ui32 num_comps = siz.get_num_components();
+        for (ui32 c = 0; c < num_comps; ++c)
+        { // captured by ALL_COMPS
+          param_nlt* p = get_nlt_object(c);
+          if (p != NULL && p->enabled)
+          { // can be type 0 or type 3
+            p->BDnlt = (ui8)(siz.get_bit_depth(c) - 1);
+            p->BDnlt = (ui8)(p->BDnlt | (siz.is_signed(c) ? 0x80 : 0));
+          }
+        }
+      }
+
+      trim_non_existing_components(siz.get_num_components());
+
+      if (is_any_enabled() == true)
+        siz.set_Rsiz_flag(param_siz::RSIZ_EXT_FLAG | param_siz::RSIZ_NLT_FLAG);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_nlt::set_nonlinear_transform(ui32 comp_num, ui8 nl_type)
+    {
+      if (nl_type != ojph::param_nlt::OJPH_NLT_NO_NLT &&
+          nl_type != ojph::param_nlt::OJPH_NLT_BINARY_COMPLEMENT_NLT)
+      OJPH_ERROR(0x00050171, "Nonliearities other than type 0 "
+        "(No Nonlinearity) or type  3 (Binary Binary Complement to Sign "
+        "Magnitude Conversion) are not supported yet");
+      param_nlt* p = get_nlt_object(comp_num);
+      if (p == NULL)
+        p = add_object(comp_num);
+      p->Tnlt = nl_type;
+      p->enabled = true;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool
+    param_nlt::get_nonlinear_transform(ui32 comp_num, ui8& bit_depth,
+                                       bool& is_signed, ui8& nl_type) const
+    { // fills the outputs and returns true only if an enabled entry applies
+      assert(Cnlt == special_comp_num::ALL_COMPS);
+      const param_nlt* p = get_nlt_object(comp_num);
+      p = (p && p->enabled) ? p : this;  // else fall back to this object
+      if (p->enabled)
+      {
+        bit_depth = (ui8)((p->BDnlt & 0x7F) + 1);      // low 7 bits, plus 1
+        bit_depth = bit_depth <= 38 ? bit_depth : 38;  // clamp to 38
+        is_signed = (p->BDnlt & 0x80) == 0x80;         // top bit of BDnlt
+        nl_type = (nonlinearity)p->Tnlt;
+        return true;
+      }
+      return false;  // outputs are left unmodified
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_nlt::write(outfile_base* file) const
+    { // writes one NLT segment per enabled list entry; true if all succeed
+      if (is_any_enabled() == false)
+        return true;  // nothing to write
+
+      char buf[2];
+      bool result = true;
+      const param_nlt* p = this;
+      while (p)
+      {
+        if (p->enabled)
+        {
+          *(ui16*)buf = JP2K_MARKER::NLT;        // marker code
+          *(ui16*)buf = swap_byte(*(ui16*)buf);  // byte-swapped for output
+          result &= file->write(&buf, 2) == 2;
+          *(ui16*)buf = swap_byte(p->Lnlt);
+          result &= file->write(&buf, 2) == 2;
+          *(ui16*)buf = swap_byte(p->Cnlt);
+          result &= file->write(&buf, 2) == 2;
+          result &= file->write(&p->BDnlt, 1) == 1;  // single bytes, no swap
+          result &= file->write(&p->Tnlt, 1) == 1;
+        }
+        p = p->next;
+      }
+      return result;  // a failed write does not stop the loop above
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_nlt::read(infile_base* file)
+    { // reads 6 bytes of an NLT segment and stores them in the matching entry
+      ui8 buf[6];
+
+      if (file->read(buf, 6) != 6)
+        OJPH_ERROR(0x00050141, "error reading NLT marker segment");
+
+      ui16 length = swap_byte(*(ui16*)buf);  // bytes 0-1: length field
+      if (length != 6 || (buf[5] != 3 && buf[5] != 0)) // wrong length or type
+        OJPH_ERROR(0x00050142, "Unsupported NLT type %d\n", buf[5]);
+
+      ui16 comp = swap_byte(*(ui16*)(buf + 2));  // bytes 2-3: component
+      param_nlt* p = get_nlt_object(comp);
+      if (p == NULL)
+        p = add_object(comp);  // first segment seen for this component
+      p->enabled = true;
+      p->Cnlt = comp;
+      p->BDnlt = buf[4];  // byte 4
+      p->Tnlt = buf[5];   // byte 5; the check above accepts only 0 or 3
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    param_nlt* param_nlt::get_nlt_object(ui32 comp_num)
+    {
+      // cast object to constant
+      const param_nlt* const_p = const_cast<const param_nlt*>(this);
+      // call using the constant object, then cast to non-const
+      return const_cast<param_nlt*>(const_p->get_nlt_object(comp_num));
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    const param_nlt* param_nlt::get_nlt_object(ui32 comp_num) const
+    { // linear search from this object; NULL when comp_num is not found
+      const param_nlt* p = this;
+      while (p && p->Cnlt != comp_num)
+        p = p->next;
+      return p;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    param_nlt* param_nlt::add_object(ui32 comp_num)
+    { // appends a new entry whose Cnlt is comp_num and returns it
+      assert(comp_num != special_comp_num::ALL_COMPS);
+      assert(Cnlt == special_comp_num::ALL_COMPS);
+      param_nlt* p = this;
+      while (p->next != NULL) {
+        assert(p->Cnlt != comp_num);  // note: the last node is not checked
+        p = p->next;
+      }
+      if (avail)
+      { // reuse the first node of the avail list
+        p->next = avail;
+        avail = avail->next;
+        p->next->init();
+      }
+      else
+        p->next = new param_nlt;
+      p = p->next;
+      p->Cnlt = (ui16)comp_num;
+      return p;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_nlt::is_any_enabled() const
+    {
+      // true when this object or any object reachable via next is enabled
+      const param_nlt* p = this;
+      while (p && p->enabled == false)
+        p = p->next;               // skip disabled entries
+      return (p != NULL);          // p is NULL only if none was enabled
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_nlt::trim_non_existing_components(ui32 num_comps)
+    { // disables enabled entries whose Cnlt is not below num_comps
+      param_nlt* p = this->next;   // this object itself is not examined
+      while (p) {
+          if (p->enabled == true && p->Cnlt >= num_comps) {
+            p->enabled = false;    // entry stays in the list, only disabled
+            OJPH_INFO(0x00050161, "The NLT marker segment for the "
+              "non-existing component %d has been removed.", p->Cnlt);
+          }
+        p = p->next;
+      }
+    }
+
+
     //////////////////////////////////////////////////////////////////////////
     //
     //
@@ -1097,10 +2065,8 @@ namespace ojph {
       result &= file->write(&buf, 2) == 2;
       *(ui32*)buf = swap_byte(Psot);
       result &= file->write(&buf, 4) == 4;
-      *(ui8*)buf = TPsot;
-      result &= file->write(&buf, 1) == 1;
-      *(ui8*)buf = TNsot;
-      result &= file->write(&buf, 1) == 1;
+      result &= file->write(&TPsot, 1) == 1;
+      result &= file->write(&TNsot, 1) == 1;
 
       return result;
     }
@@ -1121,10 +2087,8 @@ namespace ojph {
       result &= file->write(&buf, 2) == 2;
       *(ui32*)buf = swap_byte(payload_len + 14);
       result &= file->write(&buf, 4) == 4;
-      *(ui8*)buf = TPsot;
-      result &= file->write(&buf, 1) == 1;
-      *(ui8*)buf = TNsot;
-      result &= file->write(&buf, 1) == 1;
+      result &= file->write(&TPsot, 1) == 1;
+      result &= file->write(&TNsot, 1) == 1;
 
       return result;
     }
@@ -1137,7 +2101,7 @@ namespace ojph {
         if (file->read(&Lsot, 2) != 2)
         {
           OJPH_INFO(0x00050091, "error reading SOT marker");
-          Lsot = 0; Isot = 0; Psot = 0; TPsot = 0; TNsot = 0; 
+          Lsot = 0; Isot = 0; Psot = 0; TPsot = 0; TNsot = 0;
           return false;
         }
         Lsot = swap_byte(Lsot);
@@ -1221,7 +2185,7 @@ namespace ojph {
                    "In any case, this limit means that we have 10922 "
                    "tileparts or more, which is a huge number.");
       this->num_pairs = num_pairs;
-      pairs = (Ttlm_Ptlm_pair*)store;
+      pairs = store;
       Ltlm = (ui16)(4 + 6 * num_pairs);
       Ztlm = 0;
       Stlm = 0x60;
@@ -1260,6 +2224,402 @@ namespace ojph {
       return result;
     }
 
-  }
+    //////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //
+    //
+    //
+    //////////////////////////////////////////////////////////////////////////
+
+    //////////////////////////////////////////////////////////////////////////
+    const param_dfs* param_dfs::get_dfs(int index) const
+    { // returns the entry whose Sdfs equals index, or NULL if there is none
+      const param_dfs* p = this;
+      while (p && p->Sdfs != index)
+        p = p->next;
+      return p;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    param_dfs::dfs_dwt_type param_dfs::get_dwt_type(ui32 decomp_level) const
+    { // 2-bit DWT type of a 1-based level; levels above Ids use entry Ids
+      decomp_level = ojph_min(decomp_level, Ids);
+      ui32 d = decomp_level ? decomp_level - 1 : 0; // 1-based; no wrap at 0
+      ui32 idx = d >> 2;                  // complete bytes
+      ui32 bits = d & 0x3;                // bit within the bytes
+      ui32 val = (Ddfs[idx] >> (6 - 2 * bits)) & 0x3; // fields are MSB-first
+      return (dfs_dwt_type)val;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    ui32 param_dfs::get_subband_idx(ui32 num_decompositions, ui32 resolution,
+                                    ui32 subband) const
+    {
+      assert((resolution == 0 && subband == 0) ||
+              (resolution > 0 && subband > 0 && subband < 4));
+
+      ui32 ns[4] = { 0, 3, 1, 1 };
+
+      ui32 idx = 0;
+      if (resolution > 0)
+      {
+        idx = 0;
+        ui32 i = 1;
+        for (; i < resolution; ++i)
+          idx += ns[get_dwt_type(num_decompositions - i + 1)];
+        dfs_dwt_type t = get_dwt_type(num_decompositions - i + 1);
+        idx += subband;
+        if (t == VERT_DWT && subband == 2)
+          --idx;
+      }
+
+      return idx;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    point param_dfs::get_res_downsamp(ui32 skipped_resolutions) const
+    { // accumulated x/y factor over the first skipped_resolutions levels
+      point factor(1, 1);
+      ui32 decomp_level = 1;       // get_dwt_type levels start from 1
+      while (skipped_resolutions > 0)
+      {
+        param_dfs::dfs_dwt_type type = get_dwt_type(decomp_level);
+        if (type == BIDIR_DWT)
+        { factor.x *= 2; factor.y *= 2; }
+        else if (type == HORZ_DWT)
+          factor.x *= 2;
+        else if (type == VERT_DWT)
+          factor.y *= 2;           // any other type leaves factor unchanged
+
+        ++decomp_level;
+        --skipped_resolutions;
+      }
+      return factor;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_dfs::read(infile_base *file)
+    {
+      if (Ldfs != 0) { // this param_dfs is used
+        param_dfs* p = this;
+        while (p->next != NULL)
+          p = p->next;
+        if (avail)
+        {
+          p->next = avail;
+          avail = avail->next;
+          p->next->init();
+        }
+        else
+          p->next = new param_dfs;
+        p = p->next;
+        return p->read(file);
+      }
+
+      if (file->read(&Ldfs, 2) != 2)
+        OJPH_ERROR(0x000500D1, "error reading DFS-Ldfs parameter");
+      Ldfs = swap_byte(Ldfs);
+      if (file->read(&Sdfs, 2) != 2)
+        OJPH_ERROR(0x000500D2, "error reading DFS-Sdfs parameter");
+      Sdfs = swap_byte(Sdfs);
+      if (Sdfs > 15)
+        OJPH_ERROR(0x000500D3, "The DFS-Sdfs parameter is %d, which is "
+          "larger than the permissible 15", Sdfs);
+      ui8 t, l_Ids = 0;
+      if (file->read(&l_Ids, 1) != 1)
+        OJPH_ERROR(0x000500D4, "error reading DFS-Ids parameter");
+      constexpr int max_Ddfs = sizeof(Ddfs) * 4;
+      if (l_Ids > max_Ddfs)
+        OJPH_INFO(0x000500D5, "The DFS-Ids parameter is %d; while this is "
+          "valid, the number is unnessarily large -- you do not need more "
+          "than %d.  Please contact me regarding this issue.",
+          l_Ids, max_Ddfs);
+      Ids = l_Ids < max_Ddfs ? l_Ids : max_Ddfs;
+      for (int i = 0; i < Ids; i += 4)
+        if (file->read(&Ddfs[i / 4], 1) != 1)
+          OJPH_ERROR(0x000500D6, "error reading DFS-Ddfs parameters");
+      for (int i = Ids; i < l_Ids; i += 4)
+        if (file->read(&t, 1) != 1)
+          OJPH_ERROR(0x000500D7, "error reading DFS-Ddfs parameters");
+      return true;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //
+    //
+    //
+    //////////////////////////////////////////////////////////////////////////
+
+    //////////////////////////////////////////////////////////////////////////
+    param_atk* param_atk::get_atk(int index)
+    {
+      assert(top_atk == NULL);
+
+      if (Latk == 0)
+      {
+        // This atk object is not used, initialize it to either 0 (irv97)
+        // or 1 (rev53), and use it.  If index is not 0 nor 1, then index
+        // must have been read from file previously, otherwise it is an
+        // error.
+        if (index == 0) { this->init_irv97(); return this; }
+        else if (index == 1) { this->init_rev53(); return this; }
+      }
+
+      param_atk* p = this;
+      while (p && p->get_index() != index)
+        p = p->next;
+
+      if (p == NULL && (index == 0 || index == 1))
+      {
+        // The index was not found, add an atk object only if the index is
+        // either 0 or 1
+        p = add_object();
+        if (index == 0)
+          p->init_irv97();
+        else if (index == 1)
+          p->init_rev53();
+      }
+
+      return p;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_atk::read_coefficient(infile_base *file, float &K, si32& bytes)
+    {
+      int coeff_type = get_coeff_type();
+      if (coeff_type == 0) { // 8bit
+        ui8 v;
+        if (file->read(&v, 1) != 1) return false;
+        bytes -= 1;
+        K = v;
+      }
+      else if (coeff_type == 1) { // 16bit
+        ui16 v;
+        if (file->read(&v, 2) != 2) return false;
+        bytes -= 2;
+        K = swap_byte(v);
+      }
+      else if (coeff_type == 2) { // float
+        union {
+          float f;
+          ui32 i;
+        } v;
+        if (file->read(&v.i, 4) != 4) return false;
+        bytes -= 4;
+        v.i = swap_byte(v.i);
+        K = v.f;
+      }
+      else if (coeff_type == 3) { // double
+        union {
+          double d;
+          ui64 i;
+        } v;
+        if (file->read(&v.i, 8) != 8) return false;
+        bytes -= 8;
+        v.i = swap_byte(v.i);
+        K = (float)v.d;
+      }
+      else if (coeff_type == 4) { // 128 bit float
+        ui64 v, v1;
+        if (file->read(&v, 8) != 8) return false;
+        bytes -= 8;
+        if (file->read(&v1, 8) != 8) return false; // v1 not needed
+        bytes -= 8;
+        v = swap_byte(v);
+
+        union {
+          float f;
+          ui32 i;
+        } s;
+        // convert the MSB of 128b float to 32b float
+        // 32b float has 1 sign bit, 8 exponent (offset 127), 23 mantissa
+        // 128b float has 1 sign bit, 15 exponent (offset 16383), 112 mantissa
+        si32 e = (si32)((v >> 48) & 0x7FFF);   // exponent
+        e -= 16383;
+        e += 127;
+        e = e & 0xFF;                          // removes MSBs if negative
+        e <<= 23;                              // move bits to their location
+        s.i = 0;
+        s.i |= ((ui32)(v >> 32) & 0x80000000); // copy sign bit
+        s.i |= (ui32)e;                        // copy exponent
+        s.i |= (ui32)((v >> 25) & 0x007FFFFF); // copy 23 mantissa
+        K = s.f;
+      }
+      return true;
+    }
+
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_atk::read_coefficient(infile_base *file, si16 &K, si32& bytes)
+    {
+      int coeff_type = get_coeff_type();
+      if (coeff_type == 0) {
+        si8 v;
+        if (file->read(&v, 1) != 1) return false;
+        bytes -= 1;
+        K = v;
+      }
+      else if (coeff_type == 1) {
+        si16 v;
+        if (file->read(&v, 2) != 2) return false;
+        bytes -= 2;
+        K = (si16)swap_byte((ui16)v);
+      }
+      else
+        return false;
+      return true;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_atk::read(infile_base *file)
+    { // parses one ATK marker segment into this object, or into a new node
+      if (Latk != 0) // this param_atk is used
+        return add_object()->read(file);  // parse into an appended node
+
+      if (file->read(&Latk, 2) != 2)
+        OJPH_ERROR(0x000500E1, "error reading ATK-Latk parameter");
+      Latk = swap_byte(Latk);
+      si32 bytes = Latk - 2;  // bytes still expected; checked to be 0 at end
+      ojph::ui16 temp_Satk;
+      if (file->read(&temp_Satk, 2) != 2)
+        OJPH_ERROR(0x000500E2, "error reading ATK-Satk parameter");
+      bytes -= 2;
+      temp_Satk = swap_byte(temp_Satk);
+      int tmp_idx = temp_Satk & 0xFF;  // low byte of Satk is the index
+      if ((top_atk && top_atk->get_atk(tmp_idx) != NULL)
+        || tmp_idx == 0 || tmp_idx == 1)
+        OJPH_ERROR(0x000500F3, "ATK-Satk parameter sets ATK marker index to "
+          "the illegal value of %d. ATK-Satk should be in (2-255) and, I "
+          "believe, must not be repeated; otherwise, it would be unclear "
+          "what marker segment must be employed when an index is repeated.",
+          tmp_idx);
+      Satk = temp_Satk;
+      if (is_m_init0() == false)  // only even-indexed is supported
+        OJPH_ERROR(0x000500E3, "ATK-Satk parameter sets m_init to 1, "
+          "requiring odd-indexed subsequence in first reconstruction step, "
+          "which is not supported yet.");
+      if (is_whole_sample() == false)  // ARB filter not supported
+        OJPH_ERROR(0x000500E4, "ATK-Satk parameter specified ARB filter, "
+          "which is not supported yet.");
+      if (is_reversible() && get_coeff_type() >= 2) // reversible & float
+        OJPH_ERROR(0x000500E5, "ATK-Satk parameter does not make sense. "
+          "It employs floats with reversible filtering.");
+      if (is_using_ws_extension() == false)  // only sym. ext is supported
+        OJPH_ERROR(0x000500E6, "ATK-Satk parameter requires constant "
+          "boundary extension, which is not supported yet.");
+      if (is_reversible() == false)  // Katk is read only when irreversible
+        if (read_coefficient(file, Katk, bytes) == false)
+          OJPH_ERROR(0x000500E7, "error reading ATK-Katk parameter");
+      if (file->read(&Natk, 1) != 1)
+        OJPH_ERROR(0x000500E8, "error reading ATK-Natk parameter");
+      bytes -= 1;
+      if (Natk > max_steps) {  // grow the step array to hold Natk entries
+        if (d != d_store) // was this allocated -- very unlikely
+          delete[] d;
+        d = new lifting_step[Natk];
+        max_steps = Natk;
+      }
+
+      if (is_reversible())
+      {
+        for (int s = 0; s < Natk; ++s)  // per step: Eatk, Batk, LCatk, Aatk
+        {
+          if (file->read(&d[s].rev.Eatk, 1) != 1)
+            OJPH_ERROR(0x000500E9, "error reading ATK-Eatk parameter");
+          bytes -= 1;
+          if (file->read(&d[s].rev.Batk, 2) != 2)
+            OJPH_ERROR(0x000500EA, "error reading ATK-Batk parameter");
+          bytes -= 2;
+          d[s].rev.Batk = (si16)swap_byte((ui16)d[s].rev.Batk);
+          ui8 LCatk;
+          if (file->read(&LCatk, 1) != 1)
+            OJPH_ERROR(0x000500EB, "error reading ATK-LCatk parameter");
+          bytes -= 1;
+          if (LCatk == 0)
+            OJPH_ERROR(0x000500EC, "Encountered a ATK-LCatk value of zero; "
+              "something is wrong.");
+          if (LCatk > 1)
+            OJPH_ERROR(0x000500ED, "ATK-LCatk value greater than 1; "
+              "that is, a multitap filter is not supported");
+          if (read_coefficient(file, d[s].rev.Aatk, bytes) == false)
+            OJPH_ERROR(0x000500EE, "Error reding ATK-Aatk parameter");
+        }
+      }
+      else
+      {
+        for (int s = 0; s < Natk; ++s)  // per step: LCatk, Aatk
+        {
+          ui8 LCatk;
+          if (file->read(&LCatk, 1) != 1)
+            OJPH_ERROR(0x000500EF, "error reading ATK-LCatk parameter");
+          bytes -= 1;
+          if (LCatk == 0)
+            OJPH_ERROR(0x000500F0, "Encountered a ATK-LCatk value of zero; "
+              "something is wrong.");
+          if (LCatk > 1)
+            OJPH_ERROR(0x000500F1, "ATK-LCatk value greater than 1; "
+              "that is, a multitap filter is not supported.");
+          if (read_coefficient(file, d[s].irv.Aatk, bytes) == false)
+            OJPH_ERROR(0x000500F2, "Error reding ATK-Aatk parameter");
+        }
+      }
+      if (bytes != 0)  // NOTE(review): code 0x000500F3 is also used above
+        OJPH_ERROR(0x000500F3, "The length of an ATK marker segment "
+          "(ATK-Latk) is not correct");
+
+      return true;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_atk::init_irv97()
+    {
+      Satk = 0x4a00;     // illegal because ATK = 0
+      Katk = (float)1.230174104914001;
+      Natk = 4;
+      // next is (A-4) in T.801 second line
+      Latk = (ui16)(5 + Natk + sizeof(float) * (1 + Natk));
+      d[0].irv.Aatk = (float)0.443506852043971;
+      d[1].irv.Aatk = (float)0.882911075530934;
+      d[2].irv.Aatk = (float)-0.052980118572961;
+      d[3].irv.Aatk = (float)-1.586134342059924;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_atk::init_rev53()
+    {
+      Satk = 0x5801;     // illegal because ATK = 1
+      Natk = 2;
+      // next is (A-4) in T.801 fourth line
+      Latk = (ui16)(5 + 2 * Natk + sizeof(ui8) * (Natk + Natk));
+      d[0].rev.Aatk = 1;
+      d[0].rev.Batk = 2;
+      d[0].rev.Eatk = 2;
+      d[1].rev.Aatk = -1;
+      d[1].rev.Batk = 1;
+      d[1].rev.Eatk = 1;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    param_atk* param_atk::add_object() // appends a node to this ATK list
+    {
+      assert(top_atk == NULL); // compare, do not assign; as in get_atk()
+      param_atk *p = this;
+      while (p->next != NULL) // walk to the current last node
+        p = p->next;
+      if (avail)
+      { // reuse the first node of the avail list
+        p->next = avail;
+        avail = avail->next;
+      }
+      else
+        p->next = new param_atk;
+      p = p->next;
+      p->init(this); // (re)initialize the appended node, passing this object
+      return p;
+    }
 
-}
+  } // !local namespace
+}  // !ojph namespace
diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h
index 2450b00d..2c8a3a0a 100644
--- a/src/core/codestream/ojph_params_local.h
+++ b/src/core/codestream/ojph_params_local.h
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -95,15 +95,26 @@ namespace ojph {
   const char OJPH_PN_STRING_IMF[] = "IMF";
 
   ////////////////////////////////////////////////////////////////////////////
-  enum OJPH_TILEPART_DIVISONS: ui32 {
-    OJPH_TILEPART_NODIVSIONS  = 0x0, // no divisions to tile parts
-    OJPH_TILEPART_RESOLUTIONS = 0x1,
-    OJPH_TILEPART_COMPONENTS  = 0x2,
-    OJPH_TILEPART_LAYERS      = 0x4, // these are meaningless with HTJ2K
+  enum OJPH_TILEPART_DIVISIONS: ui32 {
+    OJPH_TILEPART_NO_DIVISIONS = 0x0, // no divisions to tile parts
+    OJPH_TILEPART_RESOLUTIONS  = 0x1,
+    OJPH_TILEPART_COMPONENTS   = 0x2,
+    OJPH_TILEPART_LAYERS       = 0x4, // these are meaningless with HTJ2K
+    OJPH_TILEPART_MASK         = 0x3, // mask used for testing
   };
 
   namespace local {
 
+    //defined here
+    struct param_siz;
+    struct param_cod;
+    struct param_qcd;
+    struct param_cap;
+    struct param_sot;
+    struct param_tlm;
+    struct param_dfs;
+    struct param_atk;
+
     //////////////////////////////////////////////////////////////////////////
     enum JP2K_MARKER : ui16
     {
@@ -111,6 +122,7 @@ namespace ojph {
       CAP = 0xFF50, //extended capability
       SIZ = 0xFF51, //image and tile size (required)
       COD = 0xFF52, //coding style default (required)
+      COC = 0xFF53, //coding style component
       TLM = 0xFF55, //tile-part lengths
       PRF = 0xFF56, //profile
       PLM = 0xFF57, //packet length, main header
@@ -118,19 +130,21 @@ namespace ojph {
       CPF = 0xFF59, //corresponding profile values
       QCD = 0xFF5C, //qunatization default (required)
       QCC = 0xFF5D, //quantization component
+      RGN = 0xFF5E, //region of interest
+      POC = 0xFF5F, //progression order change
+      PPM = 0xFF60, //packed packet headers, main header
+      PPT = 0xFF61, //packed packet headers, tile-part header
+      CRG = 0xFF63, //component registration
       COM = 0xFF64, //comment
+      DFS = 0xFF72, //downsampling factor styles
+      ADS = 0xFF73, //arbitrary decomposition styles
+      NLT = 0xFF76, //non-linearity point transformation
+      ATK = 0xFF79, //arbitrary transformation kernels
       SOT = 0xFF90, //start of tile-part
       SOP = 0xFF91, //start of packet
       EPH = 0xFF92, //end of packet
       SOD = 0xFF93, //start of data
       EOC = 0xFFD9, //end of codestream (required)
-
-      COC = 0xFF53, //coding style component
-      RGN = 0xFF5E, //region of interest
-      POC = 0xFF5F, //progression order change
-      PPM = 0xFF60, //packed packet headers, main header
-      PPT = 0xFF61, //packed packet headers, tile-part header
-      CRG = 0xFF63, //component registration
     };
 
     //////////////////////////////////////////////////////////////////////////
@@ -153,18 +167,32 @@ namespace ojph {
       friend ::ojph::param_siz;
 
     public:
-      param_siz()
+      enum : ui16 {
+        RSIZ_NLT_FLAG  =  0x200,
+        RSIZ_HT_FLAG   = 0x4000,
+        RSIZ_EXT_FLAG  = 0x8000,
+      };
+
+    public:
+      param_siz() { init(); }
+      ~param_siz() { destroy(); }
+
+      void init()
       {
-        memset(this, 0, sizeof(param_siz));
+        Lsiz = Csiz = 0;
+        Xsiz = Ysiz = XOsiz = YOsiz = XTsiz = YTsiz = XTOsiz = YTOsiz = 0;
+        skipped_resolutions = 0;
+        memset(store, 0, sizeof(store));
+        ws_kern_support_needed = dfs_support_needed = false;
+        cod = NULL;
+        dfs = NULL;
+        Rsiz = RSIZ_HT_FLAG;
         cptr = store;
-        old_Csiz = 4;
-        Rsiz = 0x4000; //for jph, bit 14 of Rsiz is 1
+        old_Csiz = sizeof(store) / sizeof(siz_comp_info);
       }
 
-      ~param_siz()
-      {
-        if (cptr != store) delete[] cptr;
-      }
+      void destroy()
+      { if (cptr != store) { delete[] cptr; cptr = NULL; } }
 
       void set_num_components(ui32 num_comps)
       {
@@ -189,19 +217,35 @@ namespace ojph {
         cptr[comp_num].YRsiz = (ui8)downsampling.y;
       }
 
+      void set_image_extent(point dims) { Xsiz = dims.x; Ysiz = dims.y; }
+      point get_image_extent() const { return point(Xsiz, Ysiz); }
+      void set_tile_size(size s) { XTsiz = s.w; YTsiz = s.h; }
+      size get_tile_size() const { return size(XTsiz, YTsiz); }
+      void set_image_offset(point offset)
+      { XOsiz = offset.x; YOsiz = offset.y; }
+      point get_image_offset() const
+      { return point(XOsiz, YOsiz); }
+      void set_tile_offset(point offset)
+      { XTOsiz = offset.x; YTOsiz = offset.y; }
+      point get_tile_offset() const
+      { return point(XTOsiz, YTOsiz); }
+
+      void set_cod(const param_cod& cod) { this->cod = &cod; }
+
       void check_validity()
       {
-        if (XTsiz == 0 && YTsiz == 0)
-        { XTsiz = Xsiz - XOsiz; YTsiz = Ysiz - YOsiz; }
         if (Xsiz == 0 || Ysiz == 0 || XTsiz == 0 || YTsiz == 0)
-          OJPH_ERROR(0x00040001, 
-            "You cannot set image extent nor tile size to zero");
+          OJPH_ERROR(0x00040001,
+            "Image extent and/or tile size cannot be zero");
         if (XTOsiz > XOsiz || YTOsiz > YOsiz)
           OJPH_ERROR(0x00040002,
-            "tile offset has to be smaller than image offset");
+            "Tile offset has to be smaller than the image offset");
         if (XTsiz + XTOsiz <= XOsiz || YTsiz + YTOsiz <= YOsiz)
           OJPH_ERROR(0x00040003,
-            "the top left tile must intersect with the image");
+            "The top left tile must intersect with the image");
+        if (Xsiz <= XOsiz || Ysiz <= YOsiz)
+          OJPH_ERROR(0x00040004,
+            "The image extent must be larger than the image offset");
       }
 
       ui16 get_num_components() const { return Csiz; }
@@ -224,10 +268,15 @@ namespace ojph {
       bool write(outfile_base *file);
       void read(infile_base *file);
 
+      void link(const param_cod* cod)
+      { this->cod = cod; }
+
+      void link(const param_dfs* dfs)
+      { this->dfs = dfs; }
+
       void set_skipped_resolutions(ui32 skipped_resolutions)
-      {
-        this->skipped_resolutions = skipped_resolutions;
-      }
+      { this->skipped_resolutions = skipped_resolutions; }
+
       ui32 get_width(ui32 comp_num) const
       {
         assert(comp_num < get_num_components());
@@ -235,6 +284,7 @@ namespace ojph {
         ui32 t = ojph_div_ceil(Xsiz, ds) - ojph_div_ceil(XOsiz, ds);
         return t;
       }
+
       ui32 get_height(ui32 comp_num) const
       {
         assert(comp_num < get_num_components());
@@ -242,20 +292,21 @@ namespace ojph {
         ui32 t = ojph_div_ceil(Ysiz, ds) - ojph_div_ceil(YOsiz, ds);
         return t;
       }
+
+      point get_recon_downsampling(ui32 comp_num) const;
+      point get_recon_size(ui32 comp_num) const;
       ui32 get_recon_width(ui32 comp_num) const
-      {
-        assert(comp_num < get_num_components());
-        ui32 ds = (ui32)cptr[comp_num].XRsiz * (1u << skipped_resolutions);
-        ui32 t = ojph_div_ceil(Xsiz, ds) - ojph_div_ceil(XOsiz, ds);
-        return t;
-      }
+      { return get_recon_size(comp_num).x; }
       ui32 get_recon_height(ui32 comp_num) const
-      {
-        assert(comp_num < get_num_components());
-        ui32 ds = (ui32)cptr[comp_num].YRsiz * (1u << skipped_resolutions);
-        ui32 t = ojph_div_ceil(Ysiz, ds) - ojph_div_ceil(YOsiz, ds);
-        return t;
-      }
+      { return get_recon_size(comp_num).y; }
+
+      bool is_ws_kern_support_needed() { return ws_kern_support_needed; }
+      bool is_dfs_support_needed() { return dfs_support_needed; }
+
+      void set_Rsiz_flag(ui16 flag)
+      { Rsiz |= flag; }
+      void reset_Rsiz_flag(ui16 flag)
+      { Rsiz = (ui16)(Rsiz & ~flag); }
 
     private:
       ui16 Lsiz;
@@ -275,6 +326,10 @@ namespace ojph {
       ui32 skipped_resolutions;
       int old_Csiz;
       siz_comp_info store[4];
+      bool ws_kern_support_needed;
+      bool dfs_support_needed;
+      const param_cod* cod;
+      const param_dfs* dfs;
       param_siz(const param_siz&) = delete; //prevent copy constructor
       param_siz& operator=(const param_siz&) = delete; //prevent copy
     };
@@ -288,20 +343,38 @@ namespace ojph {
     ///////////////////////////////////////////////////////////////////////////
     struct cod_SPcod
     {
+      cod_SPcod() {
+        num_decomp = 5;
+        block_width = 4;    // 64
+        block_height = 4;   // 64
+        block_style = 0x40; // HT mode
+        wavelet_trans = 0;  // reversible 5 / 3
+        memset(precinct_size, 0, sizeof(precinct_size));
+      }
+
       ui8 num_decomp;
       ui8 block_width;
       ui8 block_height;
       ui8 block_style;
       ui8 wavelet_trans;
       ui8 precinct_size[33]; //num_decomp is in [0,32]
-    };
 
-    ///////////////////////////////////////////////////////////////////////////
-    typedef cod_SPcod cod_SPcoc;
+      size get_log_block_dims() const
+      { return size(block_width + 2, block_height + 2); }
+      size get_block_dims() const
+      { size t = get_log_block_dims(); return size(1 << t.w, 1 << t.h); }
+      size get_log_precinct_size(ui32 res_num) const
+      {
+        assert(res_num <= num_decomp);
+        size ps(precinct_size[res_num] & 0xF, precinct_size[res_num] >> 4);
+        return ps;
+      }
+    };
 
     ///////////////////////////////////////////////////////////////////////////
     struct cod_SGcod
     {
+      cod_SGcod() : prog_order(OJPH_PO_RPCL), num_layers(1), mc_trans(0) {}
       ui8 prog_order;
       ui16 num_layers;
       ui8 mc_trans;
@@ -310,38 +383,67 @@ namespace ojph {
     ///////////////////////////////////////////////////////////////////////////
     struct param_cod
     {
+      // serves for both COD and COC markers
       friend ::ojph::param_cod;
+      enum default_comp_num : ui16 {
+        OJPH_COD_UNKNOWN = 65534,
+        OJPH_COD_DEFAULT = 65535
+      };
+
+      ////////////////////////////////////////
       enum BLOCK_CODING_STYLES {
         VERT_CAUSAL_MODE = 0x8,
         HT_MODE = 0x40
       };
-    public:
-      param_cod()
+      ////////////////////////////////////////
+      enum cod_type : ui8 {
+        UNDEFINED = 0,
+        COD_MAIN  = 1,
+        COC_MAIN  = 2,
+        COD_TILE  = 3,  // not implemented
+        COC_TILE  = 4   // not implemented
+      };
+      ////////////////////////////////////////
+      enum dwt_type : ui8 {
+        DWT_IRV97 = 0,
+        DWT_REV53 = 1,
+      };
+
+    public: // COD_MAIN and COC_MAIN common functions
+      param_cod(param_cod* top_cod = NULL, ui16 comp_idx = OJPH_COD_DEFAULT)
+      { avail = NULL; init(top_cod, comp_idx); }
+      ~param_cod() { destroy(); }
+
+      ////////////////////////////////////////
+      void restart()
       {
-        memset(this, 0, sizeof(param_cod));
-        SPcod.block_style = HT_MODE;
-        SGCod.prog_order = 2;
-        SGCod.num_layers = 1;
-        SGCod.mc_trans = 0;
-        SPcod.num_decomp = 5;
-        SPcod.block_width = 4; //64
-        SPcod.block_height = 4; //64
-        set_reversible(false);
+        param_cod** p = &avail; // move next to the end of avail
+        while (*p != NULL)
+          p = &((*p)->next);
+        *p = next;
+        this->init(top_cod, OJPH_COD_DEFAULT);
       }
 
+      ////////////////////////////////////////
       void set_reversible(bool reversible)
       {
-        SPcod.wavelet_trans = reversible ? 1 : 0;
+        assert(type == UNDEFINED || type == COD_MAIN || type == COC_MAIN);
+        SPcod.wavelet_trans = reversible ? DWT_REV53 : DWT_IRV97;
       }
 
+      ////////////////////////////////////////
       void employ_color_transform(ui8 val)
       {
         assert(val == 0 || val == 1);
+        assert(type == UNDEFINED || type == COD_MAIN);
         SGCod.mc_trans = val;
       }
 
+      ////////////////////////////////////////
       void check_validity(const param_siz& siz)
       {
+        assert(type == COD_MAIN);
+
         //check that colour transform and match number of components and
         // downsampling
         int num_comps = siz.get_num_components();
@@ -352,21 +454,40 @@ namespace ojph {
 
         if (SGCod.mc_trans == 1)
         {
-          bool test = false;
+          bool test_signedness = false;
+          bool test_bit_depth = false;
+          bool test_downsampling = false;
           point p = siz.get_downsampling(0);
+          ui32 bit_depth = siz.get_bit_depth(0);
+          bool is_signed = siz.is_signed(0);
           for (ui32 i = 1; i < 3; ++i)
           {
             point p1 = siz.get_downsampling(i);
-            test = test || (p.x != p1.x || p.y != p1.y);
+            test_downsampling = test_downsampling
+              || (p.x != p1.x || p.y != p1.y);
+            test_bit_depth = test_bit_depth
+              || (bit_depth != siz.get_bit_depth(i));
+            test_signedness = test_signedness
+              || (is_signed != siz.is_signed(i));
           }
-          if (test)
+          if (test_downsampling)
             OJPH_ERROR(0x00040012,
-              "when color transform is used, the first 3 colour "
-              "components must have the same downsampling.");
+              "when color transform is used, the first 3 colour components "
+              "must have the same downsampling factor.");
+          if (test_bit_depth)
+            OJPH_ERROR(0x00040014,
+              "when color transform is used, the first 3 colour components "
+              "must have the same bit depth.");
+          if (test_signedness)
+            OJPH_ERROR(0x00040015,
+              "when color transform is used, the first 3 colour components "
+              "must have the same signedness (signed or unsigned).");
+
         }
 
         //check the progression order matches downsampling
-        if (SGCod.prog_order == 2 || SGCod.prog_order == 3)
+        if (SGCod.prog_order == OJPH_PO_RPCL ||
+            SGCod.prog_order == OJPH_PO_PCRL)
         {
           ui32 num_comps = siz.get_num_components();
           for (ui32 i = 0; i < num_comps; ++i)
@@ -379,50 +500,170 @@ namespace ojph {
         }
       }
 
+      ////////////////////////////////////////
       ui8 get_num_decompositions() const
-      { return SPcod.num_decomp; }
-      size get_block_dims() const
       {
-        return size(1 << (SPcod.block_width + 2),
-                    1 << (SPcod.block_height + 2));
+        if (type == COD_MAIN)
+          return SPcod.num_decomp;
+        else if (type == COC_MAIN)
+        {
+          if (is_dfs_defined())
+            return top_cod->get_num_decompositions();
+          else
+            return SPcod.num_decomp;
+        }
+        else {
+          assert(0);
+          return 0; // just in case
+        }
       }
-      bool is_reversible() const
-      { return (SPcod.wavelet_trans == 1); }
-      bool is_employing_color_transform() const
-      { return (SGCod.mc_trans == 1); }
+
+      ////////////////////////////////////////
+      size get_block_dims() const
+      { return SPcod.get_block_dims(); }
+
+      ////////////////////////////////////////
       size get_log_block_dims() const
-      { return size(SPcod.block_width + 2, SPcod.block_height + 2); }
+      { return SPcod.get_log_block_dims(); }
+
+      ////////////////////////////////////////
+      ui8 get_wavelet_kern() const
+      { return SPcod.wavelet_trans; }
+
+      ////////////////////////////////////////
+      bool is_reversible() const;
+
+      ////////////////////////////////////////
+      bool is_employing_color_transform() const
+      {
+        if (type == COD_MAIN || type == COD_TILE)
+          return (SGCod.mc_trans == 1);
+        else
+          return top_cod->is_employing_color_transform();
+      }
+
+      ////////////////////////////////////////
       size get_precinct_size(ui32 res_num) const
       {
         size t = get_log_precinct_size(res_num);
-        t.w = 1 << t.w;
-        t.h = 1 << t.h;
-        return t;
+        return size(1 << t.w, 1 << t.h);
       }
+
+      ////////////////////////////////////////
       size get_log_precinct_size(ui32 res_num) const
       {
-        assert(res_num <= SPcod.num_decomp);
-        size ps(15, 15);
         if (Scod & 1)
-        {
-          ps.w = SPcod.precinct_size[res_num] & 0xF;
-          ps.h = SPcod.precinct_size[res_num] >> 4;
-        }
-        return ps;
+          return SPcod.get_log_precinct_size(res_num);
+        else
+          return size(15, 15);
       }
+
+      ////////////////////////////////////////
       bool packets_may_use_sop() const
-      { return (Scod & 2) == 2; }
+      {
+        if (type == COD_MAIN || type == COD_TILE)
+          return (Scod & 2) == 2;
+        return false;
+      }
+
+      ////////////////////////////////////////
       bool packets_use_eph() const
-      { return (Scod & 4) == 4; }
+      {
+        if (type == COD_MAIN || type == COD_TILE)
+          return (Scod & 4) == 4;
+        return false;
+      }
 
+      ////////////////////////////////////////
+      bool get_block_vertical_causality() const
+      { return (SPcod.block_style & local::param_cod::VERT_CAUSAL_MODE) != 0; }
+
+      ////////////////////////////////////////
       bool write(outfile_base *file);
+
+      ////////////////////////////////////////
+      bool write_coc(outfile_base *file, ui32 num_comps);
+
+      ////////////////////////////////////////
       void read(infile_base *file);
 
+      ////////////////////////////////////////
+      void read_coc(infile_base* file, ui32 num_comps, param_cod* top_cod);
+
+      ////////////////////////////////////////
+      void update_atk(param_atk* atk);
+
+      ////////////////////////////////////////
+      const param_cod* get_coc(ui32 comp_idx) const;
+
+      ////////////////////////////////////////
+      param_cod* get_coc(ui32 comp_idx);
+
+      ////////////////////////////////////////
+      param_cod* add_coc_object(ui32 comp_idx);
+
+      ////////////////////////////////////////
+      const param_atk* access_atk() const { return atk; }
+
+    public: // COC_MAIN only functions
+      ////////////////////////////////////////
+      bool is_dfs_defined() const
+      { return (SPcod.num_decomp & 0x80) != 0; }
+
+      ////////////////////////////////////////
+      ui16 get_dfs_index() const  // cannot be more than 15
+      { return SPcod.num_decomp & 0xF; }
+
+      ////////////////////////////////////////
+      ui32 get_comp_idx() const
+      {
+        assert((type == COC_MAIN && comp_idx != OJPH_COD_DEFAULT) ||
+               (type == COD_MAIN && comp_idx == OJPH_COD_DEFAULT));
+        return comp_idx;
+      }
+
+    private:
+      ////////////////////////////////////////
+      void init(param_cod* top_cod, ui16 comp_idx)
+      {
+        type = top_cod ? COC_MAIN : COD_MAIN;
+        Lcod = 0;
+        Scod = 0;
+        next = NULL;
+        atk = NULL;
+        this->top_cod = top_cod;
+        this->comp_idx = comp_idx;
+      }
+
+      ////////////////////////////////////////
+      void destroy() {
+        if (avail)
+          delete avail;
+        if (next) {
+          delete next;
+          next = NULL;
+        }
+      }
+
     private:
-      ui16 Lcod;
-      ui8 Scod;
-      cod_SGcod SGCod;
-      cod_SPcod SPcod;
+      bool internal_write_coc(outfile_base *file, ui32 num_comps);
+
+    ////////////////////////////////////////
+    private: // Common variables
+      cod_type type;        // The type of this cod structure
+      ui16 Lcod;            // serves as Lcod and Lcoc
+      ui8 Scod;             // serves as Scod and Scoc
+      cod_SGcod SGCod;      // Used in COD and copied to COC
+      cod_SPcod SPcod;      // serves as SPcod and SPcoc
+      param_cod* next;      // to chain coc parameters to cod
+      const param_atk* atk; // used to read transform information
+
+    private: // COC only variables
+      param_cod* top_cod;   // parent COD structure
+      ui16 comp_idx;        // component index of this COC structure
+
+    private: // on restart, already allocated param_cod objs are stored here
+      param_cod* avail;
     };
 
     ///////////////////////////////////////////////////////////////////////////
@@ -434,63 +675,119 @@ namespace ojph {
     ///////////////////////////////////////////////////////////////////////////
     struct param_qcd
     {
+      // serves for both QCD and QCC markers
       friend ::ojph::param_qcd;
+      enum default_comp_num : ui16 {
+        OJPH_QCD_UNKNOWN = 65534,
+        OJPH_QCD_DEFAULT = 65535
+      };
+
+      ////////////////////////////////////////
+      enum qcd_type : ui8 {
+        UNDEFINED = 0,
+        QCD_MAIN  = 1,
+        QCC_MAIN  = 2,
+        QCD_TILE  = 3,  // not implemented
+        QCC_TILE  = 4   // not implemented
+      };
+
     public:
-      param_qcd()
-      { 
-        Lqcd = 0;
-        Sqcd = 0;
-        for (int i = 0; i < 97; ++i)
-          u16_SPqcd[i] = 0;
-        num_decomps = 0;
-        base_delta = -1.0f; 
+      param_qcd(param_qcd* top_qcd = NULL, ui16 comp_idx = OJPH_QCD_DEFAULT)
+      { avail = NULL; init(top_qcd, comp_idx); }
+      ~param_qcd() { destroy(); }
+
+      ////////////////////////////////////////
+      void restart()
+      {
+        param_qcd** p = &avail; // move next to the end of avail
+        while (*p != NULL)
+          p = &((*p)->next);
+        *p = next;
+        this->init(top_qcd, OJPH_QCD_DEFAULT);
       }
 
+      void check_validity(const param_siz& siz, const param_cod& cod);
       void set_delta(float delta) { base_delta = delta; }
-      void set_rev_quant(ui32 bit_depth, bool is_employing_color_transform);
-      void set_irrev_quant();
+      void set_delta(ui32 comp_idx, float delta);
+      ui32 get_num_guard_bits() const;
+      ui32 get_MAGB() const;
+      ui32 get_Kmax(const param_dfs* dfs, ui32 num_decompositions,
+                    ui32 resolution, ui32 subband) const;
+      ui32 propose_precision(const param_cod* cod) const;
+      float get_irrev_delta(const param_dfs* dfs,
+                            ui32 num_decompositions,
+                            ui32 resolution, ui32 subband) const;
+      bool write(outfile_base *file);
+      bool write_qcc(outfile_base *file, ui32 num_comps);
+      void read(infile_base *file);
+      void read_qcc(infile_base *file, ui32 num_comps);
+
+      param_qcd* get_qcc(ui32 comp_idx);
+      const param_qcd* get_qcc(ui32 comp_idx) const;
+      param_qcd* add_qcc_object(ui32 comp_idx);
+      ui16 get_comp_idx() const { return comp_idx; }
 
-      void check_validity(const param_siz& siz, const param_cod& cod)
+    private:
+      ////////////////////////////////////////
+      void init(param_qcd* top_qcd, ui16 comp_idx)
       {
-        num_decomps = cod.get_num_decompositions();
-        if (cod.is_reversible())
+        type = top_qcd ? QCC_MAIN : QCD_MAIN;
+        Lqcd = 0;
+        Sqcd = 0;
+        memset(&SPqcd, 0, sizeof(SPqcd));
+        num_subbands = 0;
+        base_delta = -1.0f;
+        enabled = true;
+        next = NULL;
+        this->top_qcd = top_qcd;
+        this->comp_idx = comp_idx;
+      }
+
+      ////////////////////////////////////////
+      void destroy() {
+        if (avail)
+          delete avail;
+        if (next)
         {
-          ui32 bit_depth = 0;
-          for (ui32 i = 0; i < siz.get_num_components(); ++i)
-            bit_depth = ojph_max(bit_depth, siz.get_bit_depth(i));
-          set_rev_quant(bit_depth, cod.is_employing_color_transform());
+          delete next;
+          next = NULL;
         }
-        else
-        {
-          if (base_delta == -1.0f) {
-            ui32 bit_depth = 0;
-            for (ui32 i = 0; i < siz.get_num_components(); ++i)
-              bit_depth =
-                ojph_max(bit_depth, siz.get_bit_depth(i) + siz.is_signed(i));
-            base_delta = 1.0f / (float)(1 << bit_depth);
-          }
-          set_irrev_quant();
-         }
       }
 
-      ui32 get_num_guard_bits() const;
-      ui32 get_MAGBp() const;
-      ui32 get_Kmax(ui32 resolution, ui32 subband) const;
-      float irrev_get_delta(ui32 resolution, ui32 subband) const;
+    private:
+      void set_rev_quant(ui32 num_decomps, ui32 bit_depth,
+                         bool is_employing_color_transform);
+      void set_irrev_quant(ui32 num_decomps);
+      ui32 get_largest_Kmax() const;
+      bool internal_write_qcc(outfile_base *file, ui32 num_comps);
+      void trim_non_existing_components(ui32 num_comps);
 
-      bool write(outfile_base *file);
-      void read(infile_base *file);
+      ui8 decode_SPqcd(ui8 v) const
+      { return (ui8)(v >> 3); }
+      ui8 encode_SPqcd(ui8 v) const
+      { return (ui8)(v << 3); }
 
-    protected:
+    private: // QCD variables
+      qcd_type type;
       ui16 Lqcd;
       ui8 Sqcd;
       union
       {
-        ui8 u8_SPqcd[97];
-        ui16 u16_SPqcd[97];
-      };
-      ui32 num_decomps;
-      float base_delta;
+        ui8 u8[97];
+        ui16 u16[97];
+      } SPqcd;
+      ui32 num_subbands;  // number of subbands
+      float base_delta;   // base quantization step size -- all other
+                          // step sizes are derived from it.
+      bool enabled;       // enabled if true, and ignored if false
+      param_qcd *next;    // pointer to create chains of qcc marker segments
+      param_qcd *top_qcd; // pointer to the top QCD (this is the default)
+
+    private: // QCC only variables
+      ui16 comp_idx;
+
+    private:  // on restart, already allocated param_qcd objs are stored here
+      param_qcd* avail;
     };
 
     ///////////////////////////////////////////////////////////////////////////
@@ -500,18 +797,76 @@ namespace ojph {
     //
     //
     ///////////////////////////////////////////////////////////////////////////
-    struct param_qcc : public param_qcd
+    // data structures used by param_nlt
+    struct param_nlt
     {
-      //friend ::ojph::param_qcc;
+      using special_comp_num = ojph::param_nlt::special_comp_num;
+      using nonlinearity = ojph::param_nlt::nonlinearity;
     public:
-      param_qcc() : param_qcd()
-      { comp_idx = 0; }
+      param_nlt() { avail = NULL; init(); }
+      ~param_nlt() { destroy(); }
+
+      ////////////////////////////////////////
+      void restart()
+      {
+        param_nlt** p = &avail; // move next to the end of avail
+        while (*p != NULL)
+          p = &((*p)->next);
+        *p = next;
+        this->init();
+      }
+
+      void check_validity(param_siz& siz);
+      void set_nonlinear_transform(ui32 comp_num, ui8 nl_type);
+      bool get_nonlinear_transform(ui32 comp_num, ui8& bit_depth,
+                                   bool& is_signed, ui8& nl_type) const;
+      bool write(outfile_base* file) const;
+      void read(infile_base* file);
+
+    private:
+      ////////////////////////////////////////
+      void init()
+      {
+        Lnlt = 6;
+        Cnlt = special_comp_num::ALL_COMPS; // default
+        BDnlt = 0;
+        Tnlt = nonlinearity::OJPH_NLT_UNDEFINED;
+        enabled = false; next = NULL;
+      }
+
+      ////////////////////////////////////////
+      void destroy()
+      {
+        if (avail)
+          delete avail;
+        if (next) {
+          delete next;
+          next = NULL;
+        }
+      }
+
+    private:
+      const param_nlt* get_nlt_object(ui32 comp_num) const;
+      param_nlt* get_nlt_object(ui32 comp_num);
+      param_nlt* add_object(ui32 comp_num);
+      bool is_any_enabled() const;
+      void trim_non_existing_components(ui32 num_comps);
+
+    private:
+      ui16 Lnlt;         // length of the marker segment excluding marker
+      ui16 Cnlt;         // Component involved in the transformation
+      ui8 BDnlt;         // Decoded image component bit depth parameter
+      ui8 Tnlt;          // Type of non-linearity
+      bool enabled;      // true if this object is used
+      param_nlt* next;   // for chaining NLT markers
 
-      ui16 get_comp_num() { return comp_idx; }
-      void read(infile_base *file, ui32 num_comps);
+      // The top level param_nlt object is not allocated, but is part of
+      // codestream, and is used to manage allocated next objects.
+      // next holds a list of param_nlt objects, which are managed by the top
+      // param_nlt object.
 
-    protected:
-        ui16 comp_idx;
+    private: // on restart, already allocated param_nlt objs are stored here
+      param_nlt* avail;
     };
 
     ///////////////////////////////////////////////////////////////////////////
@@ -533,21 +888,19 @@ namespace ojph {
 
       void check_validity(const param_cod& cod, const param_qcd& qcd)
       {
-        if (cod.is_reversible())
+        if (cod.get_wavelet_kern() == param_cod::DWT_REV53)
           Ccap[0] &= 0xFFDF;
         else
           Ccap[0] |= 0x0020;
         Ccap[0] &= 0xFFE0;
         ui32 Bp = 0;
-        ui32 B = qcd.get_MAGBp();
+        ui32 B = qcd.get_MAGB();
         if (B <= 8)
           Bp = 0;
         else if (B < 28)
           Bp = B - 8;
-        else if (B < 48)
-          Bp = 13 + (B >> 2);
         else
-          Bp = 31;
+          Bp = 13 + (B >> 2);
         Ccap[0] = (ui16)(Ccap[0] | (ui16)Bp);
       }
 
@@ -627,9 +980,215 @@ namespace ojph {
       Ttlm_Ptlm_pair* pairs;
       ui32 num_pairs;
       ui32 next_pair_index;
-      
     };
-  }
-}
+
+    ///////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //
+    //
+    //
+    ///////////////////////////////////////////////////////////////////////////
+    struct param_dfs
+    {
+    public:
+      enum dfs_dwt_type : ui8 {
+        NO_DWT    = 0,  // no wavelet transform
+        BIDIR_DWT = 1,  // bidirectional DWT (this is the conventional DWT)
+        HORZ_DWT  = 2,  // horizontal only DWT transform
+        VERT_DWT  = 3,  // vertical only DWT transform
+      };
+
+    public: // member functions
+      param_dfs() { avail = NULL; init(); }
+      ~param_dfs() { destroy(); }
+
+      ////////////////////////////////////////
+      void restart()
+      {
+        param_dfs** p = &avail; // move next to the end of avail
+        while (*p != NULL)
+          p = &((*p)->next);
+        *p = next;
+        this->init();
+      }
+
+      bool read(infile_base *file);
+      bool exists() const { return Ldfs != 0; }
+
+      // get_dfs returns the dfs whose Sdfs == index, or NULL if not found
+      const param_dfs* get_dfs(int index) const;
+      // decomp_level is the decomposition level, starting from 1 for highest
+      // resolution to num_decomps for the coarsest resolution
+      dfs_dwt_type get_dwt_type(ui32 decomp_level) const;
+      ui32 get_subband_idx(ui32 num_decompositions, ui32 resolution,
+                           ui32 subband) const;
+      point get_res_downsamp(ui32 skipped_resolutions) const;
+
+    private:
+      ////////////////////////////////////////
+      void init()
+      { Ldfs = Sdfs = Ids = 0; memset(Ddfs, 0, sizeof(Ddfs)); next = NULL; }
+
+      ////////////////////////////////////////
+      void destroy()
+      {
+        if (avail)
+          delete avail;
+        if (next) {
+          delete next;
+          next = NULL;
+        }
+      }
+
+    private: // member variables
+      ui16 Ldfs;       // length of the segment marker
+      ui16 Sdfs;       // index of this DFS marker segment
+      ui8 Ids;         // number of elements in Ddfs, 2 bits per sub-level
+      ui8 Ddfs[8];     // a string defining number of decomposition sub-levels
+                       // 8 bytes should be enough for 32 levels
+      param_dfs* next; // used for linking other dfs segments
+
+    private: // on restart, already allocated param_dfs objs are stored here
+      param_dfs* avail;
+    };
+
+    ///////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //
+    //
+    //
+    ///////////////////////////////////////////////////////////////////////////
+    // data structures used by param_atk
+
+    union lifting_step {
+      struct irv_data {
+        // si8 Oatk;     // only for arbitrary filter
+        // ui8 LCatk;    // number of lifting coefficients in a step
+        float Aatk;      // lifting coefficient
+      };
+
+      struct rev_data {
+        // si8 Oatk;     // only for arbitrary filter, offset of filter
+        ui8 Eatk;        // only for reversible, epsilon, the power of 2
+        si16 Batk;       // only for reversible, beta, the additive residue
+        // ui8 LCatk;    // number of lifting coefficients in a step
+        si16 Aatk;       // lifting coefficient
+      };
+
+      irv_data irv;
+      rev_data rev;
+    };
+
+    struct param_atk
+    {
+      // Limitations:
+      // Arbitrary filters (ARB) are not supported
+      // Only one coefficient per step -- first order filter
+      // Only even-indexed subsequence in first reconstruction step,
+      //   m_init = 0 is supported
+    public: // member functions
+      param_atk()
+      {
+        d = d_store;
+        max_steps = sizeof(d_store) / sizeof(lifting_step);
+        init(NULL);
+      }
+      ~param_atk()
+      {
+        if (avail) {
+          delete avail;
+          avail = NULL;
+        }
+        if (next) {
+          delete next;
+          next = NULL;
+        }
+        if (d != NULL && d != d_store) {
+          delete[] d;
+          d = d_store;
+          max_steps = sizeof(d_store) / sizeof(lifting_step);
+        }
+      }
+
+      ////////////////////////////////////////
+      void restart()
+      {
+        assert(top_atk == NULL);
+
+        Latk = Satk = 0;
+        Katk = 0.0f;
+        Natk = 0;
+        if (d == NULL || d == d_store) {
+          d = d_store;
+          max_steps = sizeof(d_store) / sizeof(lifting_step);
+        }
+        memset(d, 0, max_steps * sizeof(lifting_step));
+
+        param_atk** p = &avail; // move next to the end of avail
+        while (*p != NULL)
+          p = &((*p)->next);
+        *p = next;
+
+        next = NULL;
+      }
+
+      bool read(infile_base *file);
+
+      ui8 get_index() const { return (ui8)(Satk & 0xFF); }
+      int get_coeff_type() const { return (Satk >> 8) & 0x7; }
+      bool is_whole_sample() const { return (Satk & 0x800) != 0; }
+      bool is_reversible() const { return (Satk & 0x1000) != 0; }
+      bool is_m_init0() const { return (Satk & 0x2000) == 0; }
+      bool is_using_ws_extension() const { return (Satk & 0x4000) != 0; }
+      param_atk* get_atk(int index);
+      const lifting_step* get_step(ui32 s) const
+      { assert(s < Natk); return d + s; }
+      ui32 get_num_steps() const { return Natk; }
+      float get_K() const { return Katk; }
+
+  private:
+      /////////////////////////////////////
+      void init(param_atk* top_atk)
+      {
+        Latk = Satk = 0;
+        Katk = 0.0f;
+        Natk = 0;
+        if (d == NULL || d == d_store) {
+          d = d_store;
+          max_steps = sizeof(d_store) / sizeof(lifting_step);
+        }
+        memset(d, 0, max_steps * sizeof(lifting_step));
+        next = NULL;
+        this->top_atk = top_atk;
+        avail = NULL;
+      }
+  private:
+      bool read_coefficient(infile_base *file, float &K, si32& bytes);
+      bool read_coefficient(infile_base *file, si16 &K, si32& bytes);
+
+      void init_irv97();
+      void init_rev53();
+      param_atk* add_object();
+
+    private: // member variables
+      ui16 Latk;         // structure length
+      ui16 Satk;         // carries a variety of information
+      float Katk;        // only for irreversible scaling factor K
+      ui8 Natk;          // number of lifting steps
+      lifting_step* d;   // pointer to data, initialized to d_store
+      ui32 max_steps;    // maximum number of steps without memory allocation
+      lifting_step d_store[6];   // lifting step coefficient
+      param_atk* next;   // used for chaining if more than one atk segment
+                         // exists in the codestream
+      param_atk* top_atk;// This is the top level atk, from which all atk
+                         // objects are derived
+
+    private: // on restart, already allocated param_atk objs are stored here
+      param_atk* avail;
+    };
+  } // !local namespace
+} // !ojph namespace
 
 #endif // !OJPH_PARAMS_LOCAL_H
diff --git a/src/core/codestream/ojph_precinct.cpp b/src/core/codestream/ojph_precinct.cpp
index 71b61de8..97cd8292 100644
--- a/src/core/codestream/ojph_precinct.cpp
+++ b/src/core/codestream/ojph_precinct.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -98,11 +98,12 @@ namespace ojph {
       coded_lists *cur_coded_list = NULL;
       ui32 cb_bytes = 0; //cb_bytes;
       ui32 ph_bytes = 0; //precinct header size
-      int sst = num_bands == 3 ? 1 : 0;
-      int send = num_bands == 3 ? 4 : 1;
       int num_skipped_subbands = 0;
-      for (int s = sst; s < send; ++s)
+      for (int s = 0; s < 4; ++s)
       {
+        if (bands[s].empty)
+          continue;
+
         if (cb_idxs[s].siz.w == 0 || cb_idxs[s].siz.h == 0)
           continue;
 
@@ -220,7 +221,9 @@ namespace ojph {
               {
                 int num_zeros = *mmsb_tag.get(x>>levm1, y>>levm1, levm1);
                 num_zeros -= *mmsb_tag.get(x>>cur_lev, y>>cur_lev, cur_lev);
-                bb_put_bits(&bb, 1, num_zeros + 1,
+                bb_put_zeros(&bb, num_zeros,
+                  elastic, cur_coded_list, ph_bytes);
+                bb_put_bits(&bb, 1, 1,
                   elastic, cur_coded_list, ph_bytes);
                 *mmsb_tag_flags.get(x>>levm1, y>>levm1, levm1) = 1;
               }
@@ -251,7 +254,7 @@ namespace ojph {
               bits2 = 32 - (int)count_leading_zeros(cp->pass_length[1]);
             int bits = ojph_max(bits1, bits2 - extra_bit) - 3;
             bits = ojph_max(bits, 0);
-            bb_put_bits(&bb, 0xFFFFFFFEu, bits+1, 
+            bb_put_bits(&bb, 0xFFFFFFFEu, bits+1,
               elastic, cur_coded_list, ph_bytes);
 
             bb_put_bits(&bb, cp->pass_length[0], bits+3,
@@ -271,7 +274,7 @@ namespace ojph {
         ph_bytes += cur_coded_list->buf_size - cur_coded_list->avail_size;
       }
 
-      return coded ? cb_bytes + ph_bytes : 1;
+      return coded ? cb_bytes + ph_bytes : 1; // 1 for empty packet
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -288,10 +291,11 @@ namespace ojph {
         }
 
         //write codeblocks
-        int sst = num_bands == 3 ? 1 : 0;
-        int send = num_bands == 3 ? 4 : 1;
-        for (int s = sst; s < send; ++s)
+        for (int s = 0; s < 4; ++s)
         {
+          if (bands[s].empty)
+            continue;
+
           ui32 band_width = bands[s].num_blocks.w;
           ui32 width = cb_idxs[s].siz.w;
           ui32 height = cb_idxs[s].siz.h;
@@ -332,11 +336,12 @@ namespace ojph {
       if (may_use_sop)
         bb_skip_sop(&bb);
 
-      int sst = num_bands == 3 ? 1 : 0;
-      int send = num_bands == 3 ? 4 : 1;
       bool empty_packet = true;
-      for (int s = sst; s < send; ++s)
+      for (int s = 0; s < 4; ++s)
       {
+        if (bands[s].empty)
+          continue;
+
         if (cb_idxs[s].siz.w == 0 || cb_idxs[s].siz.h == 0)
           continue;
 
@@ -458,46 +463,72 @@ namespace ojph {
             }
             cp->num_passes = num_passes;
 
-            //parse pass lengths
-            //for one pass, one length, but for 2 or 3 passes, two lengths
-            int extra_bit = cp->num_passes > 2 ? 1 : 0;
-            int bits1 = 3;
+            // Parse pass lengths
+            // When number of passes is one, one length.
+            // When number of passes is two or three, two lengths.
+            // When number of passes > 3, we have placeholder passes;
+            // In this case, subtract multiples of 3 from the number of
+            // passes; for example, if we have 10 passes, we subtract 9,
+            // producing 1 pass.
+
+            // 1 => 1, 2 => 2, 3 => 3, 4 => 1, 5 => 2, 6 => 3
+            ui32 num_phld_passes = (num_passes - 1) / 3;
+            cp->missing_msbs += num_phld_passes;
+
+            num_phld_passes *= 3;
+            cp->num_passes = num_passes - num_phld_passes;
+            cp->pass_length[0] = cp->pass_length[1] = 0;
+
+            int Lblock = 3;
             bit = 1;
             while (bit)
             {
+              // add any extra bits here
               if (bb_read_bit(&bb, bit) == false)
               { data_left = 0; throw "error reading from file p8"; }
-              bits1 += bit;
+              Lblock += bit;
             }
 
-            if (bb_read_bits(&bb, bits1, bit) == false)
+            int bits = Lblock + 31 -
+              (int)count_leading_zeros(num_phld_passes + 1);
+            if (bb_read_bits(&bb, bits, bit) == false)
             { data_left = 0; throw "error reading from file p9"; }
-            if (bit < 2) { 
+            if (bit < 2)
               throw "The cleanup segment of an HT codeblock cannot contain "
                 "less than 2 bytes";
-            }
-            if (bit >= 65535) {
+            if (bit >= 65535)
               throw "The cleanup segment of an HT codeblock must contain "
                 "less than 65535 bytes";
-            }
             cp->pass_length[0] = bit;
-            if (num_passes > 1)
+
+            if (cp->num_passes > 1)
             {
-              if (bb_read_bits(&bb, bits1 + extra_bit, bit) == false)
+              //bits = Lblock + 31 - count_leading_zeros(cp->num_passes - 1);
+              // The following is simpler than the above, I think?
+              bits = Lblock + (cp->num_passes > 2 ? 1 : 0);
+              if (bb_read_bits(&bb, bits, bit) == false)
               { data_left = 0; throw "error reading from file p10"; }
-              if (bit >= 2047) {
+              if (bit >= 2047)
                 throw "The refinement segment (SigProp and MagRep passes) of "
                   "an HT codeblock must contain less than 2047 bytes";
-              }
               cp->pass_length[1] = bit;
             }
           }
         }
       }
+      if (empty_packet)
+      { // all subbands are empty
+        ui32 bit = 0;
+        bb_read_bit(&bb, bit);
+        //assert(bit == 0);
+      }
       bb_terminate(&bb, uses_eph);
       //read codeblock data
-      for (int s = sst; s < send; ++s)
+      for (int s = 0; s < 4; ++s)
       {
+        if (bands[s].empty)
+          continue;
+
         ui32 band_width = bands[s].num_blocks.w;
         ui32 width = cb_idxs[s].siz.w;
         ui32 height = cb_idxs[s].siz.h;
@@ -518,7 +549,7 @@ namespace ojph {
                   ui32 t = ojph_min(num_bytes, bb.bytes_left);
                   file->seek(t, infile_base::OJPH_SEEK_CUR);
                   ui32 bytes_read = (ui32)(file->tell() - cur_loc);
-                  cp->pass_length[0] = cp->pass_length[1] = 0; 
+                  cp->pass_length[0] = cp->pass_length[1] = 0;
                   bb.bytes_left -= bytes_read;
                   assert(bytes_read == t || bb.bytes_left == 0);
                 }
diff --git a/src/core/codestream/ojph_precinct.h b/src/core/codestream/ojph_precinct.h
index 4641ed68..47ec4736 100644
--- a/src/core/codestream/ojph_precinct.h
+++ b/src/core/codestream/ojph_precinct.h
@@ -59,7 +59,7 @@ namespace ojph {
     {
       precinct() {
         scratch = NULL; bands = NULL; coded = NULL;
-        num_bands = 0; may_use_sop = uses_eph = false;
+        may_use_sop = uses_eph = false;
       }
       ui32 prepare_precinct(int tag_tree_size, ui32* lev_idx,
                             mem_elastic_allocator *elastic);
@@ -69,11 +69,10 @@ namespace ojph {
                  ui32& data_left, infile_base *file, bool skipped);
 
       ui8 *scratch;
-      point img_point;   //the precinct projected to full resolution
+      point img_point; //the precinct projected to full resolution
       rect cb_idxs[4]; //indices of codeblocks
       subband *bands;  //the subbands
       coded_lists* coded;
-      ui32 num_bands;
       bool may_use_sop, uses_eph;
     };
 
diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index bde1b511..59a3dfb6 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -38,6 +38,7 @@
 
 #include <climits>
 #include <cmath>
+#include <new>
 
 #include "ojph_mem.h"
 #include "ojph_params.h"
@@ -54,78 +55,52 @@ namespace ojph {
 
   namespace local
   {
-
-    //////////////////////////////////////////////////////////////////////////
-    static void rotate_buffers(line_buf* line1, line_buf* line2,
-                               line_buf* line3, line_buf* line4)
-    {
-      assert(line1->size == line2->size &&
-             line1->pre_size == line2->pre_size &&
-             line1->size == line3->size &&
-             line1->pre_size == line3->pre_size &&
-             line1->size == line4->size &&
-             line1->pre_size == line4->pre_size);
-      si32* p = line4->i32;
-      line4->i32 = line3->i32;
-      line3->i32 = line2->i32;
-      line2->i32 = line1->i32;
-      line1->i32 = p;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    static void rotate_buffers(line_buf* line1, line_buf* line2,
-                               line_buf* line3, line_buf* line4,
-                               line_buf* line5, line_buf* line6)
-    {
-      assert(line1->size == line2->size &&
-             line1->pre_size == line2->pre_size &&
-             line1->size == line3->size &&
-             line1->pre_size == line3->pre_size &&
-             line1->size == line4->size &&
-             line1->pre_size == line4->pre_size &&
-             line1->size == line5->size &&
-             line1->pre_size == line5->pre_size &&
-             line1->size == line6->size &&
-             line1->pre_size == line6->pre_size);
-      si32* p = line6->i32;
-      line6->i32 = line5->i32;
-      line5->i32 = line4->i32;
-      line4->i32 = line3->i32;
-      line3->i32 = line2->i32;
-      line2->i32 = line1->i32;
-      line1->i32 = p;
-    }
-
     //////////////////////////////////////////////////////////////////////////
     void resolution::pre_alloc(codestream* codestream, const rect& res_rect,
-                               const rect& recon_res_rect, ui32 res_num)
+                               const rect& recon_res_rect, 
+                               ui32 comp_num, ui32 res_num)
     {
       mem_fixed_allocator* allocator = codestream->get_allocator();
-      const param_cod* cdp = codestream->get_cod();
-      ui32 t = codestream->get_cod()->get_num_decompositions()
-             - codestream->get_skipped_res_for_recon();
+      const param_cod* cdp = codestream->get_coc(comp_num);
+      ui32 num_decomps = cdp->get_num_decompositions();
+      ui32 t = num_decomps - codestream->get_skipped_res_for_recon();
       bool skipped_res_for_recon = res_num > t;
 
-      //create next resolution
+      const param_atk* atk = cdp->access_atk();
+      param_dfs::dfs_dwt_type ds = param_dfs::BIDIR_DWT;
+      if (cdp->is_dfs_defined()) {
+        const param_dfs* dfs = codestream->access_dfs();
+        if (dfs == NULL) {
+          OJPH_ERROR(0x00070001, "There is a problem with codestream "
+            "marker segments. COD/COC specifies the use of a DFS marker "
+            "but there are no DFS markers within the main codestream "
+            "headers");
+        }
+        else {
+          ui16 dfs_idx = cdp->get_dfs_index();
+          dfs = dfs->get_dfs(dfs_idx);
+          if (dfs == NULL) {
+            OJPH_ERROR(0x00070002, "There is a problem with codestream "
+              "marker segments. COD/COC specifies the use of a DFS marker "
+              "with index %d, but there are no such marker within the "
+              "main codestream headers", dfs_idx);
+          }
+          ds = dfs->get_dwt_type(num_decomps - res_num + 1);
+        }
+      }
+
+      ui32 transform_flags = 0;
       if (res_num > 0)
       {
-        //allocate a resolution
-        allocator->pre_alloc_obj<resolution>(1);
-        ui32 trx0 = ojph_div_ceil(res_rect.org.x, 2);
-        ui32 try0 = ojph_div_ceil(res_rect.org.y, 2);
-        ui32 trx1 = ojph_div_ceil(res_rect.org.x + res_rect.siz.w, 2);
-        ui32 try1 = ojph_div_ceil(res_rect.org.y + res_rect.siz.h, 2);
-        rect next_res_rect;
-        next_res_rect.org.x = trx0;
-        next_res_rect.org.y = try0;
-        next_res_rect.siz.w = trx1 - trx0;
-        next_res_rect.siz.h = try1 - try0;
-
-        resolution::pre_alloc(codestream, next_res_rect,
-          skipped_res_for_recon ? recon_res_rect : next_res_rect, res_num - 1);
+        if (ds == param_dfs::BIDIR_DWT)
+          transform_flags = HORZ_TRX | VERT_TRX;
+        else if (ds == param_dfs::HORZ_DWT)
+          transform_flags = HORZ_TRX;
+        else if (ds == param_dfs::VERT_DWT)
+          transform_flags = VERT_TRX;
       }
 
-      //allocate subbands
+      //allocate resolution/subbands
       ui32 trx0 = res_rect.org.x;
       ui32 try0 = res_rect.org.y;
       ui32 trx1 = res_rect.org.x + res_rect.siz.w;
@@ -133,23 +108,83 @@ namespace ojph {
       allocator->pre_alloc_obj<subband>(4);
       if (res_num > 0)
       {
-        for (ui32 i = 1; i < 4; ++i)
+        if (ds == param_dfs::BIDIR_DWT)
+        {
+          for (ui32 i = 0; i < 4; ++i)
+          {
+            ui32 tbx0 = (trx0 - (i & 1) + 1) >> 1;
+            ui32 tbx1 = (trx1 - (i & 1) + 1) >> 1;
+            ui32 tby0 = (try0 - (i >> 1) + 1) >> 1;
+            ui32 tby1 = (try1 - (i >> 1) + 1) >> 1;
+
+            rect re;
+            re.org.x = tbx0;
+            re.org.y = tby0;
+            re.siz.w = tbx1 - tbx0;
+            re.siz.h = tby1 - tby0;
+            if (i == 0) {
+              allocator->pre_alloc_obj<resolution>(1);
+              resolution::pre_alloc(codestream, re,
+                skipped_res_for_recon ? recon_res_rect : re,
+                comp_num, res_num - 1);
+            }
+            else
+              subband::pre_alloc(codestream, re, comp_num, res_num,
+                                 transform_flags);
+          }
+        }
+        else if (ds == param_dfs::VERT_DWT)
         {
-          ui32 tbx0 = (trx0 - (i & 1) + 1) >> 1;
-          ui32 tbx1 = (trx1 - (i & 1) + 1) >> 1;
-          ui32 tby0 = (try0 - (i >> 1) + 1) >> 1;
-          ui32 tby1 = (try1 - (i >> 1) + 1) >> 1;
-
-          rect band_rect;
-          band_rect.org.x = tbx0;
-          band_rect.org.y = tby0;
-          band_rect.siz.w = tbx1 - tbx0;
-          band_rect.siz.h = tby1 - tby0;
-          subband::pre_alloc(codestream, band_rect, res_num);
+          ui32 tby0, tby1;
+          rect re = res_rect;
+          tby0 = (try0 + 1) >> 1;
+          tby1 = (try1 + 1) >> 1;
+          re.org.y = tby0;
+          re.siz.h = tby1 - tby0;
+          allocator->pre_alloc_obj<resolution>(1);
+          resolution::pre_alloc(codestream, re,
+            skipped_res_for_recon ? recon_res_rect : re,
+            comp_num, res_num - 1);
+
+          tby0 = try0 >> 1;
+          tby1 = try1 >> 1;
+          re.org.y = tby0;
+          re.siz.h = tby1 - tby0;
+          subband::pre_alloc(codestream, re, comp_num, res_num, 
+                             transform_flags);
+        }
+        else if (ds == param_dfs::HORZ_DWT)
+        {
+          ui32 tbx0, tbx1;
+          rect re = res_rect;
+          tbx0 = (trx0 + 1) >> 1;
+          tbx1 = (trx1 + 1) >> 1;
+          re.org.x = tbx0;
+          re.siz.w = tbx1 - tbx0;
+          allocator->pre_alloc_obj<resolution>(1);
+          resolution::pre_alloc(codestream, re,
+            skipped_res_for_recon ? recon_res_rect : re,
+            comp_num, res_num - 1);
+
+          tbx0 = trx0 >> 1;
+          tbx1 = trx1 >> 1;
+          re.org.x = tbx0;
+          re.siz.w = tbx1 - tbx0;
+          subband::pre_alloc(codestream, re, comp_num, res_num, 
+                             transform_flags);
+        }
+        else
+        {
+          assert(ds == param_dfs::NO_DWT);
+          allocator->pre_alloc_obj<resolution>(1);
+          resolution::pre_alloc(codestream, res_rect,
+            skipped_res_for_recon ? recon_res_rect : res_rect,
+            comp_num, res_num - 1);
         }
       }
       else
-        subband::pre_alloc(codestream, res_rect, res_num);
+        subband::pre_alloc(codestream, res_rect, comp_num, res_num, 
+                           transform_flags);
 
       //prealloc precincts
       size log_PP = cdp->get_log_precinct_size(res_num);
@@ -160,19 +195,44 @@ namespace ojph {
         num_precincts.w -= trx0 >> log_PP.w;
         num_precincts.h = (try1 + (1 << log_PP.h) - 1) >> log_PP.h;
         num_precincts.h -= try0 >> log_PP.h;
-        allocator->pre_alloc_obj<precinct>(num_precincts.area());
+        allocator->pre_alloc_obj<precinct>((size_t)num_precincts.area());
       }
 
       //allocate lines
       if (skipped_res_for_recon == false)
       {
-        bool reversible = cdp->is_reversible();
-        ui32 num_lines = reversible ? 4 : 6;
-        allocator->pre_alloc_obj<line_buf>(num_lines);
+        ui32 num_steps = atk->get_num_steps();
+        allocator->pre_alloc_obj<line_buf>(num_steps + 2);
+        allocator->pre_alloc_obj<lifting_buf>(num_steps + 2);
+
+        const param_qcd* qp = codestream->access_qcd()->get_qcc(comp_num);
+        ui32 precision = qp->propose_precision(cdp);
+        const param_atk* atk = cdp->access_atk();
+        bool reversible = atk->is_reversible();
 
         ui32 width = res_rect.siz.w + 1;
-        for (ui32 i = 0; i < num_lines; ++i)
-          allocator->pre_alloc_data<si32>(width, 1);
+        if (reversible)
+        {
+          if (precision <= 32) {
+            for (ui32 i = 0; i < num_steps; ++i)
+              allocator->pre_alloc_data<si32>(width, 1);
+            allocator->pre_alloc_data<si32>(width, 1);
+            allocator->pre_alloc_data<si32>(width, 1);
+          }
+          else 
+          {
+            for (ui32 i = 0; i < num_steps; ++i)
+              allocator->pre_alloc_data<si64>(width, 1);
+            allocator->pre_alloc_data<si64>(width, 1);
+            allocator->pre_alloc_data<si64>(width, 1);
+          }
+        }
+        else {
+          for (ui32 i = 0; i < num_steps; ++i)
+            allocator->pre_alloc_data<float>(width, 1);
+          allocator->pre_alloc_data<float>(width, 1);
+          allocator->pre_alloc_data<float>(width, 1);
+        }
       }
     }
 
@@ -181,18 +241,18 @@ namespace ojph {
                                     const rect& res_rect,
                                     const rect& recon_res_rect,
                                     ui32 comp_num, ui32 res_num,
-                                    point comp_downsamp,
+                                    point comp_downsamp, point res_downsamp,
                                     tile_comp* parent_tile_comp,
                                     resolution* parent_res)
     {
       mem_fixed_allocator* allocator = codestream->get_allocator();
       elastic = codestream->get_elastic_alloc();
-      ui32 t, num_decomps = codestream->get_cod()->get_num_decompositions();
+      const param_cod* cdp = codestream->get_coc(comp_num);
+      ui32 t, num_decomps = cdp->get_num_decompositions();
       t = num_decomps - codestream->get_skipped_res_for_recon();
       skipped_res_for_recon = res_num > t;
       t = num_decomps - codestream->get_skipped_res_for_read();
       skipped_res_for_read = res_num > t;
-      const param_cod* cdp = codestream->get_cod();
 
       this->comp_downsamp = comp_downsamp;
       this->parent_comp = parent_tile_comp;
@@ -201,54 +261,139 @@ namespace ojph {
       this->comp_num = comp_num;
       this->res_num = res_num;
       this->num_bytes = 0;
-      //finalize next resolution
+      this->atk = cdp->access_atk();
+      param_dfs::dfs_dwt_type ds = param_dfs::BIDIR_DWT;
+      if (cdp->is_dfs_defined()) {
+        const param_dfs* dfs = codestream->access_dfs();
+        if (dfs == NULL) {
+          OJPH_ERROR(0x00070011, "There is a problem with codestream "
+            "marker segments. COD/COC specifies the use of a DFS marker "
+            "but there are no DFS markers within the main codestream "
+            "headers");
+        }
+        else {
+          ui16 dfs_idx = cdp->get_dfs_index();
+          dfs = dfs->get_dfs(dfs_idx);
+          if (dfs == NULL) {
+            OJPH_ERROR(0x00070012, "There is a problem with codestream "
+              "marker segments. COD/COC specifies the use of a DFS marker "
+              "with index %d, but there are no such marker within the "
+              "main codestream headers", dfs_idx);
+          }
+          ui32 num_decomps = cdp->get_num_decompositions();
+          ds = dfs->get_dwt_type(num_decomps - res_num + 1);
+        }
+      }
+
+      transform_flags = 0;
       if (res_num > 0)
       {
-        //allocate a resolution
-        child_res = allocator->post_alloc_obj<resolution>(1);
-        ui32 trx0 = ojph_div_ceil(res_rect.org.x, 2);
-        ui32 try0 = ojph_div_ceil(res_rect.org.y, 2);
-        ui32 trx1 = ojph_div_ceil(res_rect.org.x + res_rect.siz.w, 2);
-        ui32 try1 = ojph_div_ceil(res_rect.org.y + res_rect.siz.h, 2);
-        rect next_res_rect;
-        next_res_rect.org.x = trx0;
-        next_res_rect.org.y = try0;
-        next_res_rect.siz.w = trx1 - trx0;
-        next_res_rect.siz.h = try1 - try0;
-
-        child_res->finalize_alloc(codestream, next_res_rect,
-          skipped_res_for_recon ? recon_res_rect : next_res_rect, comp_num,
-          res_num - 1, comp_downsamp, parent_tile_comp, this);
+        if (ds == param_dfs::BIDIR_DWT)
+          transform_flags = HORZ_TRX | VERT_TRX;
+        else if (ds == param_dfs::HORZ_DWT)
+          transform_flags = HORZ_TRX;
+        else if (ds == param_dfs::VERT_DWT)
+          transform_flags = VERT_TRX;
       }
-      else
-        child_res = NULL;
 
-      //allocate subbands
+      //allocate resolution/subbands
       ui32 trx0 = res_rect.org.x;
       ui32 try0 = res_rect.org.y;
       ui32 trx1 = res_rect.org.x + res_rect.siz.w;
       ui32 try1 = res_rect.org.y + res_rect.siz.h;
       bands = allocator->post_alloc_obj<subband>(4);
+      for (int i = 0; i < 4; ++i)
+        new (bands + i) subband;
       if (res_num > 0)
       {
-        this->num_bands = 3;
-        for (ui32 i = 1; i < 4; ++i)
+        if (ds == param_dfs::BIDIR_DWT)
+        {
+          for (ui32 i = 0; i < 4; ++i)
+          {
+            ui32 tbx0 = (trx0 - (i & 1) + 1) >> 1;
+            ui32 tbx1 = (trx1 - (i & 1) + 1) >> 1;
+            ui32 tby0 = (try0 - (i >> 1) + 1) >> 1;
+            ui32 tby1 = (try1 - (i >> 1) + 1) >> 1;
+
+            rect re;
+            re.org.x = tbx0;
+            re.org.y = tby0;
+            re.siz.w = tbx1 - tbx0;
+            re.siz.h = tby1 - tby0;
+            if (i == 0) {
+              point next_res_downsamp;
+              next_res_downsamp.x = res_downsamp.x * 2;
+              next_res_downsamp.y = res_downsamp.y * 2;
+
+              child_res = allocator->post_alloc_obj<resolution>(1);
+              child_res->finalize_alloc(codestream, re,
+                skipped_res_for_recon ? recon_res_rect : re, comp_num,
+                res_num - 1, comp_downsamp, next_res_downsamp, 
+                parent_tile_comp, this);
+            }
+            else
+              bands[i].finalize_alloc(codestream, re, this, res_num, i);
+          }
+        }
+        else if (ds == param_dfs::VERT_DWT)
         {
-          ui32 tbx0 = (trx0 - (i & 1) + 1) >> 1;
-          ui32 tbx1 = (trx1 - (i & 1) + 1) >> 1;
-          ui32 tby0 = (try0 - (i >> 1) + 1) >> 1;
-          ui32 tby1 = (try1 - (i >> 1) + 1) >> 1;
-
-          rect band_rect;
-          band_rect.org.x = tbx0;
-          band_rect.org.y = tby0;
-          band_rect.siz.w = tbx1 - tbx0;
-          band_rect.siz.h = tby1 - tby0;
-          bands[i].finalize_alloc(codestream, band_rect, this, res_num, i);
+          ui32 tby0, tby1;
+          rect re = res_rect;
+          tby0 = (try0 + 1) >> 1;
+          tby1 = (try1 + 1) >> 1;
+          re.org.y = tby0;
+          re.siz.h = tby1 - tby0;
+
+          point next_res_downsamp;
+          next_res_downsamp.x = res_downsamp.x;
+          next_res_downsamp.y = res_downsamp.y * 2;
+          child_res = allocator->post_alloc_obj<resolution>(1);
+          child_res->finalize_alloc(codestream, re,
+            skipped_res_for_recon ? recon_res_rect : re, comp_num,
+            res_num - 1, comp_downsamp, next_res_downsamp,
+            parent_tile_comp, this);
+
+          tby0 = try0 >> 1;
+          tby1 = try1 >> 1;
+          re.org.y = tby0;
+          re.siz.h = tby1 - tby0;
+          bands[2].finalize_alloc(codestream, re, this, res_num, 2);
+        }
+        else if (ds == param_dfs::HORZ_DWT)
+        {
+          ui32 tbx0, tbx1;
+          rect re = res_rect;
+          tbx0 = (trx0 + 1) >> 1;
+          tbx1 = (trx1 + 1) >> 1;
+          re.org.x = tbx0;
+          re.siz.w = tbx1 - tbx0;
+
+          point next_res_downsamp;
+          next_res_downsamp.x = res_downsamp.x * 2;
+          next_res_downsamp.y = res_downsamp.y;
+          child_res = allocator->post_alloc_obj<resolution>(1);
+          child_res->finalize_alloc(codestream, re,
+            skipped_res_for_recon ? recon_res_rect : re, comp_num,
+            res_num - 1, comp_downsamp, next_res_downsamp,
+            parent_tile_comp, this);
+
+          tbx0 = trx0 >> 1;
+          tbx1 = trx1 >> 1;
+          re.org.x = tbx0;
+          re.siz.w = tbx1 - tbx0;
+          bands[1].finalize_alloc(codestream, re, this, res_num, 1);
+        }
+        else
+        {
+          assert(ds == param_dfs::NO_DWT);
+          child_res = allocator->post_alloc_obj<resolution>(1);
+          child_res->finalize_alloc(codestream, res_rect,
+            skipped_res_for_recon ? recon_res_rect : res_rect, comp_num,
+            res_num - 1, comp_downsamp, res_downsamp, parent_tile_comp, this);
         }
       }
       else {
-        this->num_bands = 1;
+        child_res = NULL;
         bands[0].finalize_alloc(codestream, res_rect, this, res_num, 0);
       }
 
@@ -262,7 +407,8 @@ namespace ojph {
         num_precincts.w -= trx0 >> log_PP.w;
         num_precincts.h = (try1 + (1 << log_PP.h) - 1) >> log_PP.h;
         num_precincts.h -= try0 >> log_PP.h;
-        precincts = allocator->post_alloc_obj<precinct>(num_precincts.area());
+        precincts = 
+          allocator->post_alloc_obj<precinct>((size_t)num_precincts.area());
         ui64 num = num_precincts.area();
         for (ui64 i = 0; i < num; ++i)
           precincts[i] = precinct();
@@ -272,11 +418,7 @@ namespace ojph {
       ui32 x_lower_bound = (trx0 >> log_PP.w) << log_PP.w;
       ui32 y_lower_bound = (try0 >> log_PP.h) << log_PP.h;
 
-      point proj_factor;
-      proj_factor.x = comp_downsamp.x * (1 << (num_decomps - res_num));
-      proj_factor.y = comp_downsamp.y * (1 << (num_decomps - res_num));
       precinct* pp = precincts;
-
       point tile_top_left = parent_tile_comp->get_tile()->get_tile_rect().org;
       for (ui32 y = 0; y < num_precincts.h; ++y)
       {
@@ -284,11 +426,10 @@ namespace ojph {
         for (ui32 x = 0; x < num_precincts.w; ++x, ++pp)
         {
           ui32 ppx0 = x_lower_bound + (x << log_PP.w);
-          point t(proj_factor.x * ppx0, proj_factor.y * ppy0);
+          point t(res_downsamp.x * ppx0, res_downsamp.y * ppy0);
           t.x = t.x > tile_top_left.x ? t.x : tile_top_left.x;
           t.y = t.y > tile_top_left.y ? t.y : tile_top_left.y;
           pp->img_point = t;
-          pp->num_bands = num_bands;
           pp->bands = bands;
           pp->may_use_sop = cdp->packets_may_use_sop();
           pp->uses_eph = cdp->packets_use_eph();
@@ -296,15 +437,15 @@ namespace ojph {
           pp->coded = NULL;
         }
       }
-      if (num_bands == 1)
-        bands[0].get_cb_indices(num_precincts, precincts);
-      else
-        for (int i = 1; i < 4; ++i)
+      for (int i = 0; i < 4; ++i)
+        if (bands[i].exists())
           bands[i].get_cb_indices(num_precincts, precincts);
 
+      // determine how to divide scratch into multiple levels of
+      // tag trees
       size log_cb = cdp->get_log_block_dims();
-      log_PP.w -= (res_num ? 1 : 0);
-      log_PP.h -= (res_num ? 1 : 0);
+      log_PP.w -= (transform_flags & HORZ_TRX) ? 1 : 0;
+      log_PP.h -= (transform_flags & VERT_TRX) ? 1 : 0;
       size ratio;
       ratio.w = log_PP.w - ojph_min(log_cb.w, log_PP.w);
       ratio.h = log_PP.h - ojph_min(log_cb.h, log_PP.h);
@@ -320,26 +461,95 @@ namespace ojph {
       //allocate lines
       if (skipped_res_for_recon == false)
       {
-        this->reversible = cdp->is_reversible();
-        this->num_lines = this->reversible ? 4 : 6;
-        lines = allocator->post_alloc_obj<line_buf>(num_lines);
+        this->atk = cdp->access_atk();
+        this->reversible = atk->is_reversible();
+        this->num_steps = atk->get_num_steps();
+        // create line buffers and lifting_bufs
+        lines = allocator->post_alloc_obj<line_buf>(num_steps + 2);
+        ssp = allocator->post_alloc_obj<lifting_buf>(num_steps + 2);
+        sig = ssp + num_steps;
+        aug = ssp + num_steps + 1;
+
+        // initiate lifting_bufs
+        for (ui32 i = 0; i < num_steps; ++i) {
+          new (ssp + i) lifting_buf;
+          ssp[i].line = lines + i;
+        };
+        new (sig) lifting_buf;
+        sig->line = lines + num_steps;
+        new (aug) lifting_buf;
+        aug->line = lines + num_steps + 1;
+
+        const param_qcd* qp = codestream->access_qcd()->get_qcc(comp_num);
+        ui32 precision = qp->propose_precision(cdp);
 
+        // initiate storage of line_buf
         ui32 width = res_rect.siz.w + 1;
-        for (ui32 i = 0; i < num_lines; ++i)
-          lines[i].wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+        if (this->reversible)
+        {
+          if (precision <= 32)
+          {
+            for (ui32 i = 0; i < num_steps; ++i)
+              ssp[i].line->wrap(
+                allocator->post_alloc_data<si32>(width, 1), width, 1);
+            sig->line->wrap(
+              allocator->post_alloc_data<si32>(width, 1), width, 1);
+            aug->line->wrap(
+              allocator->post_alloc_data<si32>(width, 1), width, 1);
+          }
+          else
+          {
+            for (ui32 i = 0; i < num_steps; ++i)
+              ssp[i].line->wrap(
+                allocator->post_alloc_data<si64>(width, 1), width, 1);
+            sig->line->wrap(
+              allocator->post_alloc_data<si64>(width, 1), width, 1);
+            aug->line->wrap(
+              allocator->post_alloc_data<si64>(width, 1), width, 1);
+          }
+        }
+        else 
+        {
+            for (ui32 i = 0; i < num_steps; ++i)
+              ssp[i].line->wrap(
+                allocator->post_alloc_data<float>(width, 1), width, 1);
+            sig->line->wrap(
+              allocator->post_alloc_data<float>(width, 1), width, 1);
+            aug->line->wrap(
+              allocator->post_alloc_data<float>(width, 1), width, 1);
+        }
+
         cur_line = 0;
+        rows_to_produce = res_rect.siz.h;
         vert_even = (res_rect.org.y & 1) == 0;
         horz_even = (res_rect.org.x & 1) == 0;
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    line_buf* resolution::get_line()
+    {
+      // Pick the lifting buffer named by the vert_even flag: sig when the
+      // flag is set, aug when it is clear.
+      lifting_buf* chosen = aug;
+      if (vert_even)
+        chosen = sig;
+
+      // Either way the line counter advances by one and the chosen buffer
+      // is marked active before its line_buf is handed back.
+      ++cur_line;
+      chosen->active = true;
+
+      return chosen->line;
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void resolution::push_line()
     {
       if (res_num == 0)
       {
-        assert(num_bands == 1 && child_res == NULL);
-        bands[0].exchange_buf(lines + 0);//line at location 0
+        assert(child_res == NULL);
+        bands[0].exchange_buf(vert_even ? sig->line : aug->line);
         bands[0].push_line();
         return;
       }
@@ -349,255 +559,153 @@ namespace ojph {
         return;
       if (reversible)
       {
-        //vertical transform
-        assert(num_lines >= 4);
-        if (vert_even)
+        if (res_rect.siz.h > 1)
         {
-          rev_vert_wvlt_fwd_predict(lines,
-                                    cur_line > 1 ? lines + 2 : lines,
-                                    lines + 1, width);
-          rev_vert_wvlt_fwd_update(lines + 1,
-                                   cur_line > 2 ? lines + 3 : lines + 1,
-                                   lines + 2, width);
-
-          // push to horizontal transform lines[2](L) and lines[1] (H)
-          if (cur_line >= 1)
-          {
-            rev_horz_wvlt_fwd_tx(lines + 1, bands[2].get_line(),
-              bands[3].get_line(), width, horz_even);
-            bands[2].push_line();
-            bands[3].push_line();
-          }
-          if (cur_line >= 2)
-          {
-            rev_horz_wvlt_fwd_tx(lines + 2, child_res->get_line(),
-              bands[1].get_line(), width, horz_even);
-            bands[1].push_line();
-            child_res->push_line();
+          if (!vert_even && cur_line < res_rect.siz.h) {
+            vert_even = !vert_even;
+            return;
           }
-        }
 
-        if (cur_line >= res_rect.siz.h - 1)
-        { //finished, so we need to process any lines left
-          if (cur_line)
+          do
           {
-            if (vert_even)
+            //vertical transform
+            for (ui32 i = 0; i < num_steps; ++i)
             {
-              rev_vert_wvlt_fwd_update(lines + 1, lines + 1,
-                                       lines, width);
-              //push lines[0] to L
-              rev_horz_wvlt_fwd_tx(lines, child_res->get_line(),
-                bands[1].get_line(), width, horz_even);
-              bands[1].push_line();
-              child_res->push_line();
+              if (aug->active && (sig->active || ssp[i].active))
+              {
+                line_buf* dp = aug->line;
+                line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
+                line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
+                const lifting_step* s = atk->get_step(num_steps - i - 1);
+                rev_vert_step(s, sp1, sp2, dp, width, false);
+              }
+              lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
             }
-            else
-            {
-              rev_vert_wvlt_fwd_predict(lines + 1, lines + 1,
-                                        lines, width);
-              rev_vert_wvlt_fwd_update(lines,
-                                       cur_line > 1 ? lines + 2 : lines,
-                                       lines + 1, width);
-
-              // push to horizontal transform lines[1](L) and line[0] (H)
-              //line[0] to H
-              rev_horz_wvlt_fwd_tx(lines, bands[2].get_line(),
-                bands[3].get_line(), width, horz_even);
+
+            if (aug->active) {
+              rev_horz_ana(atk, bands[2].get_line(),
+                bands[3].get_line(), aug->line, width, horz_even);
               bands[2].push_line();
               bands[3].push_line();
-              //line[1] to L
-              rev_horz_wvlt_fwd_tx(lines + 1, child_res->get_line(),
-                bands[1].get_line(), width, horz_even);
+              aug->active = false;
+              --rows_to_produce;
+            }
+            if (sig->active) {
+              rev_horz_ana(atk, child_res->get_line(),
+                bands[1].get_line(), sig->line, width, horz_even);
               bands[1].push_line();
               child_res->push_line();
-            }
+              sig->active = false;
+              --rows_to_produce;
+            };
+            vert_even = !vert_even;
+          } while (cur_line >= res_rect.siz.h && rows_to_produce > 0);
+        }
+        else
+        {
+          if (vert_even) {
+            // horizontal transform
+            rev_horz_ana(atk, child_res->get_line(),
+              bands[1].get_line(), sig->line, width, horz_even);
+            bands[1].push_line();
+            child_res->push_line();
           }
           else
-          { //only one line
-            if (vert_even)
+          {
+            // vertical transform
+            if (aug->line->flags & line_buf::LFT_32BIT)
             {
-              //push to L
-              rev_horz_wvlt_fwd_tx(lines, child_res->get_line(),
-                bands[1].get_line(), width, horz_even);
-              bands[1].push_line();
-              child_res->push_line();
+              si32* sp = aug->line->i32;
+              for (ui32 i = width; i > 0; --i)
+                *sp++ <<= 1;
             }
             else
             {
-              si32* sp = lines[0].i32;
+              assert(aug->line->flags & line_buf::LFT_64BIT);
+              si64* sp = aug->line->i64;
               for (ui32 i = width; i > 0; --i)
                 *sp++ <<= 1;
-              //push to H
-              rev_horz_wvlt_fwd_tx(lines, bands[2].get_line(),
-                bands[3].get_line(), width, horz_even);
-              bands[2].push_line();
-              bands[3].push_line();
             }
+            // horizontal transform
+            rev_horz_ana(atk, bands[2].get_line(),
+              bands[3].get_line(), aug->line, width, horz_even);
+            bands[2].push_line();
+            bands[3].push_line();
           }
         }
-
-        rotate_buffers(lines, lines + 1, lines + 2, lines + 3);
-
-        ++cur_line;
-        vert_even = !vert_even;
       }
       else
       {
-        //vertical transform
-        assert(num_lines >= 6);
-        if (vert_even)
+        if (res_rect.siz.h > 1)
         {
-          irrev_vert_wvlt_step(lines + 0,
-                               cur_line > 1 ? lines + 2 : lines,
-                               lines + 1, 0, width);
-          irrev_vert_wvlt_step(lines + 1,
-                               cur_line > 2 ? lines + 3 : lines + 1,
-                               lines + 2, 1, width);
-          irrev_vert_wvlt_step(lines + 2,
-                               cur_line > 3 ? lines + 4 : lines + 2,
-                               lines + 3, 2, width);
-          irrev_vert_wvlt_step(lines + 3,
-                               cur_line > 4 ? lines + 5 : lines + 3,
-                               lines + 4, 3, width);
-
-          // push to horizontal transform lines[4](L) and lines[3] (H)
-          if (cur_line >= 3)
-          {
-            irrev_vert_wvlt_K(lines + 3, lines + 5,
-                              false, width);
-            irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
-              bands[3].get_line(), width, horz_even);
-            bands[2].push_line();
-            bands[3].push_line();
-          }
-          if (cur_line >= 4)
-          {
-            irrev_vert_wvlt_K(lines + 4, lines + 5,
-                              true, width);
-            irrev_horz_wvlt_fwd_tx(lines + 5, child_res->get_line(),
-              bands[1].get_line(), width, horz_even);
-            bands[1].push_line();
-            child_res->push_line();
+          if (!vert_even && cur_line < res_rect.siz.h) {
+            vert_even = !vert_even;
+            return;
           }
-        }
 
-        if (cur_line >= res_rect.siz.h - 1)
-        { //finished, so we need to process any left line
-          if (cur_line)
+          do
           {
-            if (vert_even)
+            //vertical transform
+            for (ui32 i = 0; i < num_steps; ++i)
             {
-              irrev_vert_wvlt_step(lines + 1, lines + 1,
-                                   lines, 1, width);
-              irrev_vert_wvlt_step(lines,
-                                   cur_line > 1 ? lines + 2 : lines,
-                                   lines + 1, 2, width);
-              irrev_vert_wvlt_step(lines + 1,
-                                   cur_line > 2 ? lines + 3 : lines + 1,
-                                   lines + 2, 3, width);
-              irrev_vert_wvlt_step(lines + 1, lines + 1,
-                                   lines, 3, width);
-              //push lines[2] to L, lines[1] to H, and lines[0] to L
-              if (cur_line >= 2)
+              if (aug->active && (sig->active || ssp[i].active))
               {
-                irrev_vert_wvlt_K(lines + 2, lines + 5,
-                                  true, width);
-                irrev_horz_wvlt_fwd_tx(lines + 5,
-                  child_res->get_line(), bands[1].get_line(),
-                  width, horz_even);
-                bands[1].push_line();
-                child_res->push_line();
+                line_buf* dp = aug->line;
+                line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
+                line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
+                const lifting_step* s = atk->get_step(num_steps - i - 1);
+                irv_vert_step(s, sp1, sp2, dp, width, false);
               }
-              irrev_vert_wvlt_K(lines + 1, lines + 5,
-                                false, width);
-              irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
-                bands[3].get_line(), width, horz_even);
-              bands[2].push_line();
-              bands[3].push_line();
-              irrev_vert_wvlt_K(lines, lines + 5,
-                                true, width);
-              irrev_horz_wvlt_fwd_tx(lines + 5, child_res->get_line(),
-                bands[1].get_line(), width, horz_even);
-              bands[1].push_line();
-              child_res->push_line();
+              lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
             }
-            else
-            {
-              irrev_vert_wvlt_step(lines + 1, lines + 1,
-                                   lines, 0, width);
-              irrev_vert_wvlt_step(lines,
-                                   cur_line > 1 ? lines + 2 : lines,
-                                   lines + 1, 1, width);
-              irrev_vert_wvlt_step(lines + 1,
-                                   cur_line > 2 ? lines + 3 : lines + 1,
-                                   lines + 2, 2, width);
-              irrev_vert_wvlt_step(lines + 2,
-                                   cur_line > 3 ? lines + 4 : lines + 2,
-                                   lines + 3, 3, width);
-
-              irrev_vert_wvlt_step(lines + 1, lines + 1,
-                                   lines, 2, width);
-              irrev_vert_wvlt_step(lines,
-                                   cur_line > 1 ? lines + 2 : lines,
-                                   lines + 1, 3, width);
-
-              //push lines[3] L, lines[2] H, lines[1] L, and lines[0] H
-              if (cur_line >= 3)
-              {
-                irrev_vert_wvlt_K(lines + 3, lines + 5,
-                                  true, width);
-                irrev_horz_wvlt_fwd_tx(lines + 5,
-                  child_res->get_line(), bands[1].get_line(),
-                  width, horz_even);
-                bands[1].push_line();
-                child_res->push_line();
-              }
-              irrev_vert_wvlt_K(lines + 2, lines + 5,
-                                false, width);
-              irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
-                bands[3].get_line(), width, horz_even);
+
+            if (aug->active) {
+              const float K = atk->get_K();
+              irv_vert_times_K(K, aug->line, width);
+
+              irv_horz_ana(atk, bands[2].get_line(),
+                bands[3].get_line(), aug->line, width, horz_even);
               bands[2].push_line();
               bands[3].push_line();
-              irrev_vert_wvlt_K(lines + 1, lines + 5,
-                                true, width);
-              irrev_horz_wvlt_fwd_tx(lines + 5, child_res->get_line(),
-                bands[1].get_line(), width, horz_even);
+              aug->active = false;
+              --rows_to_produce;
+            }
+            if (sig->active) {
+              const float K_inv = 1.0f / atk->get_K();
+              irv_vert_times_K(K_inv, sig->line, width);
+
+              irv_horz_ana(atk, child_res->get_line(),
+                bands[1].get_line(), sig->line, width, horz_even);
               bands[1].push_line();
               child_res->push_line();
-              irrev_vert_wvlt_K(lines, lines + 5,
-                                false, width);
-              irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
-                bands[3].get_line(), width, horz_even);
-              bands[2].push_line();
-              bands[3].push_line();
-            }
+              sig->active = false;
+              --rows_to_produce;
+            };
+            vert_even = !vert_even;
+          } while (cur_line >= res_rect.siz.h && rows_to_produce > 0);
+        }
+        else
+        {
+          if (vert_even) {
+            // horizontal transform
+            irv_horz_ana(atk, child_res->get_line(),
+              bands[1].get_line(), sig->line, width, horz_even);
+            bands[1].push_line();
+            child_res->push_line();
           }
           else
-          { //only one line
-            if (vert_even)
-            {
-              //push to L
-              irrev_horz_wvlt_fwd_tx(lines, child_res->get_line(),
-                bands[1].get_line(), width, horz_even);
-              bands[1].push_line();
-              child_res->push_line();
-            }
-            else
-            {
-              //push to H
-              irrev_horz_wvlt_fwd_tx(lines, bands[2].get_line(),
-                bands[3].get_line(), width, horz_even);
-              bands[2].push_line();
-              bands[3].push_line();
-            }
+          {
+            // vertical transform
+            float* sp = aug->line->f32;
+            for (ui32 i = width; i > 0; --i)
+              *sp++ *= 2.0f;
+            // horizontal transform
+            irv_horz_ana(atk, bands[2].get_line(),
+              bands[3].get_line(), aug->line, width, horz_even);
+            bands[2].push_line();
+            bands[3].push_line();
           }
         }
-
-        rotate_buffers(lines, lines + 1, lines + 2, lines + 3, lines + 4, 
-                       lines + 5);
-
-        ++cur_line;
-        vert_even = !vert_even;
       }
     }
 
@@ -606,7 +714,7 @@ namespace ojph {
     {
       if (res_num == 0)
       {
-        assert(num_bands == 1 && child_res == NULL);
+        assert(child_res == NULL);
         return bands[0].pull_line();
       }
 
@@ -615,147 +723,228 @@ namespace ojph {
 
       ui32 width = res_rect.siz.w;
       if (width == 0)
-        return lines;
-      if (reversible)
+        return NULL;
+
+      if (transform_flags & VERT_TRX)
       {
-        assert(num_lines >= 4);
-        if (res_rect.siz.h > 1)
+        if (reversible)
         {
-          do
+          if (res_rect.siz.h > 1)
           {
-            //horizontal transform
-            if (cur_line < res_rect.siz.h)
+            if (sig->active) {
+              sig->active = false;
+              return sig->line;
+            };
+            for (;;)
             {
-              if (vert_even)
-                rev_horz_wvlt_bwd_tx(lines,
-                  child_res->pull_line(), bands[1].pull_line(),
-                  width, horz_even);
-              else
-                rev_horz_wvlt_bwd_tx(lines,
-                  bands[2].pull_line(), bands[3].pull_line(),
-                  width, horz_even);
-            }
+              //horizontal transform
+              if (cur_line < res_rect.siz.h)
+              {
+                if (vert_even) { // even
+                  if (transform_flags & HORZ_TRX)
+                    rev_horz_syn(atk, aug->line, child_res->pull_line(), 
+                      bands[1].pull_line(), width, horz_even);
+                  else
+                    memcpy(aug->line->p, child_res->pull_line()->p,
+                      (size_t)width 
+                      * (aug->line->flags & line_buf::LFT_SIZE_MASK));
+                  aug->active = true;
+                  vert_even = !vert_even;
+                  ++cur_line;
+                  continue;
+                }
+                else {
+                  if (transform_flags & HORZ_TRX)
+                    rev_horz_syn(atk, sig->line, bands[2].pull_line(), 
+                      bands[3].pull_line(), width, horz_even);
+                  else
+                    memcpy(sig->line->p, bands[2].pull_line()->p,
+                      (size_t)width 
+                      * (sig->line->flags & line_buf::LFT_SIZE_MASK));
+                  sig->active = true;
+                  vert_even = !vert_even;
+                  ++cur_line;
+                }
+              }
 
-            //vertical transform
-            if (!vert_even)
-            {
-              rev_vert_wvlt_bwd_update(
-                cur_line > 1 ? lines + 2 : lines,
-                cur_line < res_rect.siz.h ? lines : lines + 2,
-                lines + 1, width);
-              rev_vert_wvlt_bwd_predict(
-                cur_line > 2 ? lines + 3 : lines + 1,
-                cur_line < res_rect.siz.h + 1 ? lines + 1 : lines + 3,
-                lines + 2, width);
-            }
+              //vertical transform
+              for (ui32 i = 0; i < num_steps; ++i)
+              {
+                if (aug->active && (sig->active || ssp[i].active))
+                {
+                  line_buf* dp = aug->line;
+                  line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
+                  line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
+                  const lifting_step* s = atk->get_step(i);
+                  rev_vert_step(s, sp1, sp2, dp, width, true);
+                }
+                lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
+              }
 
-            vert_even = !vert_even;
-            rotate_buffers(lines, lines + 1, lines + 2, lines + 3);
-            ++cur_line;
-          } while (cur_line < 3);
-          memcpy(lines[0].i32, lines[3].i32, res_rect.siz.w * sizeof(si32));
-          return lines;
-        }
-        else if (res_rect.siz.h == 1)
-        {
-          if (vert_even)
-          {
-            rev_horz_wvlt_bwd_tx(lines, child_res->pull_line(),
-              bands[1].pull_line(), width, horz_even);
+              if (aug->active) {
+                aug->active = false;
+                return aug->line;
+              }
+              if (sig->active) {
+                sig->active = false;
+                return sig->line;
+              };
+            }
           }
           else
           {
-            rev_horz_wvlt_bwd_tx(lines, bands[2].pull_line(),
-              bands[3].pull_line(), width, horz_even);
-            if (width)
+            if (vert_even) {
+              if (transform_flags & HORZ_TRX)
+                rev_horz_syn(atk, aug->line, child_res->pull_line(),
+                  bands[1].pull_line(), width, horz_even);
+              else
+                memcpy(aug->line->p, child_res->pull_line()->p,
+                  (size_t)width 
+                  * (aug->line->flags & line_buf::LFT_SIZE_MASK));
+            }
+            else
             {
-              si32* sp = lines[0].i32;
-              for (ui32 i = width; i > 0; --i)
-                *sp++ >>= 1;
+              if (transform_flags & HORZ_TRX)
+                rev_horz_syn(atk, aug->line, bands[2].pull_line(),
+                  bands[3].pull_line(), width, horz_even);
+              else
+                memcpy(aug->line->p, bands[2].pull_line()->p,
+                  (size_t)width 
+                  * (aug->line->flags & line_buf::LFT_SIZE_MASK));
+              if (aug->line->flags & line_buf::LFT_32BIT)
+              {
+                si32* sp = aug->line->i32;                
+                for (ui32 i = width; i > 0; --i)
+                  *sp++ >>= 1;
+              }
+              else
+              {
+                assert(aug->line->flags & line_buf::LFT_64BIT);
+                si64* sp = aug->line->i64;
+                for (ui32 i = width; i > 0; --i)
+                  *sp++ >>= 1;
+              }
             }
+            return aug->line;
           }
-          return lines;
         }
         else
-          return lines;
-      }
-      else
-      {
-        assert(num_lines >= 6);
-        if (res_rect.siz.h > 1)
         {
-          do
+          if (res_rect.siz.h > 1)
           {
-            //horizontal transform
-            if (cur_line < res_rect.siz.h)
+            if (sig->active) {
+              sig->active = false;
+              return sig->line;
+            };
+            for (;;)
             {
-              if (vert_even)
+              //horizontal transform
+              if (cur_line < res_rect.siz.h)
               {
-                irrev_horz_wvlt_bwd_tx(lines,
-                  child_res->pull_line(), bands[1].pull_line(),
-                  width, horz_even);
-                irrev_vert_wvlt_K(lines, lines, false, width);
+                if (vert_even) { // even
+                  if (transform_flags & HORZ_TRX)
+                    irv_horz_syn(atk, aug->line, child_res->pull_line(), 
+                      bands[1].pull_line(), width, horz_even);
+                  else 
+                    memcpy(aug->line->f32, child_res->pull_line()->f32,
+                      width * sizeof(float));
+                  aug->active = true;
+                  vert_even = !vert_even;
+                  ++cur_line;
+
+                  const float K = atk->get_K();
+                  irv_vert_times_K(K, aug->line, width);
+
+                  continue;
+                }
+                else {
+                  if (transform_flags & HORZ_TRX)
+                    irv_horz_syn(atk, sig->line, bands[2].pull_line(), 
+                      bands[3].pull_line(), width, horz_even);
+                  else
+                    memcpy(sig->line->f32, bands[2].pull_line()->f32,
+                      width * sizeof(float));
+                  sig->active = true;
+                  vert_even = !vert_even;
+                  ++cur_line;
+
+                  const float K_inv = 1.0f / atk->get_K();
+                  irv_vert_times_K(K_inv, sig->line, width);
+                }
               }
-              else
+
+              //vertical transform
+              for (ui32 i = 0; i < num_steps; ++i)
               {
-                irrev_horz_wvlt_bwd_tx(lines,
-                  bands[2].pull_line(), bands[3].pull_line(),
-                  width, horz_even);
-                irrev_vert_wvlt_K(lines, lines, true, width);
+                if (aug->active && (sig->active || ssp[i].active))
+                {
+                  line_buf* dp = aug->line;
+                  line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
+                  line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
+                  const lifting_step* s = atk->get_step(i);
+                  irv_vert_step(s, sp1, sp2, dp, width, true);
+                }
+                lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
               }
-            }
 
-            //vertical transform
-            if (!vert_even)
-            {
-              irrev_vert_wvlt_step(
-                cur_line > 1 ? lines + 2 : lines,
-                cur_line < res_rect.siz.h ? lines : lines + 2,
-                lines + 1, 7, width);
-              irrev_vert_wvlt_step(
-                cur_line > 2 ? lines + 3 : lines + 1,
-                cur_line < res_rect.siz.h + 1 ? lines + 1 : lines + 3,
-                lines + 2, 6, width);
-              irrev_vert_wvlt_step(
-                cur_line > 3 ? lines + 4 : lines + 2,
-                cur_line < res_rect.siz.h + 2 ? lines + 2 : lines + 4,
-                lines + 3, 5, width);
-              irrev_vert_wvlt_step(
-                cur_line > 4 ? lines + 5 : lines + 3,
-                cur_line < res_rect.siz.h + 3 ? lines + 3 : lines + 5,
-                lines + 4, 4, width);
+              if (aug->active) {
+                aug->active = false;
+                return aug->line;
+              }
+              if (sig->active) {
+                sig->active = false;
+                return sig->line;
+              };
             }
-
-            vert_even = !vert_even;
-            rotate_buffers(lines, lines + 1, lines + 2, lines + 3, lines + 4, 
-                           lines + 5);
-            ++cur_line;
-          } while (cur_line < 5);
-          memcpy(lines[0].f32, lines[5].f32, res_rect.siz.w * sizeof(float));
-          return lines;
-        }
-        else if (res_rect.siz.h == 1)
-        {
-          if (vert_even)
-          {
-            irrev_horz_wvlt_bwd_tx(lines, child_res->pull_line(),
-              bands[1].pull_line(), width, horz_even);
           }
           else
           {
-            irrev_horz_wvlt_bwd_tx(lines, bands[2].pull_line(),
-              bands[3].pull_line(), width, horz_even);
-            if (width)
+            if (vert_even) {
+              if (transform_flags & HORZ_TRX)
+                irv_horz_syn(atk, aug->line, child_res->pull_line(),
+                  bands[1].pull_line(), width, horz_even);
+              else
+                memcpy(aug->line->f32, child_res->pull_line()->f32,
+                  width * sizeof(float));
+            }
+            else
             {
-              float* sp = lines[0].f32;
+              if (transform_flags & HORZ_TRX)
+                irv_horz_syn(atk, aug->line, bands[2].pull_line(),
+                  bands[3].pull_line(), width, horz_even);
+             else
+                memcpy(aug->line->f32, bands[2].pull_line()->f32,
+                  width * sizeof(float));
+              float* sp = aug->line->f32;
               for (ui32 i = width; i > 0; --i)
                 *sp++ *= 0.5f;
             }
+            return aug->line;
           }
-          return lines;
+        }
+      }
+      else
+      { 
+        if (reversible)
+        {
+          if (transform_flags & HORZ_TRX)
+            rev_horz_syn(atk, aug->line, child_res->pull_line(),
+              bands[1].pull_line(), width, horz_even);
+          else
+            memcpy(aug->line->p, child_res->pull_line()->p,
+              (size_t)width * (aug->line->flags & line_buf::LFT_SIZE_MASK));
+          return aug->line;
         }
         else
-          return lines;
+        {
+          if (transform_flags & HORZ_TRX)
+            irv_horz_syn(atk, aug->line, child_res->pull_line(),
+              bands[1].pull_line(), width, horz_even);
+          else
+            memcpy(aug->line->f32, child_res->pull_line()->f32,
+              width * sizeof(float));
+          return aug->line;
+        }
       }
     }
 
@@ -850,13 +1039,11 @@ namespace ojph {
     {
       if (this->res_num == resolution_num)
         return get_num_bytes();
-      else {
-        if (child_res)
-          return child_res->get_num_bytes(resolution_num);
-        else
-          return 0;
+      if (resolution_num < this->res_num) {
+        assert(child_res);
+        return child_res->get_num_bytes(resolution_num);
       }
-
+      return 0;
     }
   }
 }
\ No newline at end of file
diff --git a/src/core/codestream/ojph_resolution.h b/src/core/codestream/ojph_resolution.h
index e110811b..61564557 100644
--- a/src/core/codestream/ojph_resolution.h
+++ b/src/core/codestream/ojph_resolution.h
@@ -45,7 +45,7 @@ namespace ojph {
 
   ////////////////////////////////////////////////////////////////////////////
   //defined elsewhere
-  struct line_buf;
+  class line_buf;
   class mem_elastic_allocator;
   class codestream;
 
@@ -61,21 +61,28 @@ namespace ojph {
     class resolution
     {
     public:
+      enum : ui32 {
+        HORZ_TRX = 0x01,   // horizontal transform
+        VERT_TRX = 0x02,   // vertical transform
+      };
 
     public:
       static void pre_alloc(codestream *codestream, const rect& res_rect,
-                            const rect& recon_res_rect, ui32 res_num);
+                            const rect& recon_res_rect, 
+                            ui32 comp_num, ui32 res_num);
       void finalize_alloc(codestream *codestream, const rect& res_rect,
                           const rect& recon_res_rect, ui32 comp_num,
-                          ui32 res_num, point comp_downsamp,
-                          tile_comp *parent_tile_comp,
+                          ui32 res_num, point comp_downsamp, 
+                          point res_downsamp, tile_comp *parent_tile_comp,
                           resolution *parent_res);
 
-      line_buf* get_line() { return lines + 0; }
+      line_buf* get_line();
       void push_line();
       line_buf* pull_line();
       rect get_rect() { return res_rect; }
       ui32 get_comp_num() { return comp_num; }
+      bool has_horz_transform() { return (transform_flags & HORZ_TRX) != 0; }
+      bool has_vert_transform() { return (transform_flags & VERT_TRX) != 0; }
 
       ui32 prepare_precinct();
       void write_precincts(outfile_base *file);
@@ -90,14 +97,16 @@ namespace ojph {
 
     private:
       bool reversible, skipped_res_for_read, skipped_res_for_recon;
-      ui32 num_lines;
-      ui32 num_bands, res_num;
+      ui32 num_steps;
+      ui32 res_num;
       ui32 comp_num;
       ui32 num_bytes; // number of bytes in this resolution 
                       // used for tilepart length
       point comp_downsamp;
-      rect res_rect;
-      line_buf *lines;
+      rect res_rect;                             // resolution rectangle
+      line_buf* lines;                           // used to store lines
+      lifting_buf *ssp;                          // step state pointer
+      lifting_buf *aug, *sig;
       subband *bands;
       tile_comp *parent_comp;
       resolution *parent_res, *child_res;
@@ -109,8 +118,11 @@ namespace ojph {
       int tag_tree_size;
       ui32 level_index[20]; //more than enough
       point cur_precinct_loc; //used for progressing spatial modes (2, 3, 4)
+      const param_atk* atk;
+      ui32 transform_flags;
       //wavelet machinery
       ui32 cur_line;
+      ui32 rows_to_produce;
       bool vert_even, horz_even;
       mem_elastic_allocator *elastic;
     };
diff --git a/src/core/codestream/ojph_subband.cpp b/src/core/codestream/ojph_subband.cpp
index b41294e8..b712f200 100644
--- a/src/core/codestream/ojph_subband.cpp
+++ b/src/core/codestream/ojph_subband.cpp
@@ -55,7 +55,7 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void subband::pre_alloc(codestream *codestream, const rect &band_rect,
-                            ui32 res_num)
+                            ui32 comp_num, ui32 res_num, ui32 transform_flags)
     {
       mem_fixed_allocator* allocator = codestream->get_allocator();
 
@@ -63,12 +63,15 @@ namespace ojph {
       if (empty)
         return;
 
-      const param_cod* cdp = codestream->get_cod();
+      const param_cod* cdp = codestream->get_coc(comp_num);
       size log_cb = cdp->get_log_block_dims();
       size log_PP = cdp->get_log_precinct_size(res_num);
 
-      ui32 xcb_prime = ojph_min(log_cb.w, log_PP.w - (res_num?1:0));
-      ui32 ycb_prime = ojph_min(log_cb.h, log_PP.h - (res_num?1:0));
+      ui32 x_off = ((transform_flags & resolution::HORZ_TRX) ? 1 : 0);
+      ui32 y_off = ((transform_flags & resolution::VERT_TRX) ? 1 : 0);
+
+      ui32 xcb_prime = ojph_min(log_cb.w, log_PP.w - x_off);
+      ui32 ycb_prime = ojph_min(log_cb.h, log_PP.h - y_off);
 
       size nominal(1 << xcb_prime, 1 << ycb_prime);
 
@@ -85,16 +88,29 @@ namespace ojph {
 
       allocator->pre_alloc_obj<codeblock>(num_blocks.w);
       //allocate codeblock headers
-      allocator->pre_alloc_obj<coded_cb_header>(num_blocks.area());
+      allocator->pre_alloc_obj<coded_cb_header>((size_t)num_blocks.area());
+
+      const param_qcd* qp = codestream->access_qcd()->get_qcc(comp_num);
+      ui32 precision = qp->propose_precision(cdp);
+      const param_atk* atk = cdp->access_atk();
+      bool reversible = atk->is_reversible();
 
       for (ui32 i = 0; i < num_blocks.w; ++i)
-        codeblock::pre_alloc(codestream, nominal);
+        codeblock::pre_alloc(codestream, nominal, precision);
 
       //allocate lines
       allocator->pre_alloc_obj<line_buf>(1);
       //allocate line_buf
       ui32 width = band_rect.siz.w + 1;
-      allocator->pre_alloc_data<si32>(width, 1);
+      if (reversible)
+      {
+        if (precision <= 32)
+          allocator->pre_alloc_data<si32>(width, 1);
+        else
+          allocator->pre_alloc_data<si64>(width, 1);
+      }
+      else
+        allocator->pre_alloc_data<float>(width, 1);
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -111,28 +127,41 @@ namespace ojph {
       this->band_rect = band_rect;
       this->parent = res;
 
-      const param_cod* cdp = codestream->get_cod();
-      this->reversible = cdp->is_reversible();
+      const param_cod* cdp = codestream->get_coc(parent->get_comp_num());
+      this->reversible = cdp->access_atk()->is_reversible();
       size log_cb = cdp->get_log_block_dims();
       log_PP = cdp->get_log_precinct_size(res_num);
 
-      xcb_prime = ojph_min(log_cb.w, log_PP.w - (res_num?1:0));
-      ycb_prime = ojph_min(log_cb.h, log_PP.h - (res_num?1:0));
+      ui32 x_off = ((parent->has_horz_transform()) ? 1 : 0);
+      ui32 y_off = ((parent->has_vert_transform()) ? 1 : 0);
+
+      xcb_prime = ojph_min(log_cb.w, log_PP.w - x_off);
+      ycb_prime = ojph_min(log_cb.h, log_PP.h - y_off);
 
       size nominal(1 << xcb_prime, 1 << ycb_prime);
 
       cur_cb_row = 0;
       cur_line = 0;
       cur_cb_height = 0;
-      param_qcd *qcd = codestream->access_qcd(parent->get_comp_num());
-      this->K_max = qcd->get_Kmax(this->res_num, band_num);
+      const param_dfs* dfs = NULL;
+      if (cdp->is_dfs_defined()) {
+        dfs = codestream->access_dfs();
+        if (dfs != NULL)
+          dfs = dfs->get_dfs(cdp->get_dfs_index());
+      }
+      ui32 comp_num = parent->get_comp_num();
+      const param_qcd* qcd = codestream->access_qcd()->get_qcc(comp_num);
+      ui32 num_decomps = cdp->get_num_decompositions();
+      this->K_max = qcd->get_Kmax(dfs, num_decomps, this->res_num, band_num);
       if (!reversible)
       {
-        float d = qcd->irrev_get_delta(res_num, subband_num);
+        float d = 
+          qcd->get_irrev_delta(dfs, num_decomps, res_num, subband_num);
         d /= (float)(1u << (31 - this->K_max));
         delta = d;
         delta_inv = (1.0f/d);
       }
+      ui32 precision = qcd->propose_precision(cdp);
 
       this->empty = ((band_rect.siz.w == 0) || (band_rect.siz.h == 0));
       if (this->empty)
@@ -152,8 +181,8 @@ namespace ojph {
       blocks = allocator->post_alloc_obj<codeblock>(num_blocks.w);
       //allocate codeblock headers
       coded_cb_header *cp = coded_cbs =
-        allocator->post_alloc_obj<coded_cb_header>(num_blocks.area());
-      memset(coded_cbs, 0, sizeof(coded_cb_header) * num_blocks.area());
+        allocator->post_alloc_obj<coded_cb_header>((size_t)num_blocks.area());
+      memset(coded_cbs, 0, sizeof(coded_cb_header) * (size_t)num_blocks.area());
       for (int i = (int)num_blocks.area(); i > 0; --i, ++cp)
         cp->Kmax = K_max;
 
@@ -170,7 +199,8 @@ namespace ojph {
         ui32 cbx1 = ojph_min(tbx1, x_lower_bound + (i + 1) * nominal.w);
         cb_size.w = cbx1 - cbx0;
         blocks[i].finalize_alloc(codestream, this, nominal, cb_size,
-                                  coded_cbs + i, K_max, line_offset);
+                                 coded_cbs + i, K_max, line_offset, 
+                                 precision, comp_num);
         line_offset += cb_size.w;
       }
 
@@ -178,7 +208,15 @@ namespace ojph {
       lines = allocator->post_alloc_obj<line_buf>(1);
       //allocate line_buf
       ui32 width = band_rect.siz.w + 1;
-      lines->wrap(allocator->post_alloc_data<si32>(width,1),width,1);
+      if (reversible)
+      {
+        if (precision <= 32)      
+          lines->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+        else
+          lines->wrap(allocator->post_alloc_data<si64>(width, 1), width, 1);
+      }
+      else
+        lines->wrap(allocator->post_alloc_data<float>(width, 1), width, 1);
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -197,14 +235,16 @@ namespace ojph {
       ui32 pc_lft = (res_rect.org.x >> log_PP.w) << log_PP.w;
       ui32 pc_top = (res_rect.org.y >> log_PP.h) << log_PP.h;
 
-      ui32 pcx0, pcx1, pcy0, pcy1, shift = (band_num != 0 ? 1 : 0);
+      ui32 pcx0, pcx1, pcy0, pcy1;
+      ui32 x_shift = parent->has_horz_transform() ? 1 : 0;
+      ui32 y_shift = parent->has_vert_transform() ? 1 : 0;
       ui32 yb, xb, coly = 0, colx = 0;
       for (ui32 y = 0; y < num_precincts.h; ++y)
       {
         pcy0 = ojph_max(try0, pc_top + (y << log_PP.h));
         pcy1 = ojph_min(try1, pc_top + ((y + 1) << log_PP.h));
-        pcy0 = (pcy0 - (band_num >> 1) + (1<<shift) - 1) >> shift;
-        pcy1 = (pcy1 - (band_num >> 1) + (1<<shift) - 1) >> shift;
+        pcy0 = (pcy0 - (band_num >> 1) + (1 << y_shift) - 1) >> y_shift;
+        pcy1 = (pcy1 - (band_num >> 1) + (1 << y_shift) - 1) >> y_shift;
 
         precinct *p = precincts + y * num_precincts.w;
         yb = ((pcy1 + (1<<ycb_prime) - 1) >> ycb_prime);
@@ -215,8 +255,8 @@ namespace ojph {
         {
           pcx0 = ojph_max(trx0, pc_lft + (x << log_PP.w));
           pcx1 = ojph_min(trx1, pc_lft + ((x + 1) << log_PP.w));
-          pcx0 = (pcx0 - (band_num & 1) + (1<<shift) - 1) >> shift;
-          pcx1 = (pcx1 - (band_num & 1) + (1<<shift) - 1) >> shift;
+          pcx0 = (pcx0 - (band_num & 1) + (1 << x_shift) - 1) >> x_shift;
+          pcx1 = (pcx1 - (band_num & 1) + (1 << x_shift) - 1) >> x_shift;
 
           rect *bp = p->cb_idxs + band_num;
           xb = ((pcx1 + (1<<xcb_prime) - 1) >> xcb_prime);
@@ -240,10 +280,11 @@ namespace ojph {
       if (empty)
         return;
 
-      assert(l->pre_size == lines[0].pre_size && l->size == lines[0].size);
-      si32* t = lines[0].i32;
-      lines[0].i32 = l->i32;
-      l->i32 = t;
+      assert(l->pre_size == lines[0].pre_size && l->size == lines[0].size &&
+             l->flags == lines[0].flags);
+      void* p = lines[0].p;
+      lines[0].p = l->p;
+      l->p = p;
     }
 
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/codestream/ojph_subband.h b/src/core/codestream/ojph_subband.h
index 3bcc6edb..e1c291a3 100644
--- a/src/core/codestream/ojph_subband.h
+++ b/src/core/codestream/ojph_subband.h
@@ -45,7 +45,7 @@ namespace ojph {
 
   ////////////////////////////////////////////////////////////////////////////
   //defined elsewhere
-  struct line_buf;
+  class line_buf;
   class mem_elastic_allocator;
   class codestream;
 
@@ -63,8 +63,25 @@ namespace ojph {
     {
       friend struct precinct;
     public:
+      subband() {  // starts out as an unused (empty) band
+        res_num = band_num = 0;
+        reversible = false;
+        empty = true;             // unused until finalize_alloc() sets it
+        lines = NULL;
+        parent = NULL;
+        blocks = NULL;
+        xcb_prime = ycb_prime = 0;
+        cur_cb_row = 0;
+        cur_line = 0;
+        cur_cb_height = 0;
+        delta = delta_inv = 0.0f;
+        K_max = 0;
+        coded_cbs = NULL;
+        elastic = NULL;
+      }
+
       static void pre_alloc(codestream *codestream, const rect& band_rect,
-                            ui32 res_num);
+                            ui32 comp_num, ui32 res_num, ui32 transform_flags);
       void finalize_alloc(codestream *codestream, const rect& band_rect,
                           resolution* res, ui32 res_num, ui32 subband_num);
 
@@ -74,13 +91,17 @@ namespace ojph {
 
       void get_cb_indices(const size& num_precincts, precinct *precincts);
       float get_delta() { return delta; }
+      bool exists() const { return !empty; }  // false if empty or unused
 
       line_buf* pull_line();
+      resolution* get_parent() { return parent; }  // set in finalize_alloc
+      const resolution* get_parent() const { return parent; }  // const view
 
     private:
+      bool empty;                  // true if the subband has no pixels or
+                                   // the subband is NOT USED
       ui32 res_num, band_num;
       bool reversible;
-      bool empty;
       rect band_rect;
       line_buf *lines;
       resolution* parent;
diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp
index 9fecd03b..1543c807 100644
--- a/src/core/codestream/ojph_tile.cpp
+++ b/src/core/codestream/ojph_tile.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -38,6 +38,7 @@
 
 #include <climits>
 #include <cmath>
+#include <exception>
 
 #include "ojph_mem.h"
 #include "ojph_params.h"
@@ -67,19 +68,38 @@ namespace ojph {
       allocator->pre_alloc_obj<ui32>(num_comps); //for line_offsets
       allocator->pre_alloc_obj<ui32>(num_comps); //for num_bits
       allocator->pre_alloc_obj<bool>(num_comps); //for is_signed
+      allocator->pre_alloc_obj<bool>(num_comps); //for reversible
+      allocator->pre_alloc_obj<ui8>(num_comps);  //for nlt_type3
       allocator->pre_alloc_obj<ui32>(num_comps); //for cur_line
 
-      ui32 tilepart_div = codestream->get_tilepart_div();
-      num_tileparts = 1; //for num_rc_bytes
-      // this code is not ideal, since the number of decompositions can be 
-      // different for different components
-      if (tilepart_div & OJPH_TILEPART_COMPONENTS)
-        num_tileparts *= num_comps;
-      if (tilepart_div & OJPH_TILEPART_RESOLUTIONS)
-        num_tileparts *= codestream->get_cod()->get_num_decompositions() + 1;
-      if (num_tileparts > 255)
-        OJPH_ERROR(0x000300D1, "Trying to create %d tileparts; a tile cannot "
-          "have more than 255 tile parts.", num_tileparts);
+      {
+        ui32 tilepart_div = codestream->get_tilepart_div();
+        ui32 t = tilepart_div & OJPH_TILEPART_MASK;
+        if (t == OJPH_TILEPART_NO_DIVISIONS)
+          num_tileparts = 1; //for num_rc_bytes
+        else if (t == OJPH_TILEPART_COMPONENTS)
+          num_tileparts = num_comps;
+        else if (t == OJPH_TILEPART_RESOLUTIONS)
+        {
+          ui32 max_decs = 0;
+          for (ui32 c = 0; c < num_comps; ++c) {
+            ui32 s = codestream->get_coc(c)->get_num_decompositions();
+            max_decs = ojph_max(max_decs, s);
+          }
+          num_tileparts = 1 + max_decs;
+        }
+        else if (t == (OJPH_TILEPART_COMPONENTS | OJPH_TILEPART_RESOLUTIONS))
+        {
+          num_tileparts = 0;
+          for (ui32 c = 0; c < num_comps; ++c) {
+            ui32 s = codestream->get_coc(c)->get_num_decompositions();
+            num_tileparts += s + 1;
+          }
+        }
+        if (num_tileparts > 255)
+          OJPH_ERROR(0x000300D1, "Trying to create %d tileparts; a tile "
+            "cannot have more than 255 tile parts.", num_tileparts);
+      }
 
       ui32 tx0 = tile_rect.org.x;
       ui32 ty0 = tile_rect.org.y;
@@ -116,23 +136,42 @@ namespace ojph {
         recon_comp_rect.siz.w = recon_tcx1 - recon_tcx0;
         recon_comp_rect.siz.h = recon_tcy1 - recon_tcy0;
 
-        tile_comp::pre_alloc(codestream, comp_rect, recon_comp_rect);
+        tile_comp::pre_alloc(codestream, i, comp_rect, recon_comp_rect);
         width = ojph_max(width, recon_comp_rect.siz.w);
       }
 
       //allocate lines
-      if (codestream->get_cod()->is_employing_color_transform())
+      const param_cod* cdp = codestream->get_cod();
+      if (cdp->is_employing_color_transform())
       {
+        bool reversible[3];
+        for (ui32 i = 0; i < 3; ++i)
+          reversible[i] = codestream->get_coc(i)->is_reversible();
+        if (reversible[0] != reversible[1] || reversible[1] != reversible[2])
+          OJPH_ERROR(0x000300A2, "When the colour transform is employed. "
+            "all colour components must undergo either reversible or "
+            "irreversible wavelet transform; if not, then it is not clear "
+            "what colour transform should be used (reversible or "
+            "irreversible).  Here we found that the first three colour "
+            "components uses %s, %s, and %s transforms, respectively.",
+            reversible[0] ? "reversible" : "irreversible",
+            reversible[1] ? "reversible" : "irreversible",
+            reversible[2] ? "reversible" : "irreversible");
+
         allocator->pre_alloc_obj<line_buf>(3);
-        for (int i = 0; i < 3; ++i)
-          allocator->pre_alloc_data<si32>(width, 0);
+        if (reversible[0])
+          for (int i = 0; i < 3; ++i)
+            allocator->pre_alloc_data<si32>(width, 0);
+        else
+          for (int i = 0; i < 3; ++i)
+            allocator->pre_alloc_data<float>(width, 0);
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
     void tile::finalize_alloc(codestream *codestream, const rect& tile_rect,
-                              const rect& recon_tile_rect, ui32 tile_idx, 
-                              ui32 offset, ui32 &num_tileparts)
+                              ui32 tile_idx, ui32& offset,
+                              ui32 &num_tileparts)
     {
       //this->parent = codestream;
       mem_fixed_allocator* allocator = codestream->get_allocator();
@@ -142,6 +181,7 @@ namespace ojph {
 
       //allocate tiles_comp
       const param_siz *szp = codestream->get_siz();
+      const param_nlt *nlp = codestream->get_nlt();
 
       this->num_bytes = 0;
       num_comps = szp->get_num_components();
@@ -152,48 +192,69 @@ namespace ojph {
       line_offsets = allocator->post_alloc_obj<ui32>(num_comps);
       num_bits = allocator->post_alloc_obj<ui32>(num_comps);
       is_signed = allocator->post_alloc_obj<bool>(num_comps);
+      reversible = allocator->post_alloc_obj<bool>(num_comps);
+      nlt_type3 = allocator->post_alloc_obj<ui8>(num_comps);
       cur_line = allocator->post_alloc_obj<ui32>(num_comps);
 
       profile = codestream->get_profile();
       tilepart_div = codestream->get_tilepart_div();
       need_tlm = codestream->is_tlm_needed();
-      num_tileparts = 1;
-      // this code is not ideal, since the number of decompositions can be 
-      // different for different components
-      if (tilepart_div & OJPH_TILEPART_COMPONENTS)
-        num_tileparts *= num_comps;
-      if (tilepart_div & OJPH_TILEPART_RESOLUTIONS)
-        num_tileparts *= codestream->get_cod()->get_num_decompositions() + 1;
+      {
+        ui32 tilepart_div = codestream->get_tilepart_div();
+        ui32 t = tilepart_div & OJPH_TILEPART_MASK;
+        if (t == OJPH_TILEPART_NO_DIVISIONS)
+          num_tileparts = 1; //for num_rc_bytes
+        else if (t == OJPH_TILEPART_COMPONENTS)
+          num_tileparts = num_comps;
+        else if (t == OJPH_TILEPART_RESOLUTIONS)
+        {
+          ui32 max_decs = 0;
+          for (ui32 c = 0; c < num_comps; ++c) {
+            ui32 s = codestream->get_coc(c)->get_num_decompositions();
+            max_decs = ojph_max(max_decs, s);
+          }
+          num_tileparts = 1 + max_decs;
+        }
+        else if (t == (OJPH_TILEPART_COMPONENTS | OJPH_TILEPART_RESOLUTIONS))
+        {
+          num_tileparts = 0;
+          for (ui32 c = 0; c < num_comps; ++c) {
+            ui32 s = codestream->get_coc(c)->get_num_decompositions();
+            num_tileparts += s + 1;
+          }
+        }
+        if (num_tileparts > 255)
+          OJPH_ERROR(0x000300D1, "Trying to create %d tileparts; a tile "
+          "cannot have more than 255 tile parts.", num_tileparts);
+      }
 
       this->resilient = codestream->is_resilient();
       this->tile_rect = tile_rect;
-      this->recon_tile_rect = recon_tile_rect;
 
       ui32 tx0 = tile_rect.org.x;
       ui32 ty0 = tile_rect.org.y;
       ui32 tx1 = tile_rect.org.x + tile_rect.siz.w;
       ui32 ty1 = tile_rect.org.y + tile_rect.siz.h;
-      ui32 recon_tx0 = recon_tile_rect.org.x;
-      ui32 recon_ty0 = recon_tile_rect.org.y;
-      ui32 recon_tx1 = recon_tile_rect.org.x + recon_tile_rect.siz.w;
-      ui32 recon_ty1 = recon_tile_rect.org.y + recon_tile_rect.siz.h;
 
       ui32 width = 0;
       for (ui32 i = 0; i < num_comps; ++i)
       {
+        ui8 bd; bool is; // used for nlt_type3
+
         point downsamp = szp->get_downsampling(i);
+        point recon_downsamp = szp->get_recon_downsampling(i);
 
         ui32 tcx0 = ojph_div_ceil(tx0, downsamp.x);
         ui32 tcy0 = ojph_div_ceil(ty0, downsamp.y);
         ui32 tcx1 = ojph_div_ceil(tx1, downsamp.x);
         ui32 tcy1 = ojph_div_ceil(ty1, downsamp.y);
-        ui32 recon_tcx0 = ojph_div_ceil(recon_tx0, downsamp.x);
-        ui32 recon_tcy0 = ojph_div_ceil(recon_ty0, downsamp.y);
-        ui32 recon_tcx1 = ojph_div_ceil(recon_tx1, downsamp.x);
-        ui32 recon_tcy1 = ojph_div_ceil(recon_ty1, downsamp.y);
+        ui32 recon_tcx0 = ojph_div_ceil(tx0, recon_downsamp.x);
+        ui32 recon_tcy0 = ojph_div_ceil(ty0, recon_downsamp.y);
+        ui32 recon_tcx1 = ojph_div_ceil(tx1, recon_downsamp.x);
+        ui32 recon_tcy1 = ojph_div_ceil(ty1, recon_downsamp.y);
 
-        line_offsets[i] = 
-          recon_tcx0 - ojph_div_ceil(recon_tx0 - offset, downsamp.x);
+        line_offsets[i] =
+          recon_tcx0 - ojph_div_ceil(tx0 - offset, recon_downsamp.x);
         comp_rects[i].org.x = tcx0;
         comp_rects[i].org.y = tcy0;
         comp_rects[i].siz.w = tcx1 - tcx0;
@@ -203,26 +264,42 @@ namespace ojph {
         recon_comp_rects[i].siz.w = recon_tcx1 - recon_tcx0;
         recon_comp_rects[i].siz.h = recon_tcy1 - recon_tcy0;
 
-        comps[i].finalize_alloc(codestream, this, i, comp_rects[i], 
+        comps[i].finalize_alloc(codestream, this, i, comp_rects[i],
           recon_comp_rects[i]);
         width = ojph_max(width, recon_comp_rects[i].siz.w);
 
         num_bits[i] = szp->get_bit_depth(i);
         is_signed[i] = szp->is_signed(i);
+        bool result = nlp->get_nonlinear_transform(i, bd, is, nlt_type3[i]);
+        if (result == true && (bd != num_bits[i] || is != is_signed[i]))
+          OJPH_ERROR(0x000300A1, "Mismatch between Ssiz (bit_depth = %d, "
+            "is_signed = %s) from SIZ marker segment, and BDnlt "
+            "(bit_depth = %d, is_signed = %s) from NLT marker segment, "
+            "for component %d", num_bits[i],  // args in %-specifier order
+            is_signed[i] ? "True" : "False", bd, is ? "True" : "False", i);
+        if (result == false)  // lookup returned false: fall back to no NLT
+          nlt_type3[i] = param_nlt::nonlinearity::OJPH_NLT_NO_NLT;
         cur_line[i] = 0;
+        reversible[i] = codestream->get_coc(i)->is_reversible();
       }
 
+      offset += tile_rect.siz.w;
+
       //allocate lines
       const param_cod* cdp = codestream->get_cod();
-      this->reversible = cdp->is_reversible();
       this->employ_color_transform = cdp->is_employing_color_transform();
       if (this->employ_color_transform)
       {
         num_lines = 3;
         lines = allocator->post_alloc_obj<line_buf>(num_lines);
-        for (int i = 0; i < 3; ++i)
-          lines[i].wrap(
-            allocator->post_alloc_data<si32>(width,0),width,0);
+        if (reversible[0])
+          for (int i = 0; i < 3; ++i)
+            lines[i].wrap(
+              allocator->post_alloc_data<si32>(width, 0), width, 0);
+        else
+          for (int i = 0; i < 3; ++i)
+            lines[i].wrap(
+              allocator->post_alloc_data<float>(width, 0), width, 0);
       }
       else
       {
@@ -235,6 +312,9 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     bool tile::push(line_buf *line, ui32 comp_num)
     {
+      constexpr ui8 type3 =
+        param_nlt::nonlinearity::OJPH_NLT_BINARY_COMPLEMENT_NLT;
+
       assert(comp_num < num_comps);
       if (cur_line[comp_num] >= comp_rects[comp_num].siz.h)
         return false;
@@ -247,46 +327,50 @@ namespace ojph {
         assert(comp_num < num_comps);
         ui32 comp_width = comp_rects[comp_num].siz.w;
         line_buf *tc = comps[comp_num].get_line();
-        if (reversible)
+        if (reversible[comp_num])
         {
-          int shift = 1 << (num_bits[comp_num] - 1);
-          const si32 *sp = line->i32 + line_offsets[comp_num];
-          si32* dp = tc->i32;
-          if (is_signed[comp_num])
-            memcpy(dp, sp, comp_width * sizeof(si32));
-          else
-            cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width);
+          si64 shift = (si64)1 << (num_bits[comp_num] - 1);
+          if (is_signed[comp_num] && nlt_type3[comp_num] == type3)
+            rev_convert_nlt_type3(line, line_offsets[comp_num],
+              tc, 0, shift + 1, comp_width);
+          else {
+            shift = is_signed[comp_num] ? 0 : -shift;
+            rev_convert(line, line_offsets[comp_num], tc, 0,
+              shift, comp_width);
+          }
         }
         else
         {
-          float mul = 1.0f / (float)(1<<num_bits[comp_num]);
-          const si32 *sp = line->i32 + line_offsets[comp_num];
-          float *dp = tc->f32;
-          if (is_signed[comp_num])
-            cnvrt_si32_to_float(sp, dp, mul, comp_width);
+          if (nlt_type3[comp_num] == type3)
+            irv_convert_to_float_nlt_type3(line, line_offsets[comp_num],
+              tc, num_bits[comp_num], is_signed[comp_num], comp_width);
           else
-            cnvrt_si32_to_float_shftd(sp, dp, mul, comp_width);
+            irv_convert_to_float(line, line_offsets[comp_num],
+              tc, num_bits[comp_num], is_signed[comp_num], comp_width);
         }
         comps[comp_num].push_line();
       }
       else
       {
+        si64 shift = (si64)1 << (num_bits[comp_num] - 1);
         ui32 comp_width = comp_rects[comp_num].siz.w;
-        if (reversible)
+        if (reversible[comp_num])
         {
-          int shift = 1 << (num_bits[comp_num] - 1);
-          const si32 *sp = line->i32 + line_offsets[comp_num];
-          si32 *dp = lines[comp_num].i32;
-          if (is_signed[comp_num])
-            memcpy(dp, sp, comp_width * sizeof(si32));
-          else
-            cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width);
+          if (is_signed[comp_num] && nlt_type3[comp_num] == type3)
+            rev_convert_nlt_type3(line, line_offsets[comp_num],
+              lines + comp_num, 0, shift + 1, comp_width);
+          else {
+            shift = is_signed[comp_num] ? 0 : -shift;
+            rev_convert(line, line_offsets[comp_num], lines + comp_num, 0,
+              shift, comp_width);
+          }
+
           if (comp_num == 2)
           { // reversible color transform
-            rct_forward(lines[0].i32, lines[1].i32, lines[2].i32,
-                        comps[0].get_line()->i32,
-                        comps[1].get_line()->i32,
-                        comps[2].get_line()->i32, comp_width);
+            rct_forward(lines + 0, lines + 1, lines + 2,
+                        comps[0].get_line(),
+                        comps[1].get_line(),
+                        comps[2].get_line(), comp_width);
                         comps[0].push_line();
                         comps[1].push_line();
                         comps[2].push_line();
@@ -294,13 +378,14 @@ namespace ojph {
         }
         else
         {
-          float mul = 1.0f / (float)(1<<num_bits[comp_num]);
-          const si32 *sp = line->i32 + line_offsets[comp_num];
-          float *dp = lines[comp_num].f32;
-          if (is_signed[comp_num])
-            cnvrt_si32_to_float(sp, dp, mul, comp_width);
+          if (nlt_type3[comp_num] == type3)
+            irv_convert_to_float_nlt_type3(line, line_offsets[comp_num],
+              lines + comp_num, num_bits[comp_num], is_signed[comp_num],
+              comp_width);
           else
-            cnvrt_si32_to_float_shftd(sp, dp, mul, comp_width);
+            irv_convert_to_float(line, line_offsets[comp_num],
+              lines + comp_num, num_bits[comp_num], is_signed[comp_num],
+              comp_width);
           if (comp_num == 2)
           { // irreversible color transform
             ict_forward(lines[0].f32, lines[1].f32, lines[2].f32,
@@ -320,79 +405,92 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     bool tile::pull(line_buf* tgt_line, ui32 comp_num)
     {
+      constexpr ui8 type3 =
+        param_nlt::nonlinearity::OJPH_NLT_BINARY_COMPLEMENT_NLT;
+
       assert(comp_num < num_comps);
       if (cur_line[comp_num] >= recon_comp_rects[comp_num].siz.h)
         return false;
 
       cur_line[comp_num]++;
 
+      ui32 comp_width = recon_comp_rects[comp_num].siz.w;
+      if (comp_width == 0)
+        return true; // nothing to pull, but not an error
+
       if (!employ_color_transform || num_comps == 1)
       {
         line_buf *src_line = comps[comp_num].pull_line();
-        ui32 comp_width = recon_comp_rects[comp_num].siz.w;
-        if (reversible)
+        if (reversible[comp_num])
         {
-          int shift = 1 << (num_bits[comp_num] - 1);
-          const si32 *sp = src_line->i32;
-          si32* dp = tgt_line->i32 + line_offsets[comp_num];
-          if (is_signed[comp_num])
-            memcpy(dp, sp, comp_width * sizeof(si32));
-          else
-            cnvrt_si32_to_si32_shftd(sp, dp, +shift, comp_width);
+          si64 shift = (si64)1 << (num_bits[comp_num] - 1);
+          if (is_signed[comp_num] && nlt_type3[comp_num] == type3)
+            rev_convert_nlt_type3(src_line, 0, tgt_line,
+              line_offsets[comp_num], shift + 1, comp_width);
+          else {
+            shift = is_signed[comp_num] ? 0 : shift;
+            rev_convert(src_line, 0, tgt_line,
+              line_offsets[comp_num], shift, comp_width);
+          }
         }
         else
         {
-          float mul = (float)(1 << num_bits[comp_num]);
-          const float *sp = src_line->f32;
-          si32 *dp = tgt_line->i32 + line_offsets[comp_num];
-          if (is_signed[comp_num])
-            cnvrt_float_to_si32(sp, dp, mul, comp_width);
+          if (nlt_type3[comp_num] == type3)
+            irv_convert_to_integer_nlt_type3(src_line, tgt_line,
+              line_offsets[comp_num], num_bits[comp_num],
+              is_signed[comp_num], comp_width);
           else
-            cnvrt_float_to_si32_shftd(sp, dp, mul, comp_width);
+            irv_convert_to_integer(src_line, tgt_line,
+              line_offsets[comp_num], num_bits[comp_num],
+              is_signed[comp_num], comp_width);
         }
       }
       else
       {
         assert(num_comps >= 3);
-        ui32 comp_width = recon_comp_rects[comp_num].siz.w;
         if (comp_num == 0)
         {
-          if (reversible)
-            rct_backward(comps[0].pull_line()->i32, comps[1].pull_line()->i32,
-              comps[2].pull_line()->i32, lines[0].i32, lines[1].i32,
-              lines[2].i32, comp_width);
+          if (reversible[comp_num])
+            rct_backward(comps[0].pull_line(), comps[1].pull_line(),
+              comps[2].pull_line(), lines + 0, lines + 1,
+              lines + 2, comp_width);
           else
             ict_backward(comps[0].pull_line()->f32, comps[1].pull_line()->f32,
               comps[2].pull_line()->f32, lines[0].f32, lines[1].f32,
               lines[2].f32, comp_width);
         }
-        if (reversible)
+        if (reversible[comp_num])
         {
-          int shift = 1 << (num_bits[comp_num] - 1);
-          const si32 *sp;
+          si64 shift = (si64)1 << (num_bits[comp_num] - 1);
+          line_buf* src_line;
           if (comp_num < 3)
-            sp = lines[comp_num].i32;
+            src_line = lines + comp_num;
           else
-            sp = comps[comp_num].pull_line()->i32;
-          si32* dp = tgt_line->i32 + line_offsets[comp_num];
-          if (is_signed[comp_num])
-            memcpy(dp, sp, comp_width * sizeof(si32));
-          else
-            cnvrt_si32_to_si32_shftd(sp, dp, +shift, comp_width);
+            src_line = comps[comp_num].pull_line();
+          if (is_signed[comp_num] && nlt_type3[comp_num] == type3)
+            rev_convert_nlt_type3(src_line, 0, tgt_line,
+              line_offsets[comp_num], shift + 1, comp_width);
+          else {
+            shift = is_signed[comp_num] ? 0 : shift;
+            rev_convert(src_line, 0, tgt_line,
+              line_offsets[comp_num], shift, comp_width);
+          }
         }
         else
         {
-          float mul = (float)(1 << num_bits[comp_num]);
-          const float *sp;
+          line_buf* lbp;
           if (comp_num < 3)
-            sp = lines[comp_num].f32;
+            lbp = lines + comp_num;
           else
-            sp = comps[comp_num].pull_line()->f32;
-          si32 *dp = tgt_line->i32 + line_offsets[comp_num];
-          if (is_signed[comp_num])
-            cnvrt_float_to_si32(sp, dp, mul, comp_width);
+            lbp = comps[comp_num].pull_line();
+          if (nlt_type3[comp_num] == type3)
+            irv_convert_to_integer_nlt_type3(lbp, tgt_line,
+              line_offsets[comp_num], num_bits[comp_num],
+              is_signed[comp_num], comp_width);
           else
-            cnvrt_float_to_si32_shftd(sp, dp, mul, comp_width);
+            irv_convert_to_integer(lbp, tgt_line,
+              line_offsets[comp_num], num_bits[comp_num],
+              is_signed[comp_num], comp_width);
         }
       }
 
@@ -412,52 +510,52 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void tile::fill_tlm(param_tlm *tlm)
     {
-      if (tilepart_div == OJPH_TILEPART_NODIVSIONS) {
+      if (tilepart_div == OJPH_TILEPART_NO_DIVISIONS) {
         tlm->set_next_pair(sot.get_tile_index(), this->num_bytes);
       }
       else if (tilepart_div == OJPH_TILEPART_RESOLUTIONS)
-      { 
+      {
         assert(prog_order != OJPH_PO_PCRL && prog_order != OJPH_PO_CPRL);
         ui32 max_decs = 0;
         for (ui32 c = 0; c < num_comps; ++c)
           max_decs = ojph_max(max_decs, comps[c].get_num_decompositions());
-        for (ui32 r = 0; r <= max_decs; ++r) 
+        for (ui32 r = 0; r <= max_decs; ++r)
         {
           ui32 bytes = 0;
           for (ui32 c = 0; c < num_comps; ++c)
             bytes += comps[c].get_num_bytes(r);
           tlm->set_next_pair(sot.get_tile_index(), bytes);
         }
-      }      
+      }
       else if (tilepart_div == OJPH_TILEPART_COMPONENTS)
       {
         if (prog_order == OJPH_PO_LRCP || prog_order == OJPH_PO_RLCP)
-        { 
+        {
           ui32 max_decs = 0;
           for (ui32 c = 0; c < num_comps; ++c)
             max_decs = ojph_max(max_decs, comps[c].get_num_decompositions());
-          for (ui32 r = 0; r <= max_decs; ++r) 
+          for (ui32 r = 0; r <= max_decs; ++r)
             for (ui32 c = 0; c < num_comps; ++c)
               if (r <= comps[c].get_num_decompositions())
-                tlm->set_next_pair(sot.get_tile_index(), 
+                tlm->set_next_pair(sot.get_tile_index(),
                                    comps[c].get_num_bytes(r));
         }
         else if (prog_order == OJPH_PO_CPRL)
           for (ui32 c = 0; c < num_comps; ++c)
             tlm->set_next_pair(sot.get_tile_index(), comps[c].get_num_bytes());
-        else 
+        else
           assert(0); // should not be here
       }
-      else 
+      else
       {
         assert(prog_order == OJPH_PO_LRCP || prog_order == OJPH_PO_RLCP);
         ui32 max_decs = 0;
         for (ui32 c = 0; c < num_comps; ++c)
           max_decs = ojph_max(max_decs, comps[c].get_num_decompositions());
-        for (ui32 r = 0; r <= max_decs; ++r) 
+        for (ui32 r = 0; r <= max_decs; ++r)
           for (ui32 c = 0; c < num_comps; ++c)
             if (r <= comps[c].get_num_decompositions())
-              tlm->set_next_pair(sot.get_tile_index(), 
+              tlm->set_next_pair(sot.get_tile_index(),
                                  comps[c].get_num_bytes(r));
       }
     }
@@ -471,7 +569,7 @@ namespace ojph {
         max_decompositions = ojph_max(max_decompositions,
           comps[c].get_num_decompositions());
 
-      if (tilepart_div == OJPH_TILEPART_NODIVSIONS)
+      if (tilepart_div == OJPH_TILEPART_NO_DIVISIONS)
       {
         //write tile header
         if (!sot.write(file, this->num_bytes))
@@ -487,15 +585,15 @@ namespace ojph {
       //sequence the writing of precincts according to progression order
       if (prog_order == OJPH_PO_LRCP || prog_order == OJPH_PO_RLCP)
       {
-        if (tilepart_div == OJPH_TILEPART_NODIVSIONS) 
+        if (tilepart_div == OJPH_TILEPART_NO_DIVISIONS)
         {
           for (ui32 r = 0; r <= max_decompositions; ++r)
             for (ui32 c = 0; c < num_comps; ++c)
               comps[c].write_precincts(r, file);
         }
-        else if (tilepart_div == OJPH_TILEPART_RESOLUTIONS) 
+        else if (tilepart_div == OJPH_TILEPART_RESOLUTIONS)
         {
-          for (ui32 r = 0; r <= max_decompositions; ++r) 
+          for (ui32 r = 0; r <= max_decompositions; ++r)
           {
             ui32 bytes = 0;
             for (ui32 c = 0; c < num_comps; ++c)
@@ -509,26 +607,26 @@ namespace ojph {
             ui16 t = swap_byte(JP2K_MARKER::SOD);
             if (!file->write(&t, 2))
               OJPH_ERROR(0x00030084, "Error writing to file");
-            
+
             //write precincts
             for (ui32 c = 0; c < num_comps; ++c)
-              comps[c].write_precincts(r, file);              
+              comps[c].write_precincts(r, file);
           }
         }
-        else 
+        else
         {
           ui32 num_tileparts = num_comps * (max_decompositions + 1);
           for (ui32 r = 0; r <= max_decompositions; ++r)
             for (ui32 c = 0; c < num_comps; ++c)
               if (r <= comps[c].get_num_decompositions()) {
                 //write tile header
-                if (!sot.write(file, comps[c].get_num_bytes(r), 
+                if (!sot.write(file, comps[c].get_num_bytes(r),
                                (ui8)(c + r * num_comps), (ui8)num_tileparts))
                   OJPH_ERROR(0x00030085, "Error writing to file");
                 //write start of data
                 ui16 t = swap_byte(JP2K_MARKER::SOD);
                 if (!file->write(&t, 2))
-                  OJPH_ERROR(0x00030086, "Error writing to file");                
+                  OJPH_ERROR(0x00030086, "Error writing to file");
                 comps[c].write_precincts(r, file);
               }
         }
@@ -790,15 +888,22 @@ namespace ojph {
           assert(0);
 
       }
-      catch (const char *error)
+      catch (const std::exception& error)
+      {
+        if (resilient)
+          OJPH_INFO(0x00030092, "%s", error.what())
+        else
+          OJPH_ERROR(0x00030092, "%s", error.what())
+      }
+      catch (...)
       {
         if (resilient)
-          OJPH_INFO(0x00030092, "%s", error)
+          OJPH_INFO(0x00030092, "Unknown exception while parsing tile header")
         else
-          OJPH_ERROR(0x00030092, "%s", error)
+          OJPH_ERROR(0x00030092, "Unknown exception while parsing tile header")
       }
       file->seek((si64)tile_end_location, infile_base::OJPH_SEEK_SET);
     }
 
   }
-}
\ No newline at end of file
+}
diff --git a/src/core/codestream/ojph_tile.h b/src/core/codestream/ojph_tile.h
index b00c8181..03143619 100644
--- a/src/core/codestream/ojph_tile.h
+++ b/src/core/codestream/ojph_tile.h
@@ -47,7 +47,7 @@ namespace ojph {
 
   ////////////////////////////////////////////////////////////////////////////
   //defined elsewhere
-  struct line_buf;
+  class line_buf;
   class codestream;
 
   namespace local {
@@ -63,8 +63,7 @@ namespace ojph {
       static void pre_alloc(codestream *codestream, const rect& tile_rect,
                             const rect& recon_tile_rect, ui32 &num_tileparts);
       void finalize_alloc(codestream *codestream, const rect& tile_rect,
-                          const rect& recon_tile_rect, ui32 tile_idx, 
-                          ui32 offset, ui32 &num_tileparts);
+                          ui32 tile_idx, ui32& offset, ui32 &num_tileparts);
 
       bool push(line_buf *line, ui32 comp_num);
       void prepare_for_flush();
@@ -77,12 +76,13 @@ namespace ojph {
 
     private:
       //codestream *parent;
-      rect tile_rect, recon_tile_rect;
+      rect tile_rect;
       ui32 num_comps;
       tile_comp *comps;
       ui32 num_lines;
       line_buf* lines;
-      bool reversible, employ_color_transform, resilient;
+      bool employ_color_transform, resilient;
+      bool *reversible;
       rect *comp_rects, *recon_comp_rects;
       ui32 *line_offsets;
       ui32 skipped_res_for_read;
@@ -90,6 +90,7 @@ namespace ojph {
       ui32 *num_bits;
       bool *is_signed;
       ui32 *cur_line;
+      ui8 *nlt_type3;
       int prog_order;
 
     private:
diff --git a/src/core/codestream/ojph_tile_comp.cpp b/src/core/codestream/ojph_tile_comp.cpp
index a2124e8b..a807769e 100644
--- a/src/core/codestream/ojph_tile_comp.cpp
+++ b/src/core/codestream/ojph_tile_comp.cpp
@@ -51,16 +51,18 @@ namespace ojph {
   {
 
     //////////////////////////////////////////////////////////////////////////
-    void tile_comp::pre_alloc(codestream *codestream, const rect& comp_rect,
+    void tile_comp::pre_alloc(codestream *codestream, ui32 comp_num, 
+                              const rect& comp_rect,
                               const rect& recon_comp_rect)
     {
       mem_fixed_allocator* allocator = codestream->get_allocator();
 
       //allocate a resolution
-      ui32 num_decomps = codestream->access_cod().get_num_decompositions();
+      ui32 num_decomps;
+      num_decomps = codestream->get_coc(comp_num)->get_num_decompositions();
       allocator->pre_alloc_obj<resolution>(1);
 
-      resolution::pre_alloc(codestream, comp_rect, recon_comp_rect, 
+      resolution::pre_alloc(codestream, comp_rect, recon_comp_rect, comp_num, 
                             num_decomps);
     }
 
@@ -72,7 +74,7 @@ namespace ojph {
       mem_fixed_allocator* allocator = codestream->get_allocator();
 
       //allocate a resolution
-      num_decomps = codestream->get_cod()->get_num_decompositions();
+      num_decomps = codestream->get_coc(comp_num)->get_num_decompositions();
 
       comp_downsamp = codestream->get_siz()->get_downsampling(comp_num);
       this->comp_rect = comp_rect;
@@ -82,7 +84,8 @@ namespace ojph {
       this->num_bytes = 0;
       res = allocator->post_alloc_obj<resolution>(1);
       res->finalize_alloc(codestream, comp_rect, recon_comp_rect, comp_num,
-                          num_decomps, comp_downsamp, this, NULL);
+                          num_decomps, comp_downsamp, comp_downsamp, this, 
+                          NULL);
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -128,13 +131,12 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     bool tile_comp::get_top_left_precinct(ui32 res_num, point &top_left)
     {
-      assert(res_num <= num_decomps);
-      res_num = num_decomps - res_num;
+      int resolution_num = (int)num_decomps - (int)res_num;
       resolution *r = res;
-      while (res_num > 0 && r != NULL)
+       while (resolution_num > 0 && r != NULL)
       {
         r = r->next_resolution();
-        --res_num;
+        --resolution_num;
       }
       if (r) //resolution does not exist if r is NULL
         return r->get_top_left_precinct(top_left);
@@ -145,13 +147,12 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void tile_comp::write_one_precinct(ui32 res_num, outfile_base *file)
     {
-      assert(res_num <= num_decomps);
-      res_num = num_decomps - res_num;
+      int resolution_num = (int)num_decomps - (int)res_num;
       resolution *r = res;
-      while (res_num > 0 && r != NULL)
+      while (resolution_num > 0 && r != NULL)
       {
         r = r->next_resolution();
-        --res_num;
+        --resolution_num;
       }
       if (r) //resolution does not exist if r is NULL
         r->write_one_precinct(file);
diff --git a/src/core/codestream/ojph_tile_comp.h b/src/core/codestream/ojph_tile_comp.h
index d7304d96..62b8fba2 100644
--- a/src/core/codestream/ojph_tile_comp.h
+++ b/src/core/codestream/ojph_tile_comp.h
@@ -48,7 +48,7 @@ namespace ojph {
 
   ////////////////////////////////////////////////////////////////////////////
   //defined elsewhere
-  struct line_buf;
+  class line_buf;
   class codestream;
 
   namespace local {
@@ -62,7 +62,8 @@ namespace ojph {
     class tile_comp
     {
     public:
-      static void pre_alloc(codestream *codestream, const rect& comp_rect,
+      static void pre_alloc(codestream *codestream, ui32 comp_num, 
+                            const rect& comp_rect,
                             const rect& recon_comp_rect);
       void finalize_alloc(codestream *codestream, tile *parent,
                           ui32 comp_num, const rect& comp_rect,
diff --git a/src/core/coding/ojph_block_common.cpp b/src/core/coding/ojph_block_common.cpp
index 6e1b53fd..2ba138a9 100644
--- a/src/core/coding/ojph_block_common.cpp
+++ b/src/core/coding/ojph_block_common.cpp
@@ -55,13 +55,13 @@ namespace ojph {
      *  \li \c cwd_len : 3bits -> the codeword length of the VLC codeword;    
      *                   the VLC cwd is in the LSB of bitstream              \n
      *  \li \c u_off   : 1bit  -> u_offset, which is 1 if u value is not 0   \n
-     *  \li \c rho     : 4bits -> signficant samples within a quad           \n
+     *  \li \c rho     : 4bits -> significant samples within a quad          \n
      *  \li \c e_1     : 4bits -> EMB e_1                                    \n
      *  \li \c e_k     : 4bits -> EMB e_k                                    \n
      *                                                                       \n
      *  The table index is 10 bits and composed of two parts:                \n
      *  The 7 LSBs contain a codeword which might be shorter than 7 bits;    
-     *  this word is the next decoable bits in the bitstream.                \n
+     *  this word is the next decodable bits in the bitstream.               \n
      *  The 3 MSB is the context of for the codeword.                        \n
      */
 
@@ -75,7 +75,7 @@ namespace ojph {
     //************************************************************************/
     /** @defgroup uvlc_decoding_tables_grp VLC decoding tables
      *  @{
-     *  UVLC decoding tables used to partiallu decode u values from UVLC     
+     *  UVLC decoding tables used to partially decode u values from UVLC     
      *  codewords.                                                           \n
      *  The table index is 8 (or 9)  bits and composed of two parts:         \n
      *  The 6 LSBs carries the head of the VLC to be decoded. Up to 6 bits to 
@@ -84,11 +84,20 @@ namespace ojph {
      *  + 4 * mel event for initial row of quads when needed                 \n
      *                                                                       \n
      *  Each entry contains, starting from the LSB                           \n
-     *  \li \c total prefix length for quads 0 and 1 (3 bits)                \n
-     *  \li \c total suffix length for quads 0 and 1 (4 bits)                \n
+     *  \li \c total total prefix length for quads 0 and 1 (3 bits)          \n
+     *  \li \c total total suffix length for quads 0 and 1 (4 bits)          \n
      *  \li \c suffix length for quad 0 (3 bits)                             \n
      *  \li \c prefix for quad 0 (3 bits)                                    \n
      *  \li \c prefix for quad 1 (3 bits)                                    \n
+     *                                                                       \n
+     *  Another table is uvlc_bias, which is needed to correctly decode the 
+     *  extension u_ext for initial row of quads. Under certain conditions,
+     *  we deduct 1 or 2 from u_q0 and u_q1 before encoding them; so for us 
+     *  to know that decoding u_ext is needed, we recreate the u_q0 and u_q1
+     *  that we actually encoded.                                            \n
+     *  For simplicity, we use the same index as before                      \n
+     *  \li \c u_q0 bias is 2 bits                                           \n
+     *  \li \c u_q1 bias is 2 bits                                           \n
      */
 
     /// @brief uvlc_tbl0 contains decoding information for initial row of quads
@@ -96,6 +105,8 @@ namespace ojph {
     /// @brief uvlc_tbl1 contains decoding information for non-initial row of 
     ///        quads
     ui16 uvlc_tbl1[256] = { 0 };
+    /// @brief uvlc_bias contains decoding info. for initial row of quads
+    ui8 uvlc_bias[256+64] = { 0 };
     /// @}
 
     //************************************************************************/
@@ -109,7 +120,7 @@ namespace ojph {
 
       //Data in the table is arranged in this format (taken from the standard)
       // c_q is the context for a quad
-      // rho is the signficance pattern for a quad
+      // rho is the significance pattern for a quad
       // u_off indicate if u value is 0 (u_off is 0), or communicated
       // e_k, e_1 EMB patterns
       // cwd VLC codeword
@@ -132,7 +143,7 @@ namespace ojph {
       if (debug) memset(vlc_tbl0, 0, sizeof(vlc_tbl0)); //unnecessary
 
       // this is to convert table entries into values for decoder look up
-      // There can be at most 1024 possibilites, not all of them are valid.
+      // There can be at most 1024 possibilities, not all of them are valid.
       // 
       for (int i = 0; i < 1024; ++i)
       {
@@ -199,8 +210,10 @@ namespace ojph {
         ui32 mode = i >> 6;
         ui32 vlc = i & 0x3F;
 
-        if (mode == 0)      // both u_off are 0
+        if (mode == 0) {      // both u_off are 0
           uvlc_tbl0[i] = 0;
+          uvlc_bias[i] = 0;
+        }
         else if (mode <= 2) // u_off are either 01 or 10
         {
           ui32 d = dec[vlc & 0x7];   //look at the least significant 3 bits
@@ -232,6 +245,7 @@ namespace ojph {
             total_suffix = u0_suffix_len;
             u0 = d0 >> 5;
             u1 = (vlc & 1) + 1;
+            uvlc_bias[i] = 4; // 0b00 for u0 and 0b01 for u1
           }
           else
           {
@@ -240,6 +254,7 @@ namespace ojph {
             total_suffix = u0_suffix_len + ((d1 >> 2) & 0x7);
             u0 = d0 >> 5;
             u1 = d1 >> 5;
+            uvlc_bias[i] = 0;
           }
 
           uvlc_tbl0[i] = (ui16)(total_prefix | 
@@ -265,6 +280,7 @@ namespace ojph {
                                (u0_suffix_len << 7) |
                                (u0 << 10) |
                                (u1 << 13));
+          uvlc_bias[i] = 10; // 0b10 for u0 and 0b10 for u1
         }
       }
 
diff --git a/src/core/coding/ojph_block_common.h b/src/core/coding/ojph_block_common.h
index 29a84bad..f8d65032 100644
--- a/src/core/coding/ojph_block_common.h
+++ b/src/core/coding/ojph_block_common.h
@@ -44,6 +44,6 @@ namespace ojph{
     extern ui16 vlc_tbl1[1024];
     extern ui16 uvlc_tbl0[256+64];
     extern ui16 uvlc_tbl1[256];
-
+    extern ui8 uvlc_bias[256+64];
   } // !namespace local
 } // !namespace ojph
diff --git a/src/core/coding/ojph_block_decoder.h b/src/core/coding/ojph_block_decoder.h
index dcd3220b..a1970174 100644
--- a/src/core/coding/ojph_block_decoder.h
+++ b/src/core/coding/ojph_block_decoder.h
@@ -50,7 +50,12 @@ namespace ojph {
 
     // generic decoder
     bool
-      ojph_decode_codeblock(ui8* coded_data, ui32* decoded_data,
+      ojph_decode_codeblock32(ui8* coded_data, ui32* decoded_data,
+        ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2,
+        ui32 width, ui32 height, ui32 stride, bool stripe_causal);
+
+    bool
+      ojph_decode_codeblock64(ui8* coded_data, ui64* decoded_data,
         ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2,
         ui32 width, ui32 height, ui32 stride, bool stripe_causal);
 
@@ -60,6 +65,12 @@ namespace ojph {
         ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2,
         ui32 width, ui32 height, ui32 stride, bool stripe_causal);
 
+    // AVX2-accelerated decoder
+    bool
+      ojph_decode_codeblock_avx2(ui8* coded_data, ui32* decoded_data,
+        ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2,
+        ui32 width, ui32 height, ui32 stride, bool stripe_causal);
+
     // WASM SIMD-accelerated decoder
     bool
       ojph_decode_codeblock_wasm(ui8* coded_data, ui32* decoded_data,
diff --git a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder32.cpp
similarity index 97%
rename from src/core/coding/ojph_block_decoder.cpp
rename to src/core/coding/ojph_block_decoder32.cpp
index 8c287990..daf2312d 100644
--- a/src/core/coding/ojph_block_decoder.cpp
+++ b/src/core/coding/ojph_block_decoder32.cpp
@@ -64,7 +64,7 @@ namespace ojph {
       dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
         k(0), num_runs(0), runs(0)
       {}
-      // data decoding machinary
+      // data decoding machinery
       ui8* data;    //!<the address of data (or bitstream)
       ui64 tmp;     //!<temporary buffer for read data
       int bits;     //!<number of bits stored in tmp
@@ -505,7 +505,7 @@ namespace ojph {
      *         an architecture that read size must be compatible with the
      *         alignment of the read address
      *
-     *  There is another simiar subroutine rev_init.  This subroutine does 
+     *  There is another similar subroutine rev_init.  This subroutine does 
      *  NOT skip the first 12 bits, and starts with unstuff set to true.
      *
      *  @param [in]  mrp is a pointer to rev_struct structure
@@ -578,7 +578,7 @@ namespace ojph {
     /** @brief State structure for reading and unstuffing of forward-growing 
      *         bitstreams; these are: MagSgn and SPP bitstreams
      */
-    struct frwd_struct {
+    struct frwd_struct32 {
       const ui8* data;  //!<pointer to bitstream
       ui64 tmp;         //!<temporary buffer of read data
       ui32 bits;        //!<number of bits stored in tmp
@@ -601,12 +601,12 @@ namespace ojph {
      *  Reading can go beyond the end of buffer by up to 3 bytes.
      *
      *  @tparam       X is the value fed in when the bitstream is exhausted
-     *  @param  [in]  msp is a pointer to frwd_struct structure
+     *  @param  [in]  msp is a pointer to frwd_struct32 structure
      *
      */ 
     template<int X>
     static inline 
-    void frwd_read(frwd_struct *msp)
+    void frwd_read(frwd_struct32 *msp)
     {
       assert(msp->bits <= 32); // assert that there is a space for 32 bits
 
@@ -653,17 +653,17 @@ namespace ojph {
     }
 
     //************************************************************************/
-    /** @brief Initialize frwd_struct struct and reads some bytes
+    /** @brief Initialize frwd_struct32 struct and reads some bytes
      *  
      *  @tparam      X is the value fed in when the bitstream is exhausted.
      *               See frwd_read regarding the template
-     *  @param [in]  msp is a pointer to frwd_struct
+     *  @param [in]  msp is a pointer to frwd_struct32
      *  @param [in]  data is a pointer to the start of data
      *  @param [in]  size is the number of byte in the bitstream
      */
     template<int X>
     static inline
-    void frwd_init(frwd_struct *msp, const ui8* data, int size)
+    void frwd_init(frwd_struct32 *msp, const ui8* data, int size)
     {
       msp->data = data;
       msp->tmp = 0;
@@ -689,13 +689,13 @@ namespace ojph {
     }
 
     //************************************************************************/
-    /** @brief Consume num_bits bits from the bitstream of frwd_struct
+    /** @brief Consume num_bits bits from the bitstream of frwd_struct32
      *
-     *  @param [in]  msp is a pointer to frwd_struct
+     *  @param [in]  msp is a pointer to frwd_struct32
      *  @param [in]  num_bits is the number of bit to consume
      */
     static inline 
-    void frwd_advance(frwd_struct *msp, ui32 num_bits)
+    void frwd_advance(frwd_struct32 *msp, ui32 num_bits)
     {
       assert(num_bits <= msp->bits);
       msp->tmp >>= num_bits;  // consume num_bits
@@ -703,15 +703,15 @@ namespace ojph {
     }
 
     //************************************************************************/
-    /** @brief Fetches 32 bits from the frwd_struct bitstream
+    /** @brief Fetches 32 bits from the frwd_struct32 bitstream
      *
      *  @tparam      X is the value fed in when the bitstream is exhausted.
      *               See frwd_read regarding the template
-     *  @param [in]  msp is a pointer to frwd_struct
+     *  @param [in]  msp is a pointer to frwd_struct32
      */
     template<int X>
     static inline 
-    ui32 frwd_fetch(frwd_struct *msp)
+    ui32 frwd_fetch(frwd_struct32 *msp)
     {
       if (msp->bits < 32)
       {
@@ -739,11 +739,11 @@ namespace ojph {
      *  @param [in]   stride is the decoded codeblock buffer stride 
      *  @param [in]   stripe_causal is true for stripe causal mode
      */
-    bool ojph_decode_codeblock(ui8* coded_data, ui32* decoded_data,
-                               ui32 missing_msbs, ui32 num_passes,
-                               ui32 lengths1, ui32 lengths2,
-                               ui32 width, ui32 height, ui32 stride,
-                               bool stripe_causal)
+    bool ojph_decode_codeblock32(ui8* coded_data, ui32* decoded_data,
+                                 ui32 missing_msbs, ui32 num_passes,
+                                 ui32 lengths1, ui32 lengths2,
+                                 ui32 width, ui32 height, ui32 stride,
+                                 bool stripe_causal)
     {
       static bool insufficient_precision = false;
       static bool modify_code = false;
@@ -753,14 +753,14 @@ namespace ojph {
       {
         OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
                               "one coding pass, but zero length for "
-                              "2nd and potential 3rd pass.\n");
+                              "2nd and potential 3rd pass.");
         num_passes = 1;
       }
 
       if (num_passes > 3)
       {
         OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
-                              "This codeblocks has %d passes.\n",
+                              "This codeblocks has %d passes.",
                               num_passes);
         return false;
       }
@@ -772,7 +772,7 @@ namespace ojph {
           insufficient_precision = true;
           OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
                                 "codeblock. This message will not be "
-                                "displayed again.\n");
+                                "displayed again.");
         }
         return false;
       }       
@@ -783,7 +783,7 @@ namespace ojph {
           OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
                                 "pass. The code can be modified to support "
                                 "this case. This message will not be "
-                                "displayed again.\n");
+                                "displayed again.");
         }
          return false;         // 32 bits are not enough to decode this
        }
@@ -796,7 +796,7 @@ namespace ojph {
             OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
                                   "nor MagRef passes; both will be skipped. "
                                   "This message will not be displayed "
-                                  "again.\n");
+                                  "again.");
           }
         }
       }
@@ -806,7 +806,7 @@ namespace ojph {
 
       if (lengths1 < 2)
       {
-        OJPH_WARN(0x00010006, "Wrong codeblock length.\n");
+        OJPH_WARN(0x00010006, "Wrong codeblock length.");
         return false;
       }
 
@@ -1079,7 +1079,7 @@ namespace ojph {
             // quad 0 length
             len = uvlc_entry & 0x7; // quad 0 suffix length
             uvlc_entry >>= 3;
-            ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q
+            ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
             sp[1] = u_q;
             u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
             sp[3] = u_q;
@@ -1099,7 +1099,7 @@ namespace ojph {
         const int v_n_size = 512 + 4;
         ui32 v_n_scratch[v_n_size] = {0};  // 2+ kB
 
-        frwd_struct magsgn;
+        frwd_struct32 magsgn;
         frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
 
         ui16 *sp = scratch;
@@ -1217,7 +1217,7 @@ namespace ojph {
 
             ui32 gamma = inf & 0xF0; gamma &= gamma - 0x10; //is gamma_q 1?
             ui32 emax = vp[0] | vp[1];
-            emax = 31 - count_leading_zeros(emax | 2); // emax - 1            
+            emax = 31 - count_leading_zeros(emax | 2); // emax - 1
             ui32 kappa = gamma ? emax : 1;
 
             ui32 U_q = u_q + kappa;
@@ -1368,7 +1368,7 @@ namespace ojph {
           // We add an extra 8 entries, just in case we need more
           ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
 
-          frwd_struct sigprop;
+          frwd_struct32 sigprop;
           frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
 
           for (ui32 y = 0; y < height; y += 4)
@@ -1613,4 +1613,4 @@ namespace ojph {
       return true;
     }
   }
-}
+}
\ No newline at end of file
diff --git a/src/core/coding/ojph_block_decoder64.cpp b/src/core/coding/ojph_block_decoder64.cpp
new file mode 100644
index 00000000..bce5b9ec
--- /dev/null
+++ b/src/core/coding/ojph_block_decoder64.cpp
@@ -0,0 +1,1663 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2019, The University of New South Wales, Australia
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_block_decoder64.cpp
+// Author: Aous Naman
+// Date: 13 May 2022
+//***************************************************************************/
+
+//***************************************************************************/
+/** @file ojph_block_decoder64.cpp
+ *  @brief implements a HTJ2K block decoder
+ */
+
+#include <string>
+#include <iostream>
+
+#include <cassert>
+#include <cstring>
+#include "ojph_block_common.h"
+#include "ojph_block_decoder.h"
+#include "ojph_arch.h"
+#include "ojph_message.h"
+
+namespace ojph {
+  namespace local {
+
+    //************************************************************************/
+    /** @brief MEL state structure for reading and decoding the MEL bitstream
+     *
+     *  A number of events is decoded from the MEL bitstream ahead of time
+     *  and stored in runs/num_runs.
+     *  Each run encodes a stretch of zero events; see mel_decode for layout.
+     */
+    struct dec_mel_st {
+      dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
+        k(0), num_runs(0), runs(0)
+      {}
+      // data decoding machinery
+      ui8* data;    //!<the address of data (or bitstream)
+      ui64 tmp;     //!<temporary buffer for read data
+      int bits;     //!<number of bits stored in tmp
+      int size;     //!<number of bytes left to read
+      bool unstuff; //!<true if the next byte needs to be unstuffed
+      int k;        //!<state of MEL decoder (index into mel_exp, 0 to 12)
+
+      // queue of decoded runs
+      int num_runs; //!<number of decoded runs left in runs (maximum 8)
+      ui64 runs;    //!<runs of decoded MEL codewords (7 bits/run)
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs the MEL bitstream
+     *
+     *  This design needs more bytes in the codeblock buffer than the length
+     *  of the cleanup pass by up to 2 bytes.
+     *
+     *  Unstuffing removes the MSB of the byte following a byte whose
+     *  value is 0xFF; this prevents sequences larger than 0xFF7F in value
+     *  from appearing in the bitstream.
+     *
+     *  @param [in]  melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    void mel_read(dec_mel_st *melp)
+    {
+      if (melp->bits > 32)  //there are enough bits in the tmp variable
+        return;             // return without reading new data
+
+      ui32 val = 0xFFFFFFFF;       // feed in 0xFF if buffer is exhausted
+      if (melp->size > 4) {        // if more than 4 bytes are left
+        val = *(ui32*)melp->data;  // read 32 bits from MEL data
+        melp->data += 4;           // advance pointer
+        melp->size -= 4;           // reduce counter
+      }
+      else if (melp->size > 0)
+      { // 1 to 4 bytes left
+        int i = 0;
+        while (melp->size > 1) {
+          ui32 v = *melp->data++;    // read one byte at a time
+          ui32 m = ~(0xFFu << i);    // mask of location
+          val = (val & m) | (v << i);// put one byte in its correct location
+          --melp->size;
+          i += 8;
+        }
+        // size equal to 1
+        ui32 v = *melp->data++;    // the one before the last is different
+        v |= 0xF;                  // MEL and VLC segments can overlap
+        ui32 m = ~(0xFFu << i);
+        val = (val & m) | (v << i);
+        --melp->size;
+      }
+
+      // next we unstuff them before adding them to the buffer
+      int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
+                                     // the previously read byte requires
+                                     // unstuffing
+
+      // data is unstuffed and accumulated in t
+      // bits has the number of bits in t
+      ui32 t = val & 0xFF;
+      bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
+      bits -= unstuff; // there is one less bit in t if unstuffing is needed
+      t = t << (8 - unstuff); // move up to make room for the next byte
+
+      //this is a repeat of the above
+      t |= (val>>8) & 0xFF;
+      unstuff = (((val >> 8) & 0xFF) == 0xFF);
+      bits -= unstuff;
+      t = t << (8 - unstuff);
+
+      t |= (val>>16) & 0xFF;
+      unstuff = (((val >> 16) & 0xFF) == 0xFF);
+      bits -= unstuff;
+      t = t << (8 - unstuff);
+
+      t |= (val>>24) & 0xFF;
+      melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
+
+      // move t to tmp, and push the result all the way up, so we read from
+      // the MSB
+      melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
+      melp->bits += bits; //increment the number of bits in tmp
+    }
+
+    //************************************************************************/
+    /** @brief Decodes unstuffed MEL segment bits stored in tmp to runs
+     *
+     *  Runs are stored in "runs" and the number of runs in "num_runs".
+     *  Each run represents a number of zero events that may or may not
+     *  terminate in a 1 event.
+     *  Each run is stored in 7 bits.  The LSB is 1 if the run terminates in
+     *  a 1 event, 0 otherwise.  The remaining upper bits hold, for a run
+     *  terminating in a 1 event, the number of consecutive zero events; for
+     *  a run not terminating in a 1 event, they hold (number of consecutive
+     *  zero events - 1).
+     *  A total of 6 bits (made up of 1 + 5) should have been enough.
+     *
+     *  @param [in]  melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    void mel_decode(dec_mel_st *melp)
+    {
+      static const int mel_exp[13] = { //MEL exponents
+        0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
+      };
+
+      if (melp->bits < 6) // if there are fewer than 6 bits in tmp
+        mel_read(melp);   // then read from the MEL bitstream
+                          // 6 bits is the largest decodable MEL cwd
+
+      //repeat so long that there is enough decodable bits in tmp,
+      // and the runs store is not full (num_runs < 8)
+      while (melp->bits >= 6 && melp->num_runs < 8)
+      {
+        int eval = mel_exp[melp->k]; // number of bits associated with state
+        int run = 0;
+        if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB)
+        { //one is found
+          run = 1 << eval;
+          run--; // consecutive runs of 0 events - 1
+          melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
+          melp->tmp <<= 1; // consume one bit from tmp
+          melp->bits -= 1;
+          run = run << 1; // a stretch of zeros not terminating in one
+        }
+        else
+        { //0 is found
+          run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
+          melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
+          melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
+          melp->bits -= eval + 1;
+          run = (run << 1) + 1; // a stretch of zeros terminating with one
+        }
+        eval = melp->num_runs * 7;           // bit offset, 7 bits per run
+        melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
+        melp->runs |= ((ui64)run) << eval;   // store the value in runs
+        melp->num_runs++;                    // increment count
+      }
+    }
+
+    //************************************************************************/
+    /** @brief Initiates a dec_mel_st structure for MEL decoding and reads
+     *         some bytes in order to get the read address to a multiple
+     *         of 4
+     *
+     *  @param [in]  melp is a pointer to dec_mel_st structure
+     *  @param [in]  bbuf is a pointer to byte buffer
+     *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in]  scup is the length of MEL+VLC segments
+     */
+    static inline
+    void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
+    {
+      melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
+      melp->bits = 0;                  // 0 bits in tmp
+      melp->tmp = 0;                   //
+      melp->unstuff = false;           // no unstuffing
+      melp->size = scup - 1;           // size is the length of MEL+VLC-1
+      melp->k = 0;                     // 0 for state
+      melp->num_runs = 0;              // num_runs is 0
+      melp->runs = 0;                  //
+
+      //This code is borrowed; original is for a different architecture
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the MEL segment
+      int num = 4 - (int)(intptr_t(melp->data) & 0x3);
+      for (int i = 0; i < num; ++i) { // this code is similar to mel_read
+        assert(melp->unstuff == false || melp->data[0] <= 0x8F);
+        ui64 d = (melp->size > 0) ? *melp->data : 0xFF;//if buffer is consumed
+                                                       //set data to 0xFF
+        if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
+                                       // see the standard
+        melp->data += melp->size-- > 0; //increment if the end is not reached
+        int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
+        melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
+        melp->bits += d_bits;  //increment tmp by number of bits
+        melp->unstuff = ((d & 0xFF) == 0xFF); //true if next byte needs
+                                              //unstuffing
+      }
+      melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
+                                       // is the MSB
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves one run from dec_mel_st; if there are no runs stored
+     *         MEL segment is decoded
+     *
+     * @param [in]  melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    int mel_get_run(dec_mel_st *melp)
+    {
+      if (melp->num_runs == 0)  //if no runs, decode more bits from MEL segment
+        mel_decode(melp);
+
+      int t = melp->runs & 0x7F; //retrieve one run (lowest 7 bits)
+      melp->runs >>= 7;  // remove the retrieved run
+      melp->num_runs--;
+      return t; // return run
+    }
+
+    //************************************************************************/
+    /** @brief A structure for reading and unstuffing a segment that grows
+     *         backward, such as VLC and MRP
+     */
+    struct rev_struct {
+      rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
+      {}
+      //storage
+      ui8* data;     //!<pointer to where to read data
+      ui64 tmp;      //!<temporary buffer of read data
+      ui32 bits;     //!<number of bits stored in tmp
+      int size;      //!<number of bytes left
+      bool unstuff;  //!<true if the last byte is more than 0x8F
+                     //!<then the current byte is unstuffed if it is 0x7F
+    };
+
+    //************************************************************************/
+    /** @brief Read and unstuff data from a backwardly-growing segment
+     *
+     *  This reader reads 8 bits from the VLC segment. It fills zeros when
+     *  the buffer is exhausted; we basically do not care about these zeros
+     *  because we should not need them -- any extra data should not be used
+     *  in the actual decoding. If these bytes are needed, then there is a
+     *  problem in the bitstream, but we do not flag this error.
+     *
+     *  Unstuffing is needed to prevent sequences larger than 0xFF8F from
+     *  appearing in the bitstream; since we are reading backward, we keep
+     *  watch when a value larger than 0x8F appears in the bitstream.
+     *  If the byte following this is 0x7F, we unstuff this byte (ignore the
+     *  MSB of that byte, which should be 0).
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read8(rev_struct *vlcp)
+    {
+      // process 1 byte
+      ui8 val = 0; // insert 0s at the end -- the standard says that the
+                   // bitstream must contain all needed bits. Therefore
+                   // if the whole bitstream is consumed and bits are still
+                   // needed, then this is an error condition, but we are
+                   // lenient -- it is also possible that we are decoding
+                   // more bits than what we actually need.
+      if (vlcp->size > 0)  // if there is at least one byte left in VLC
+      {
+        val = *vlcp->data; // then read 8 bits
+        --vlcp->data;      // move data pointer backward
+        --vlcp->size;      // decrement number of bytes in the buffer
+      }
+
+      // accumulate in tmp, and increment bits, check if unstuffing is needed
+      ui8 t = (vlcp->unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0;
+      val = (ui8)(val & (0xFFU >> t)); // protect against erroneous 1 in MSB
+      vlcp->tmp |= (ui64)val << vlcp->bits;
+      vlcp->bits += 8 - t;
+      vlcp->unstuff = val > 0x8F;
+    }
+
+    //************************************************************************/
+    /** @brief Initiates the rev_struct structure and reads the first byte
+     *
+     *  This subroutine initializes the VLC decoder.  It discards the first
+     *  12 bits (they have the sum of the lengths of VLC and MEL segments),
+     *  and depending on unstuffing, stores 3 or 4 bits in the unstuffed
+     *  decoded buffer.
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     *  @param [in]  data is a pointer to byte at the start of the cleanup pass
+     *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in]  scup is the length of MEL+VLC segments
+     */
+    static inline
+    void rev_init8(rev_struct *vlcp, ui8* data, int lcup, int scup)
+    {
+      //first byte has only the upper 4 bits
+      vlcp->data = data + lcup - 2;
+
+      //size can not be larger than this, in fact it should be smaller
+      vlcp->size = scup - 2;
+
+      ui8 val = *vlcp->data--; // read one byte (only its upper nibble is used)
+
+      // the first byte is treated different to other bytes, because only
+      // the MSB nibble is part of the VLC code.
+      val = (ui8)(val >> 4);
+      ui8 t = ((val & 0x7) == 0x7) ? 1 : 0; // unstuffing is needed
+      val = (ui8)(val & (0xFU >> t)); // protect against erroneous 1 in MSB
+      vlcp->tmp = val;
+      vlcp->bits = 4 - t;
+      vlcp->unstuff = val > 0x8; //this is useful for the next byte
+    }
+
+    //************************************************************************/
+    /** @brief Fills the temporary variable (vlcp->tmp) by up to 64 bits
+     *
+     *  By the end of this call, vlcp->tmp holds more than 56 bits
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     */
+    static inline
+    ui64 rev_fetch64(rev_struct *vlcp)
+    {
+      while (vlcp->bits <= 56)
+        rev_read8(vlcp); // read 8 bits, but unstuffing might reduce this
+      return vlcp->tmp;  // return the unstuffed bits
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct structure
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     *  @param [in]  num_bits is the number of bits to be removed
+     */
+    static inline
+    ui64 rev_advance64(rev_struct *vlcp, ui32 num_bits)
+    {
+      assert(num_bits <= vlcp->bits); // vlcp->tmp must hold at least num_bits
+      vlcp->tmp >>= num_bits;         // remove bits
+      vlcp->bits -= num_bits;         // decrement the number of bits
+      return vlcp->tmp;
+    }
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs from rev_struct
+     *
+     *  This reads 32 bits at a time, unlike rev_read8 which reads one byte
+     *  at a time.  Zeros are filled in when the available data is
+     *  consumed.
+     *
+     *  See rev_read8 for more information about unstuffing
+     *
+     *  @param [in]  mrp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read_mrp(rev_struct *mrp)
+    {
+      //process 4 bytes at a time
+      if (mrp->bits > 32)
+        return;
+      ui32 val = 0;
+      if (mrp->size > 3) // If there are more than 3 bytes
+      { // (mrp->data - 3) move pointer back to read 32 bits at once
+        val = *(ui32*)(mrp->data - 3); // read 32 bits
+        mrp->data -= 4;                // move back pointer
+        mrp->size -= 4;                // reduce count
+      }
+      else if (mrp->size > 0)
+      {
+        int i = 24;
+        while (mrp->size > 0) {
+          ui32 v = *mrp->data--; // read one byte at a time
+          val |= (v << i);       // put byte in its correct location
+          --mrp->size;
+          i -= 8;
+        }
+      }
+
+      //accumulate in tmp, and keep count in bits
+      ui32 bits, tmp = val >> 24;
+
+      //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
+      bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
+      bool unstuff = (val >> 24) > 0x8F;
+
+      //process the next byte
+      tmp |= ((val >> 16) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 16) & 0xFF) > 0x8F;
+
+      tmp |= ((val >> 8) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 8) & 0xFF) > 0x8F;
+
+      tmp |= (val & 0xFF) << bits;
+      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = (val & 0xFF) > 0x8F;
+
+      mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer
+      mrp->bits += bits;
+      mrp->unstuff = unstuff;             // next byte
+    }
+
+    //************************************************************************/
+    /** @brief Initializes rev_struct structure for MRP segment, and reads
+     *         a number of bytes such that the next 32 bits read are from
+     *         an address that is a multiple of 4. Note this is designed for
+     *         an architecture that read size must be compatible with the
+     *         alignment of the read address
+     *
+     *  There is another similar subroutine rev_init8.  This subroutine does
+     *  NOT skip the first 12 bits, and starts with unstuff set to true.
+     *
+     *  @param [in]  mrp is a pointer to rev_struct structure
+     *  @param [in]  data is a pointer to byte at the start of the cleanup pass
+     *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in]  len2 is the length of SPP+MRP segments
+     */
+    static inline
+    void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
+    {
+      mrp->data = data + lcup + len2 - 1;
+      mrp->size = len2;
+      mrp->unstuff = true;
+      mrp->bits = 0;
+      mrp->tmp = 0;
+
+      //This code is designed for an architecture that read address should
+      // align to the read size (address multiple of 4 if read size is 4)
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the MRP stream
+      int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
+      for (int i = 0; i < num; ++i) {
+        ui64 d;
+        //read a byte, 0 if no more data
+        d = (mrp->size-- > 0) ? *mrp->data-- : 0;
+        //check if unstuffing is needed
+        ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
+        mrp->tmp |= d << mrp->bits; // move data to mrp->tmp
+        mrp->bits += d_bits;
+        mrp->unstuff = d > 0x8F; // for next byte
+      }
+      rev_read_mrp(mrp);
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves 32 bits from the head of a rev_struct structure
+     *
+     *  By the end of this call, mrp->tmp holds at least 32 bits
+     *
+     *  @param [in]  mrp is a pointer to rev_struct structure
+     */
+    static inline
+    ui32 rev_fetch_mrp(rev_struct *mrp)
+    {
+      if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp
+      {
+        rev_read_mrp(mrp);    // read up to 32 bits from mrp
+        if (mrp->bits < 32)   // if there are still fewer than 32 bits
+          rev_read_mrp(mrp);  // read more
+      }
+      return (ui32)mrp->tmp;  // return the head of mrp->tmp
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct; returns the new head
+     *
+     *  @param [in]  mrp is a pointer to rev_struct structure
+     *  @param [in]  num_bits is the number of bits to be removed
+     */
+    static inline
+    ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
+    {
+      assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
+      mrp->tmp >>= num_bits;  // discard the lowest num_bits bits
+      mrp->bits -= num_bits;
+      return (ui32)mrp->tmp;  // return data after consumption
+    }
+
+    //************************************************************************/
+    /** @brief State structure for reading and unstuffing of forward-growing
+     *         bitstreams; these are: MagSgn and SPP bitstreams
+     */
+    struct frwd_struct64 {
+      const ui8* data;  //!<pointer to bitstream
+      ui64 tmp;         //!<temporary buffer of read data
+      ui32 bits;        //!<number of bits stored in tmp
+      ui32 unstuff;     //!<1 if a bit needs to be unstuffed from next byte
+      int size;         //!<number of bytes left to read
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs 32 bits from forward-growing bitstream
+     *
+     *  A template is used to accommodate a different requirement for
+     *  MagSgn and SPP bitstreams; in particular, when MagSgn bitstream is
+     *  consumed, 0xFF's are fed, while when SPP is exhausted 0's are fed in.
+     *  X controls this value.
+     *
+     *  Unstuffing prevents sequences that are more than 0xFF7F from appearing
+     *  in the compressed sequence.  So whenever a value of 0xFF is coded, the
+     *  MSB of the next byte is set 0 and must be ignored during decoding.
+     *
+     *  A 32-bit read is performed only when more than 3 bytes remain.
+     *
+     *  @tparam       X is the value fed in when the bitstream is exhausted
+     *  @param  [in]  msp is a pointer to frwd_struct64 structure
+     *
+     */
+    template<int X>
+    static inline
+    void frwd_read(frwd_struct64 *msp)
+    {
+      assert(msp->bits <= 32); // assert that there is a space for 32 bits
+
+      ui32 val = 0;
+      if (msp->size > 3) {
+        val = *(ui32*)msp->data;  // read 32 bits
+        msp->data += 4;           // increment pointer
+        msp->size -= 4;           // reduce size
+      }
+      else if (msp->size > 0)
+      {
+        int i = 0;
+        val = X != 0 ? 0xFFFFFFFFu : 0;
+        while (msp->size > 0) {
+          ui32 v = *msp->data++;    // read one byte at a time
+          ui32 m = ~(0xFFu << i);    // mask of location
+          val = (val & m) | (v << i);// put one byte in its correct location
+          --msp->size;
+          i += 8;
+        }
+      }
+      else
+        val = X != 0 ? 0xFFFFFFFFu : 0;
+
+      // we accumulate in t and keep a count of the number of bits in bits
+      ui32 bits = 8 - msp->unstuff;
+      ui32 t = val & 0xFF;
+      bool unstuff = ((val & 0xFF) == 0xFF);  // Do we need unstuffing next?
+
+      t |= ((val >> 8) & 0xFF) << bits;
+      bits += 8 - unstuff;
+      unstuff = (((val >> 8) & 0xFF) == 0xFF);
+
+      t |= ((val >> 16) & 0xFF) << bits;
+      bits += 8 - unstuff;
+      unstuff = (((val >> 16) & 0xFF) == 0xFF);
+
+      t |= ((val >> 24) & 0xFF) << bits;
+      bits += 8 - unstuff;
+      msp->unstuff = (((val >> 24) & 0xFF) == 0xFF); // for next byte
+
+      msp->tmp |= ((ui64)t) << msp->bits;  // move data to msp->tmp
+      msp->bits += bits;
+    }
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs 8 bits from forward-growing bitstream
+     *
+     *  A template is used to accommodate a different requirement for
+     *  MagSgn and SPP bitstreams; in particular, when MagSgn bitstream is
+     *  consumed, 0xFF's are fed, while when SPP is exhausted 0's are fed in.
+     *  X controls this value.
+     *
+     *  Unstuffing prevents sequences that are more than 0xFF7F from appearing
+     *  in the compressed sequence.  So whenever a value of 0xFF is coded, the
+     *  MSB of the next byte is set 0 and must be ignored during decoding.
+     *
+     *  @tparam       X is the value fed in when the bitstream is exhausted
+     *  @param  [in]  msp is a pointer to frwd_struct64 structure
+     *
+     */
+    template<ui8 X>
+    static inline
+    void frwd_read8(frwd_struct64 *msp)
+    {
+      ui8 val = X;
+      if (msp->size > 0) {
+        val = *msp->data;  // read 8 bits
+        ++msp->data;      // increment pointer
+        --msp->size;      // reduce size
+      }
+
+      // unstuff and accumulate
+      ui8 t = msp->unstuff ? 1 : 0;
+      val = (ui8)(val & (0xFFU >> t));
+      msp->unstuff = (val == 0xFF);
+      msp->tmp |= ((ui64)val) << msp->bits;  // move data to msp->tmp
+      msp->bits += 8 - t;
+    }
+
+    //************************************************************************/
+    /** @brief Initializes frwd_struct64 struct and reads some bytes
+     *
+     *  @tparam      X is the value fed in when the bitstream is exhausted.
+     *               See frwd_read regarding the template
+     *  @param [in]  msp is a pointer to frwd_struct64
+     *  @param [in]  data is a pointer to the start of data
+     *  @param [in]  size is the number of bytes in the bitstream
+     */
+    template<int X>
+    static inline
+    void frwd_init(frwd_struct64 *msp, const ui8* data, int size)
+    {
+      msp->data = data;
+      msp->tmp = 0;
+      msp->bits = 0;
+      msp->unstuff = 0;
+      msp->size = size;
+
+      //This code is designed for an architecture that read address should
+      // align to the read size (address multiple of 4 if read size is 4)
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the bitstream
+      int num = 4 - (int)(intptr_t(msp->data) & 0x3);
+      for (int i = 0; i < num; ++i)
+      {
+        ui64 d;
+        //read a byte if the buffer is not exhausted, otherwise set it to X
+        d = msp->size-- > 0 ? *msp->data++ : X;
+        msp->tmp |= (d << msp->bits);      // store data in msp->tmp
+        msp->bits += 8 - msp->unstuff;     // number of bits added to msp->tmp
+        msp->unstuff = ((d & 0xFF) == 0xFF); // unstuffing for next byte
+      }
+      frwd_read<X>(msp); // read 32 bits more
+    }
+
+    //************************************************************************/
+    /** @brief Initializes frwd_struct64 struct and reads the first byte
+     *
+     *  @tparam      X is the value fed in when the bitstream is exhausted.
+     *               See frwd_read8 regarding the template
+     *  @param [in]  msp is a pointer to frwd_struct64
+     *  @param [in]  data is a pointer to the start of data
+     *  @param [in]  size is the number of bytes in the bitstream
+     */
+    template<ui8 X>
+    static inline
+    void frwd_init8(frwd_struct64 *msp, const ui8* data, int size)
+    {
+      msp->data = data;
+      msp->tmp = 0;
+      msp->bits = 0;
+      msp->unstuff = 0;
+      msp->size = size;
+      frwd_read8<X>(msp); // read 8 bits
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits bits from the bitstream of frwd_struct64
+     *
+     *  @param [in]  msp is a pointer to frwd_struct64
+     *  @param [in]  num_bits is the number of bits to consume
+     */
+    static inline
+    void frwd_advance(frwd_struct64 *msp, ui32 num_bits)
+    {
+      assert(num_bits <= msp->bits);
+      msp->tmp >>= num_bits;  // consume num_bits
+      msp->bits -= num_bits;
+    }
+
+    //************************************************************************/
+    /** @brief Fetches 32 bits from the frwd_struct64 bitstream
+     *
+     *  @tparam      X is the value fed in when the bitstream is exhausted.
+     *               See frwd_read regarding the template
+     *  @param [in]  msp is a pointer to frwd_struct64
+     */
+    template<int X>
+    static inline
+    ui32 frwd_fetch(frwd_struct64 *msp)
+    {
+      if (msp->bits < 32)
+      {
+        frwd_read<X>(msp);
+        if (msp->bits < 32) //a read can add < 32 bits when unstuffing
+          frwd_read<X>(msp);
+      }
+      return (ui32)msp->tmp;
+    }
+
+    //************************************************************************/
+    /** @brief Fetches up to 64 bits from the frwd_struct64 bitstream
+     *
+     *  @tparam      X is the value fed in when the bitstream is exhausted.
+     *               See frwd_read8 regarding the template
+     *  @param [in]  msp is a pointer to frwd_struct64
+     */
+    template<ui8 X>
+    static inline
+    ui64 frwd_fetch64(frwd_struct64 *msp)
+    {
+      while (msp->bits <= 56) // top up one byte at a time to more than 56 bits
+        frwd_read8<X>(msp);
+      return msp->tmp;
+    }
+
+    //************************************************************************/
+    /** @brief Decodes one codeblock, processing the cleanup, significance
+     *         propagation, and magnitude refinement pass
+     *
+     *  @param [in]   coded_data is a pointer to bitstream
+     *  @param [in]   decoded_data is a pointer to decoded codeblock data buf.
+     *  @param [in]   missing_msbs is the number of missing MSBs
+     *  @param [in]   num_passes is the number of passes: 1 if CUP only,
+     *                2 for CUP+SPP, and 3 for CUP+SPP+MRP
+     *  @param [in]   lengths1 is the length of cleanup pass
+     *  @param [in]   lengths2 is the length of refinement passes (either SPP
+     *                only or SPP+MRP)
+     *  @param [in]   width is the decoded codeblock width 
+     *  @param [in]   height is the decoded codeblock height
+     *  @param [in]   stride is the decoded codeblock buffer stride 
+     *  @param [in]   stripe_causal is true for stripe causal mode
+     */
+    bool ojph_decode_codeblock64(ui8* coded_data, ui64* decoded_data,
+                                 ui32 missing_msbs, ui32 num_passes,
+                                 ui32 lengths1, ui32 lengths2,
+                                 ui32 width, ui32 height, ui32 stride,
+                                 bool stripe_causal)
+    {
+      // static bool insufficient_precision = false;
+      // static bool modify_code = false;
+      // static bool truncate_spp_mrp = false;
+
+      if (num_passes > 1 && lengths2 == 0)
+      {
+        OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
+                              "one coding pass, but zero length for "
+                              "2nd and potential 3rd pass.");
+        num_passes = 1;
+      }
+
+      if (num_passes > 3)
+      {
+        OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
+                              "This codeblocks has %d passes.",
+                              num_passes);
+        return false;
+      }
+
+      // if (missing_msbs > 30) // p < 0
+      // {
+      //   if (insufficient_precision == false) 
+      //   {
+      //     insufficient_precision = true;
+      //     OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
+      //                           "codeblock. This message will not be "
+      //                           "displayed again.");
+      //   }
+      //   return false;
+      // }       
+      // else if (missing_msbs == 30) // p == 0
+      // { // not enough precision to decode and set the bin center to 1
+      //   if (modify_code == false) {
+      //     modify_code = true;
+      //     OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
+      //                           "pass. The code can be modified to support "
+      //                           "this case. This message will not be "
+      //                           "displayed again.");
+      //   }
+      //    return false;         // 32 bits are not enough to decode this
+      //  }
+      // else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
+      // {
+      //   if (num_passes > 1) {
+      //     num_passes = 1;
+      //     if (truncate_spp_mrp == false) {
+      //       truncate_spp_mrp = true;
+      //       OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
+      //                             "nor MagRef passes; both will be skipped. "
+      //                             "This message will not be displayed "
+      //                             "again.");
+      //     }
+      //   }
+      // }
+      ui32 p = 62 - missing_msbs; // The least significant bitplane for CUP
+      // There is a way to handle the case of p == 0, but a different path
+      // is required
+
+      if (lengths1 < 2)
+      {
+        OJPH_WARN(0x00010006, "Wrong codeblock length.");
+        return false;
+      }
+
+      // read scup and fix the bytes there
+      int lcup, scup;
+      lcup = (int)lengths1;  // length of CUP
+      //scup is the length of MEL + VLC
+      scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
+      if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
+        return false;
+
+      // The temporary storage scratch holds two types of data in an 
+      // interleaved fashion. The interleaving allows us to use one
+      // memory pointer.
+      // We have one entry for a decoded VLC code, and one entry for UVLC.
+      // Entries are 16 bits each, corresponding to one quad, 
+      // but since we want to use XMM registers of the SSE family 
+      // of SIMD; we allocated 16 bytes or more per quad row; that is,
+      // the width is no smaller than 16 bytes (or 8 entries), and the
+      // height is 512 quads
+      // Each VLC entry contains, in the following order, starting 
+      // from MSB
+      // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
+      // Each entry in UVLC contains u_q
+      // One extra row to handle the case of SPP propagating downwards
+      // when codeblock width is 4
+      ui16 scratch[8 * 513] = {0};       // 8 kB
+
+      // We need an extra two entries (one inf and one u_q) beyond
+      // the last column. 
+      // If the block width is 4 (2 quads), then we use sstr of 8 
+      // (enough for 4 quads). If width is 8 (4 quads) we use 
+      // sstr is 16 (enough for 8 quads). For a width of 16 (8 
+      // quads), we use 24 (enough for 12 quads).
+      ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
+
+      ui32 mmsbp2 = missing_msbs + 2;
+
+      // The cleanup pass is decoded in two steps; in step one,
+      // the VLC and MEL segments are decoded, generating a record that 
+      // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k.
+      // This information should be sufficient for the next step.
+      // In step 2, we decode the MagSgn segment.
+
+      // step 1 decoding VLC and MEL segments
+      {
+        // init structures
+        dec_mel_st mel;
+        mel_init(&mel, coded_data, lcup, scup);
+        rev_struct vlc;
+        rev_init8(&vlc, coded_data, lcup, scup);
+
+        int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
+                                     // data represented as runs of 0 events
+                                     // See mel_decode description
+
+        ui64 vlc_val;
+        ui32 c_q = 0;
+        ui16 *sp = scratch;
+        //initial quad row
+        for (ui32 x = 0; x < width; sp += 4)
+        {
+          // decode VLC
+          /////////////
+
+          // first quad
+          vlc_val = rev_fetch64(&vlc);
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0) //zero context
+          {
+            run -= 2; //subtract 2, since events number if multiplied by 2
+
+            // Is the run terminated in 1? if so, use decoded VLC code, 
+            // otherwise, discard decoded data, since we will decoded again 
+            // using a different context
+            t0 = (run == -1) ? t0 : 0;
+
+            // is run -1 or -2? this means a run has been consumed
+            if (run < 0) 
+              run = mel_get_run(&mel);  // get another run
+          }
+          //run -= (c_q == 0) ? 2 : 0;
+          //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel);  // get another run
+          sp[0] = t0;
+          x += 2;
+
+          // prepare context for the next quad; eqn. 1 in ITU T.814
+          c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
+
+          //remove data from vlc stream (0 bits are removed if vlc is not used)
+          vlc_val = rev_advance64(&vlc, t0 & 0x7);
+
+          //second quad
+          ui16 t1 = 0;
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)]; 
+
+          // if context is zero, use one MEL event
+          if (c_q == 0 && x < width) //zero context
+          {
+            run -= 2; //subtract 2, since events number if multiplied by 2
+
+            // if event is 0, discard decoded t1
+            t1 = (run == -1) ? t1 : 0;
+
+            if (run < 0) // have we consumed all events in a run
+              run = mel_get_run(&mel); // if yes, then get another run
+          }
+          t1 = x < width ? t1 : 0;
+          //run -= (c_q == 0 && x < width) ? 2 : 0;
+          //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel);  // get another run
+          sp[2] = t1;
+          x += 2;
+
+          //prepare context for the next quad, eqn. 1 in ITU T.814
+          c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
+
+          //remove data from vlc stream, if qinf is not used, cwdlen is 0
+          vlc_val = rev_advance64(&vlc, t1 & 0x7);
+          
+          // decode u
+          /////////////
+          // uvlc_mode is made up of u_offset bits from the quad pair
+          ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+          if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
+          {                     // the MEL run of events
+            run -= 2; //subtract 2, since events number if multiplied by 2
+
+            uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
+                                                 // is 0x40
+
+            if (run < 0)//if run is consumed (run is -1 or -2), get another run
+              run = mel_get_run(&mel);
+          }
+          //run -= (uvlc_mode == 0xc0) ? 2 : 0;
+          //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel);  // get another run
+
+          //decode uvlc_mode to get u for both quads
+          ui32 idx = uvlc_mode + (ui32)(vlc_val & 0x3F);
+          ui32 uvlc_entry = uvlc_tbl0[idx];
+          ui16 u_bias = uvlc_bias[idx];          
+          //remove total prefix length
+          vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7); 
+          uvlc_entry >>= 3; 
+          //extract suffixes for quad 0 and 1
+          ui32 len = uvlc_entry & 0xF;             // suffix length for 2 quads
+          ui32 tmp = (ui32)(vlc_val&((1<<len)-1)); // suffix value for 2 quads
+          vlc_val = rev_advance64(&vlc, len);
+          uvlc_entry >>= 4;
+          // quad 0 length
+          len = uvlc_entry & 0x7; // quad 0 suffix length
+          uvlc_entry >>= 3;
+          ui16 u_q0 = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
+          ui16 u_q1 = (ui16)((uvlc_entry >> 3) + (tmp >> len));
+
+          // decode u_q extensions, which is needed only when u_q > 32
+          ui16 u_ext; bool cond0, cond1;
+          cond0 = u_q0 - (u_bias & 0x3) > 32;
+          u_ext = (ui16)(cond0 ? (vlc_val & 0xF) : 0);
+          vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0);
+          u_q0 = (ui16)(u_q0 + (u_ext << 2));
+          sp[1] = (ui16)(u_q0 + 1); // kappa = 1
+          cond1 = u_q1 - (u_bias >> 2) > 32;
+          u_ext = (ui16)(cond1 ? (vlc_val & 0xF) : 0);
+          vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0);
+          u_q1 = (ui16)(u_q1 + (u_ext << 2));
+          sp[3] = (ui16)(u_q1 + 1); // kappa = 1
+        }
+        sp[0] = sp[1] = 0;
+
+        //non initial quad rows
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          c_q = 0;                                // context
+          ui16 *sp = scratch + (y >> 1) * sstr;   // this row of quads
+
+          for (ui32 x = 0; x < width; sp += 4)
+          {
+            // decode VLC
+            /////////////
+
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
+
+            // first quad
+            vlc_val = rev_fetch64(&vlc);
+
+            //decode VLC using the context c_q and the head of VLC bitstream
+            ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0) //zero context
+            {
+              run -= 2; //subtract 2, since events number is multiplied by 2
+
+              // Is the run terminated in 1? if so, use decoded VLC code, 
+              // otherwise, discard decoded data, since we will decoded again 
+              // using a different context
+              t0 = (run == -1) ? t0 : 0;
+
+              // is run -1 or -2? this means a run has been consumed
+              if (run < 0) 
+                run = mel_get_run(&mel);  // get another run
+            }
+            //run -= (c_q == 0) ? 2 : 0;
+            //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel);  // get another run
+            sp[0] = t0;
+            x += 2;
+
+            // prepare context for the next quad; eqn. 2 in ITU T.814
+            // sigma_q (w, sw)
+            c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[0 - (si32)sstr] & 0x80;
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
+
+            //remove data from vlc stream (0 bits are removed if vlc is unused)
+            vlc_val = rev_advance64(&vlc, t0 & 0x7);
+
+            //second quad
+            ui16 t1 = 0;
+
+            //decode VLC using the context c_q and the head of VLC bitstream
+            t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)]; 
+
+            // if context is zero, use one MEL event
+            if (c_q == 0 && x < width) //zero context
+            {
+              run -= 2; //subtract 2, since events number if multiplied by 2
+
+              // if event is 0, discard decoded t1
+              t1 = (run == -1) ? t1 : 0;
+
+              if (run < 0) // have we consumed all events in a run
+                run = mel_get_run(&mel); // if yes, then get another run
+            }
+            t1 = x < width ? t1 : 0;
+            //run -= (c_q == 0 && x < width) ? 2 : 0;
+            //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel);  // get another run
+            sp[2] = t1;
+            x += 2;
+
+            // partial c_q, will be completed when we process the next quad
+            // sigma_q (w, sw)
+            c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[2 - (si32)sstr] & 0x80;
+
+            //remove data from vlc stream, if qinf is not used, cwdlen is 0
+            vlc_val = rev_advance64(&vlc, t1 & 0x7);
+          
+            // decode u
+            /////////////
+            // uvlc_mode is made up of u_offset bits from the quad pair
+            ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+            ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
+            //remove total prefix length
+            vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7);
+            uvlc_entry >>= 3;
+            //extract suffixes for quad 0 and 1
+            ui32 len = uvlc_entry & 0xF;             //suffix length for 2 quads
+            ui32 tmp = (ui32)(vlc_val&((1<<len)-1)); //suffix value for 2 quads
+            vlc_val = rev_advance64(&vlc, len);
+            uvlc_entry >>= 4;
+            // quad 0 length
+            len = uvlc_entry & 0x7; // quad 0 suffix length
+            uvlc_entry >>= 3;
+            ui16 u_q0 = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
+            ui16 u_q1 = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
+
+            // decode u_q extensions, which is needed only when u_q > 32
+            ui16 u_ext; bool cond0, cond1;
+            cond0 = u_q0 > 32;
+            u_ext = (ui16)(cond0 ? (vlc_val & 0xF) : 0);
+            vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0);
+            u_q0 = (ui16)(u_q0 + (u_ext << 2));
+            sp[1] = u_q0;
+            cond1 = u_q1 > 32;
+            u_ext = (ui16)(cond1 ? (vlc_val & 0xF) : 0);
+            vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0);
+            u_q1 = (ui16)(u_q1 + (u_ext << 2));
+            sp[3] = u_q1;
+          }
+          sp[0] = sp[1] = 0;
+        }
+      }
+
+      // step2 we decode magsgn
+      {
+        // We allocate a scratch row for storing v_n values.
+        // We have 512 quads horizontally.
+        // We need an extra entry to handle the case of vp[1]
+        // when vp is at the last column.
+        // Here, we allocate 4 instead of 1 to make the buffer size
+        // a multipled of 16 bytes.
+        const int v_n_size = 512 + 4;
+        ui64 v_n_scratch[v_n_size] = {0};  // 4+ kB
+
+        frwd_struct64 magsgn;
+        frwd_init8<0xFF>(&magsgn, coded_data, lcup - scup);
+
+        const ui16 *sp = scratch;
+        ui64 *vp = v_n_scratch;
+        ui64 *dp = decoded_data;
+
+        ui64 prev_v_n = 0;
+        for (ui32 x = 0; x < width; sp += 2, ++vp)
+        {
+          ui32 inf = sp[0];
+          ui32 U_q = sp[1];
+          if (U_q > mmsbp2)
+            return false;
+
+          ui64 v_n;
+          ui64 val = 0;
+          ui32 bit = 0;
+          if (inf & (1 << (4 + bit)))
+          {
+            //get 32 bits of magsgn data
+            ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); 
+            ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+            frwd_advance(&magsgn, m_n);                 //consume m_n
+
+            val = ms_val << 63;                           // get sign bit
+            v_n = ms_val & ((1ULL << m_n) - 1);           // keep only m_n bits
+            v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
+            v_n |= 1;                                     // add center of bin    
+            //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+            //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+            val |= (v_n + 2) << (p - 1);
+          }
+          dp[0] = val;
+
+          v_n = 0;
+          val = 0;
+          bit = 1;
+          if (inf & (1 << (4 + bit)))
+          {
+            //get 32 bits of magsgn data
+            ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); 
+            ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+            frwd_advance(&magsgn, m_n);                 //consume m_n
+
+            val = ms_val << 63;                           // get sign bit
+            v_n = ms_val & ((1ULL << m_n) - 1);           // keep only m_n bits
+            v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
+            v_n |= 1;                                     // add center of bin    
+            //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+            //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+            val |= (v_n + 2) << (p - 1);
+          }
+          dp[stride] = val;
+          vp[0] = prev_v_n | v_n;
+          prev_v_n = 0;
+          ++dp;
+          if (++x >= width)
+          { ++vp; break; }
+
+          val = 0;
+          bit = 2;
+          if (inf & (1 << (4 + bit)))
+          {
+            //get 32 bits of magsgn data
+            ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); 
+            ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+            frwd_advance(&magsgn, m_n);                 //consume m_n
+
+            val = ms_val << 63;                           // get sign bit
+            v_n = ms_val & ((1ULL << m_n) - 1);           // keep only m_n bits
+            v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
+            v_n |= 1;                                     // add center of bin    
+            //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+            //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+            val |= (v_n + 2) << (p - 1);
+          }
+          dp[0] = val;
+
+          v_n = 0;
+          val = 0;
+          bit = 3;
+          if (inf & (1 << (4 + bit)))
+          {
+            //get 32 bits of magsgn data
+            ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); 
+            ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+            frwd_advance(&magsgn, m_n);                 //consume m_n
+
+            val = ms_val << 63;                           // get sign bit
+            v_n = ms_val & ((1ULL << m_n) - 1);           // keep only m_n bits
+            v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
+            v_n |= 1;                                     // add center of bin    
+            //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+            //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+            val |= (v_n + 2) << (p - 1);
+          }
+          dp[stride] = val;
+          prev_v_n = v_n;
+          ++dp;
+          ++x;
+        }
+        vp[0] = prev_v_n;
+
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          const ui16 *sp = scratch + (y >> 1) * sstr;
+          ui64 *vp = v_n_scratch;
+          ui64 *dp = decoded_data + y * stride;
+
+          prev_v_n = 0;
+          for (ui32 x = 0; x < width; sp += 2, ++vp)
+          {
+            ui32 inf = sp[0];
+            ui32 u_q = sp[1];
+
+            ui32 gamma = inf & 0xF0; gamma &= gamma - 0x10; //is gamma_q 1?
+            ui32 emax = 63 - count_leading_zeros(2 | vp[0] | vp[1]); // emax-1
+            ui32 kappa = gamma ? emax : 1;
+
+            ui32 U_q = u_q + kappa;
+            if (U_q > mmsbp2)
+              return false;
+
+            ui64 v_n;
+            ui64 val = 0;
+            ui32 bit = 0;
+            if (inf & (1 << (4 + bit)))
+            {
+              //get 32 bits of magsgn data
+              ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); 
+              ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+              frwd_advance(&magsgn, m_n);                 //consume m_n
+
+              val = ms_val << 63;                         // get sign bit
+              v_n = ms_val & ((1ULL << m_n) - 1);         // keep only m_n bits
+              v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB
+              v_n |= 1;                                   // add center of bin    
+              //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+              //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+              val |= (v_n + 2) << (p - 1);
+            }
+            dp[0] = val;
+
+            v_n = 0;
+            val = 0;
+            bit = 1;
+            if (inf & (1 << (4 + bit)))
+            {
+              //get 32 bits of magsgn data
+              ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); 
+              ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+              frwd_advance(&magsgn, m_n);                 //consume m_n
+
+              val = ms_val << 63;                         // get sign bit
+              v_n = ms_val & ((1ULL << m_n) - 1);         // keep only m_n bits
+              v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB
+              v_n |= 1;                                   // add center of bin    
+              //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+              //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+              val |= (v_n + 2) << (p - 1);
+            }
+            dp[stride] = val;
+            vp[0] = prev_v_n | v_n;
+            prev_v_n = 0;
+            ++dp;
+            if (++x >= width)
+            { ++vp; break; }
+
+            val = 0;
+            bit = 2;
+            if (inf & (1 << (4 + bit)))
+            {
+              //get 32 bits of magsgn data
+              ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); 
+              ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+              frwd_advance(&magsgn, m_n);                 //consume m_n
+
+              val = ms_val << 63;                         // get sign bit
+              v_n = ms_val & ((1ULL << m_n) - 1);         // keep only m_n bits
+              v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB
+              v_n |= 1;                                   // add center of bin    
+              //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+              //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+              val |= (v_n + 2) << (p - 1);
+            }
+            dp[0] = val;
+
+            v_n = 0;
+            val = 0;
+            bit = 3;
+            if (inf & (1 << (4 + bit)))
+            {
+              //get 32 bits of magsgn data
+              ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); 
+              ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+              frwd_advance(&magsgn, m_n);                 //consume m_n
+
+              val = ms_val << 63;                         // get sign bit
+              v_n = ms_val & ((1ULL << m_n) - 1);         // keep only m_n bits
+              v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB
+              v_n |= 1;                                   // add center of bin    
+              //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+              //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+              val |= (v_n + 2) << (p - 1);
+            }
+            dp[stride] = val;
+            prev_v_n = v_n;
+            ++dp;
+            ++x;
+          }
+          vp[0] = prev_v_n;
+        }
+      }
+
+      if (num_passes > 1)
+      {
+        // We use scratch again, we can divide it into multiple regions
+        // sigma holds all the significant samples, and it cannot
+        // be modified after it is set.  it will be used during the
+        // Magnitude Refinement Pass
+        ui16* const sigma = scratch;
+
+        ui32 mstr = (width + 3u) >> 2;   // divide by 4, since each
+                                         // ui16 contains 4 columns
+        mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
+
+        // We re-arrange quad significance, where each 4 consecutive
+        // bits represent one quad, into column significance, where,
+        // each 4 consequtive bits represent one column of 4 rows
+        {
+          ui32 y;
+          for (y = 0; y < height; y += 4)
+          {
+            ui16* sp = scratch + (y >> 1) * sstr;
+            ui16* dp = sigma + (y >> 2) * mstr;
+            for (ui32 x = 0; x < width; x += 4, sp += 4, ++dp) {
+              ui32 t0 = 0, t1 = 0;
+              t0  = ((sp[0     ] & 0x30u) >> 4)  | ((sp[0     ] & 0xC0u) >> 2);
+              t0 |= ((sp[2     ] & 0x30u) << 4)  | ((sp[2     ] & 0xC0u) << 6);
+              t1  = ((sp[0+sstr] & 0x30u) >> 2)  | ((sp[0+sstr] & 0xC0u)     );
+              t1 |= ((sp[2+sstr] & 0x30u) << 6)  | ((sp[2+sstr] & 0xC0u) << 8);
+              dp[0] = (ui16)(t0 | t1);
+            }
+            dp[0] = 0; // set an extra entry on the right with 0
+          }
+          {
+            // reset one row after the codeblock
+            ui16* dp = sigma + (y >> 2) * mstr;
+            for (ui32 x = 0; x < width; x += 4, ++dp)
+              dp[0] = 0;
+            dp[0] = 0; // set an extra entry on the right with 0
+          }
+        }
+
+        // We perform Significance Propagation Pass here
+        {
+          // This stores significance information of the previous
+          // 4 rows.  Significance information in this array includes
+          // all signicant samples in bitplane p - 1; that is,
+          // significant samples for bitplane p (discovered during the
+          // cleanup pass and stored in sigma) and samples that have recently
+          // became significant (during the SPP) in bitplane p-1.
+          // We store enough for the widest row, containing 1024 columns,
+          // which is equivalent to 256 of ui16, since each stores 4 columns.
+          // We add an extra 8 entries, just in case we need more
+          ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
+
+          frwd_struct64 sigprop;
+          frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
+
+          for (ui32 y = 0; y < height; y += 4)
+          {
+            ui32 pattern = 0xFFFFu; // a pattern needed samples
+            if (height - y < 4) {
+              pattern = 0x7777u;
+              if (height - y < 3) {
+                pattern = 0x3333u;
+                if (height - y < 2)
+                  pattern = 0x1111u;
+              }
+            }
+
+            // prev holds sign. info. for the previous quad, together
+            // with the rows on top of it and below it.
+            ui32 prev = 0;
+            ui16 *prev_sig = prev_row_sig;
+            ui16 *cur_sig = sigma + (y >> 2) * mstr;
+            ui64 *dpp = decoded_data + y * stride;
+            for (ui32 x = 0; x < width; x += 4, ++cur_sig, ++prev_sig)
+            {
+              // only rows and columns inside the stripe are included
+              si32 s = (si32)x + 4 - (si32)width;
+              s = ojph_max(s, 0);
+              pattern = pattern >> (s * 4);
+
+              // We first find locations that need to be tested (potential
+              // SPP members); these location will end up in mbr
+              // In each iteration, we produce 16 bits because cwd can have
+              // up to 16 bits of significance information, followed by the
+              // corresponding 16 bits of sign information; therefore, it is
+              // sufficient to fetch 32 bit data per loop.
+
+              // Althougth we are interested in 16 bits only, we load 32 bits.
+              // For the 16 bits we are producing, we need the next 4 bits --
+              // We need data for at least 5 columns out of 8.
+              // Therefore loading 32 bits is easier than loading 16 bits
+              // twice.
+              ui32 ps = *(ui32*)prev_sig;
+              ui32 ns = *(ui32*)(cur_sig + mstr);
+              ui32 u = (ps & 0x88888888) >> 3; // the row on top
+              if (!stripe_causal)
+                u |= (ns & 0x11111111) << 3;   // the row below
+
+              ui32 cs = *(ui32*)cur_sig;
+              // vertical integration
+              ui32 mbr =  cs;                // this sig. info.
+              mbr |= (cs & 0x77777777) << 1; //above neighbors
+              mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors
+              mbr |= u;
+              // horizontal integration
+              ui32 t = mbr;
+              mbr |= t << 4;      // neighbors on the left
+              mbr |= t >> 4;      // neighbors on the right
+              mbr |= prev >> 12;  // significance of previous group
+
+              // remove outside samples, and already significant samples
+              mbr &= pattern;
+              mbr &= ~cs;
+
+              // find samples that become significant during the SPP
+              ui32 new_sig = mbr;
+              if (new_sig)
+              {
+                ui64 cwd = frwd_fetch<0>(&sigprop);
+
+                ui32 cnt = 0;
+                ui32 col_mask = 0xFu;
+                ui32 inv_sig = ~cs & pattern;
+                for (int i = 0; i < 16; i += 4, col_mask <<= 4)
+                {
+                  if ((col_mask & new_sig) == 0)
+                    continue;
+
+                  //scan one column
+                  ui32 sample_mask = 0x1111u & col_mask;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0x33u << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+
+                  sample_mask <<= 1;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0x76u << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+
+                  sample_mask <<= 1;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0xECu << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+
+                  sample_mask <<= 1;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0xC8u << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+                }
+
+                if (new_sig)
+                {
+                  // new_sig has newly-discovered sig. samples during SPP
+                  // find the signs and update decoded_data
+                  ui64 *dp = dpp + x;
+                  ui64 val = 3u << (p - 2);
+                  col_mask = 0xFu;
+                  for (int i = 0; i < 4; ++i, ++dp, col_mask <<= 4)
+                  {
+                    if ((col_mask & new_sig) == 0)
+                      continue;
+
+                    //scan 4 signs
+                    ui32 sample_mask = 0x1111u & col_mask;
+                    if (new_sig & sample_mask)
+                    {
+                      assert(dp[0] == 0);
+                      dp[0] = (cwd << 63) | val;
+                      cwd >>= 1; ++cnt;
+                    }
+
+                    sample_mask += sample_mask;
+                    if (new_sig & sample_mask)
+                    {
+                      assert(dp[stride] == 0);
+                      dp[stride] = (cwd << 63) | val;
+                      cwd >>= 1; ++cnt;
+                    }
+
+                    sample_mask += sample_mask;
+                    if (new_sig & sample_mask)
+                    {
+                      assert(dp[2 * stride] == 0);
+                      dp[2 * stride] = (cwd << 63) | val;
+                      cwd >>= 1; ++cnt;
+                    }
+
+                    sample_mask += sample_mask;
+                    if (new_sig & sample_mask)
+                    {
+                      assert(dp[3 * stride] == 0);
+                      dp[3 * stride] = (cwd << 63) | val;
+                      cwd >>= 1; ++cnt;
+                    }
+                  }
+                }
+                frwd_advance(&sigprop, cnt);
+              }
+
+              new_sig |= cs;
+              *prev_sig = (ui16)(new_sig);
+
+              // vertical integration for the new sig. info.
+              t = new_sig;
+              new_sig |= (t & 0x7777) << 1; //above neighbors
+              new_sig |= (t & 0xEEEE) >> 1; //below neighbors
+              // add sig. info. from the row on top and below
+              prev = new_sig | u;
+              // we need only the bits in 0xF000
+              prev &= 0xF000;
+            }
+          }
+        }
+
+        // We perform Magnitude Refinement Pass here
+        if (num_passes > 2)
+        {
+          rev_struct magref;
+          rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
+
+          for (ui32 y = 0; y < height; y += 4)
+          {
+            ui32 *cur_sig = (ui32*)(sigma + (y >> 2) * mstr);
+            ui64 *dpp = decoded_data + y * stride;
+            ui64 half = 1ULL << (p - 2);
+            for (ui32 i = 0; i < width; i += 8)
+            {
+              //Process one entry from sigma array at a time
+              // Each nibble (4 bits) in the sigma array represents 4 rows,
+              // and the 32 bits contain 8 columns
+              ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
+              ui32 sig = *cur_sig++; // 32 bit that will be processed now
+              ui32 col_mask = 0xFu;  // a mask for a column in sig
+              if (sig) // if any of the 32 bits are set
+              {
+                for (int j = 0; j < 8; ++j) //one column at a time
+                {
+                  if (sig & col_mask) // lowest nibble
+                  {
+                    ui64 *dp = dpp + i + j; // next column in decoded samples
+                    ui32 sample_mask = 0x11111111u & col_mask; //LSB
+
+                    for (int k = 0; k < 4; ++k) {
+                      if (sig & sample_mask) //if LSB is set
+                      {
+                        assert(dp[0] != 0); // decoded value cannot be zero
+                        assert((dp[0] & half) == 0); // no half
+                        ui64 sym = cwd & 1;          // get it value
+                        sym = (1 - sym) << (p - 1); // previous center of bin
+                        sym |= half;            // put half the center of bin
+                        dp[0] ^= sym;    // remove old bin center and put new
+                        cwd >>= 1;       // consume word
+                      }
+                      sample_mask += sample_mask; //next row
+                      dp += stride; // next samples row
+                    }
+                  }
+                  col_mask <<= 4; //next column
+                }
+              }
+              // consume data according to the number of bits set
+              rev_advance_mrp(&magref, population_count(sig));
+            }
+          }
+        }
+      }
+      return true;
+    }
+  }
+}
\ No newline at end of file
diff --git a/src/core/coding/ojph_block_decoder_avx2.cpp b/src/core/coding/ojph_block_decoder_avx2.cpp
new file mode 100644
index 00000000..cccb4fd8
--- /dev/null
+++ b/src/core/coding/ojph_block_decoder_avx2.cpp
@@ -0,0 +1,2044 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2022, Aous Naman
+// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2022, The University of New South Wales, Australia
+// Copyright (c) 2024, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_block_decoder_avx2.cpp
+//***************************************************************************/
+
+//***************************************************************************/
+/** @file ojph_block_decoder_avx2.cpp
+ *  @brief implements a faster HTJ2K block decoder using avx2
+ */
+
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
+
+#include <string>
+#include <iostream>
+
+#include <cassert>
+#include <cstring>
+#include "ojph_block_common.h"
+#include "ojph_block_decoder.h"
+#include "ojph_message.h"
+
+#include <immintrin.h>
+
+namespace ojph {
+  namespace local {
+
+    //************************************************************************/
+    /** @brief MEL state structure for reading and decoding the MEL bitstream
+     *
+     *  A number of events is decoded from the MEL bitstream ahead of time
+     *  and stored in run/num_runs.
+     *  Each run represents the number of zero events before a one event.
+     */
+    struct dec_mel_st {
+      dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
+        k(0), num_runs(0), runs(0)
+      {}
+      // bitstream reading machinery
+      ui8* data;    //!<address of the next byte to read from the bitstream
+      ui64 tmp;     //!<unstuffed bits; the next bit to decode is the MSB
+      int bits;     //!<number of bits stored in tmp
+      int size;     //!<number of bytes left to read (starts at scup - 1)
+      bool unstuff; //!<true if last byte read was 0xFF (next has a stuffed bit)
+      int k;        //!<state of MEL decoder (index into the exponent table)
+
+      // queue of decoded runs
+      int num_runs; //!<number of decoded runs left in runs (maximum 8)
+      ui64 runs;    //!<decoded runs, 7 bits each; the next run is in the LSBs
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs the MEL bitstream
+     *
+     *  This design needs more bytes in the codeblock buffer than the length
+     *  of the cleanup pass by up to 2 bytes.
+     *
+     *  Unstuffing removes the MSB of the byte following a byte whose
+     *  value is 0xFF; this prevents sequences larger than 0xFF7F in value
+     *  from appearing in the bitstream.
+     *
+     *  @param [in]  melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    void mel_read(dec_mel_st *melp)
+    {
+      if (melp->bits > 32)  //there are enough bits in the tmp variable
+        return;             // return without reading new data
+
+      ui32 val = 0xFFFFFFFF;       // feed in 0xFF if buffer is exhausted
+      if (melp->size > 4) {        // more than 4 bytes are left to read
+        val = *(ui32*)melp->data;  // read 32 bits from MEL data
+        melp->data += 4;           // advance pointer
+        melp->size -= 4;           // reduce counter
+      }
+      else if (melp->size > 0)
+      { // 1 to 4 bytes left; val is filled from its low byte upward
+        int i = 0;
+        while (melp->size > 1) {
+          ui32 v = *melp->data++;    // read one byte at a time
+          ui32 m = ~(0xFFu << i);    // mask of location
+          val = (val & m) | (v << i);// put one byte in its correct location
+          --melp->size;
+          i += 8;
+        }
+        // size equal to 1
+        ui32 v = *melp->data++;    // the last byte counted in size differs:
+        v |= 0xF;                  // MEL and VLC segments can overlap
+        ui32 m = ~(0xFFu << i);    // mask of location, as in the loop above
+        val = (val & m) | (v << i);
+        --melp->size;
+      }
+
+      // next we unstuff them before adding them to the buffer
+      int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
+                                     // the previously read byte requires
+                                     // unstuffing
+
+      // data is unstuffed and accumulated in t
+      // bits has the number of bits in t
+      ui32 t = val & 0xFF;    // first byte is the lowest byte of val
+      bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
+      bits -= unstuff; // there is one less bit in t if unstuffing is needed
+      t = t << (8 - unstuff); // move up to make room for the next byte
+
+      //this is a repeat of the above, for the second byte
+      t |= (val>>8) & 0xFF;
+      unstuff = (((val >> 8) & 0xFF) == 0xFF);
+      bits -= unstuff;
+      t = t << (8 - unstuff);
+
+      t |= (val>>16) & 0xFF;  // third byte
+      unstuff = (((val >> 16) & 0xFF) == 0xFF);
+      bits -= unstuff;
+      t = t << (8 - unstuff);
+
+      t |= (val>>24) & 0xFF;  // fourth byte; its 0xFF test is kept for later
+      melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
+
+      // move t to tmp, and push the result all the way up, so we read from
+      // the MSB
+      melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
+      melp->bits += bits; //increment the number of bits in tmp
+    }
+
+    //************************************************************************/
+    /** @brief Decodes unstuffed MEL segment bits stored in tmp to runs
+     *
+     *  Runs are stored in "runs" and the number of runs in "num_runs".
+     *  Each run represents a number of zero events that may or may not
+     *  terminate in a 1 event.
+     *  Each run is stored in 7 bits.  The LSB is 1 if the run terminates in
+     *  a 1 event, 0 otherwise.  The next 6 bits, for the case terminating
+     *  with 1, contain the number of consecutive 0 zero events * 2; for the
+     *  case terminating with 0, they store (number of consecutive 0 zero
+     *  events - 1) * 2.
+     *  A total of 6 bits (made up of 1 + 5) should have been enough.
+     *
+     *  @param [in]  melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    void mel_decode(dec_mel_st *melp)
+    {
+      static const int mel_exp[13] = { //MEL exponents, indexed by state k
+        0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
+      };
+
+      if (melp->bits < 6) // if there are fewer than 6 bits in tmp
+        mel_read(melp);   // then read from the MEL bitstream
+                          // 6 bits is the largest decodable MEL cwd
+
+      //repeat as long as there are enough decodable bits in tmp,
+      // and the runs store is not full (num_runs < 8)
+      while (melp->bits >= 6 && melp->num_runs < 8)
+      {
+        int eval = mel_exp[melp->k]; // number of bits associated with state
+        int run = 0;
+        if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB)
+        { //one is found
+          run = 1 << eval;
+          run--; // (2^eval consecutive 0 events) - 1
+          melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
+          melp->tmp <<= 1; // consume one bit from tmp
+          melp->bits -= 1;
+          run = run << 1; // LSB 0: a stretch of zeros not terminating in one
+        }
+        else
+        { //0 is found; the eval bits that follow it hold the run length
+          run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
+          melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
+          melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
+          melp->bits -= eval + 1;
+          run = (run << 1) + 1; // LSB 1: a stretch of zeros ending with one
+        }
+        eval = melp->num_runs * 7;           // eval reused: slot bit offset
+        melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
+        melp->runs |= ((ui64)run) << eval;   // store the value in runs
+        melp->num_runs++;                    // increment count
+      }
+    }
+
+    //************************************************************************/
+    /** @brief Initiates a dec_mel_st structure for MEL decoding and reads
+     *         some bytes in order to get the read address to a multiple
+     *         of 4
+     *
+     *  @param [in]  melp is a pointer to dec_mel_st structure
+     *  @param [in]  bbuf is a pointer to byte buffer
+     *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in]  scup is the length of MEL+VLC segments
+     */
+    static inline
+    void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
+    {
+      melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
+      melp->bits = 0;                  // 0 bits in tmp
+      melp->tmp = 0;                   // tmp starts empty
+      melp->unstuff = false;           // no unstuffing
+      melp->size = scup - 1;           // size is the length of MEL+VLC-1
+      melp->k = 0;                     // 0 for state
+      melp->num_runs = 0;              // num_runs is 0
+      melp->runs = 0;                  // no runs are queued
+
+      //This code is borrowed; original is for a different architecture
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the MEL segment
+      int num = 4 - (int)(intptr_t(melp->data) & 0x3);
+      for (int i = 0; i < num; ++i) { // this code is similar to mel_read
+        assert(melp->unstuff == false || melp->data[0] <= 0x8F);
+        ui64 d = (melp->size > 0) ? *melp->data : 0xFF;//if buffer is consumed
+                                                       //set data to 0xFF
+        if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
+                                       // see the standard
+        melp->data += melp->size-- > 0; //increment if the end is not reached
+        int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
+        melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
+        melp->bits += d_bits;  //increment bits by the number of bits added
+        melp->unstuff = ((d & 0xFF) == 0xFF); //true if next byte needs
+                                              //unstuffing
+      }
+      melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
+                                       // is the MSB
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves one run from dec_mel_st; if there are no runs stored
+     *         MEL segment is decoded
+     *
+     * @param [in]  melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    int mel_get_run(dec_mel_st *melp)
+    {
+      // refill the queue from the MEL bitstream once it has been drained
+      if (!melp->num_runs) { mel_decode(melp); }
+      // the next run sits in the 7 least-significant bits of the queue
+      const int run = melp->runs & 0x7F;
+      --melp->num_runs;   // one fewer run is queued
+      melp->runs >>= 7;   // drop the run that is being handed out
+      return run;
+    }
+
+    //************************************************************************/
+    /** @brief A structure for reading and unstuffing a segment that grows
+     *         backward, such as VLC and MRP
+     */
+    struct rev_struct {
+      rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
+      {}
+      //storage
+      ui8* data;     //!<pointer to the next byte to read (moves backward)
+      ui64 tmp;      //!<buffer of read data; the head is at the LSB
+      ui32 bits;     //!<number of bits stored in tmp
+      int size;      //!<number of bytes left
+      bool unstuff;  //!<true if the last byte is more than 0x8F
+                     //!<then the current byte is unstuffed if it is 0x7F
+    };
+
+    //************************************************************************/
+    /** @brief Read and unstuff data from a backwardly-growing segment
+     *
+     *  This reader can read up to 8 bytes from before the VLC segment.
+     *  Care must be taken not to read from unreadable memory, which would
+     *  cause a segmentation fault.
+     *
+     *  Note that there is another subroutine rev_read_mrp that is slightly
+     *  different.  The other one fills zeros when the buffer is exhausted.
+     *  This one basically does not care if the bytes are consumed, because
+     *  any extra data should not be used in the actual decoding.
+     *
+     *  Unstuffing is needed to prevent sequences more than 0xFF8F from
+     *  appearing in the bits stream; since we are reading backward, we keep
+     *  watch when a value larger than 0x8F appears in the bitstream.
+     *  If the byte following this is 0x7F, we unstuff this byte (ignore the
+     *  MSB of that byte, which should be 0).
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read(rev_struct *vlcp)
+    {
+      //process 4 bytes at a time
+      if (vlcp->bits > 32)  // if there are more than 32 bits in tmp, then
+        return;             // reading 32 bits can overflow vlcp->tmp
+      ui32 val = 0;
+      //the next line (the if statement) needs to be tested first
+      if (vlcp->size > 3)  // if there are more than 3 bytes left in VLC
+      {
+        // (vlcp->data - 3) move pointer back to read 32 bits at once
+        val = *(ui32*)(vlcp->data - 3); // then read 32 bits
+        vlcp->data -= 4;          // move data pointer back by 4
+        vlcp->size -= 4;          // reduce available byte by 4
+      }
+      else if (vlcp->size > 0)
+      { // 1 to 3 bytes left; they fill val from its top byte downward
+        int i = 24;
+        while (vlcp->size > 0) {
+          ui32 v = *vlcp->data--; // read one byte at a time
+          val |= (v << i);        // put byte in its correct location
+          --vlcp->size;
+          i -= 8;
+        }
+      }
+      // one byte per 32-bit lane; lane 0 = top byte of val (handled first)
+      __m128i tmp_vec = _mm_set1_epi32((int32_t)val);
+      tmp_vec = _mm_srlv_epi32(tmp_vec, _mm_setr_epi32(24, 16, 8, 0));
+      tmp_vec = _mm_and_si128(tmp_vec, _mm_set1_epi32(0xff));
+      // lane i: was the byte before lane i > 0x8F? lane 0 uses the carried flag
+      __m128i unstuff_vec = _mm_cmpgt_epi32(tmp_vec, _mm_set1_epi32(0x8F));
+      bool unstuff_next = _mm_extract_epi32(unstuff_vec, 3);
+      unstuff_vec = _mm_slli_si128(unstuff_vec, 4);
+      unstuff_vec = _mm_insert_epi32(unstuff_vec, vlcp->unstuff * 0xffffffff, 0);
+      // unstuff only where the byte's low 7 bits are also 0x7F; gives 0 or 1
+      __m128i val_7f = _mm_set1_epi32(0x7F);
+      __m128i this_byte_7f = _mm_cmpeq_epi32(_mm_and_si128(tmp_vec, val_7f), val_7f);
+      unstuff_vec = _mm_and_si128(unstuff_vec, this_byte_7f);
+      unstuff_vec = _mm_srli_epi32(unstuff_vec, 31);
+      // bits contributed per byte (8 or 7), then an inclusive running sum
+      __m128i inc_sum = _mm_sub_epi32(_mm_set1_epi32(8), unstuff_vec);
+      inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 4));
+      inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 8));
+      ui32 total_bits = (ui32)_mm_extract_epi32(inc_sum, 3);
+      // shift every byte by the bit count of the lanes before it, then merge
+      __m128i final_shift = _mm_slli_si128(inc_sum, 4);
+      tmp_vec = _mm_sllv_epi32(tmp_vec, final_shift);
+      tmp_vec = _mm_or_si128(tmp_vec, _mm_bsrli_si128(tmp_vec, 8));
+
+      ui64 tmp = (ui32)_mm_cvtsi128_si32(tmp_vec) | (ui32)_mm_extract_epi32(tmp_vec, 1);
+
+      vlcp->unstuff = unstuff_next;   // carried into the next read
+      vlcp->tmp |= tmp << vlcp->bits; // append above the bits already stored
+      vlcp->bits += total_bits;
+    }
+
+    //************************************************************************/
+    /** @brief Initiates the rev_struct structure and reads a few bytes to
+     *         move the read address to multiple of 4
+     *
+     *  There is another similar rev_init_mrp subroutine.  The difference is
+     *  that this one, rev_init, discards the first 12 bits (they have the
+     *  sum of the lengths of VLC and MEL segments), and first unstuff depends
+     *  on first 4 bits.
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     *  @param [in]  data is a pointer to byte at the start of the cleanup pass
+     *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in]  scup is the length of MEL+VLC segments
+     */
+    static inline
+    void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup)
+    {
+      //first byte has only the upper 4 bits
+      vlcp->data = data + lcup - 2;
+
+      //size can not be larger than this, in fact it should be smaller
+      vlcp->size = scup - 2;
+
+      ui32 d = *vlcp->data--; // read one byte (this is a half byte)
+      vlcp->tmp = d >> 4;    // both initialize and set
+      vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard
+      vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte
+
+      //This code is designed for an architecture that read address should
+      // align to the read size (address multiple of 4 if read size is 4)
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream.
+      // To read 32 bits, read from (vlcp->data - 3)
+      int num = 1 + (int)(intptr_t(vlcp->data) & 0x3);
+      int tnum = num < vlcp->size ? num : vlcp->size; // never exceed size
+      for (int i = 0; i < tnum; ++i) {
+        ui64 d;
+        d = *vlcp->data--;  // read one byte and move read pointer
+        //check if the last byte was >0x8F (unstuff == true) and this is 0x7F
+        ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
+        vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
+        vlcp->bits += d_bits;
+        vlcp->unstuff = d > 0x8F; // for next byte
+      }
+      vlcp->size -= tnum;
+      rev_read(vlcp);  // read another 32 bits
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves 32 bits from the head of a rev_struct structure
+     *
+     *  By the end of this call, vlcp->tmp must have no less than 33 bits
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     */
+    static inline
+    ui32 rev_fetch(rev_struct *vlcp)
+    {
+      // Top up the buffer until it holds 32 bits.  One read normally does
+      // it, but unstuffing can make a read deliver fewer than 32 bits, so a
+      // second read is permitted; never more than two.
+      for (int reads = 0; reads < 2 && vlcp->bits < 32; ++reads)
+        rev_read(vlcp);
+
+      return static_cast<ui32>(vlcp->tmp); // lowest 32 bits are the head
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct structure
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     *  @param [in]  num_bits is the number of bits to be removed
+     */
+    static inline
+    ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
+    {
+      assert(vlcp->bits >= num_bits);   // cannot consume more than is stored
+      vlcp->bits = vlcp->bits - num_bits;  // fewer bits remain buffered
+      vlcp->tmp = vlcp->tmp >> num_bits;   // shift the consumed bits out
+      return static_cast<ui32>(vlcp->tmp); // new head of the buffer
+    }
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs from rev_struct
+     *
+     *  This differs from rev_read in that it fills in zeros once the
+     *  available data is consumed.  The other does not care about the
+     *  values when all data is consumed.
+     *
+     *  See rev_read for more information about unstuffing
+     *
+     *  @param [in]  mrp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read_mrp(rev_struct *mrp)
+    {
+      //process 4 bytes at a time
+      if (mrp->bits > 32) // more than 32 bits are stored already, so
+        return;           // adding up to 32 more could overflow mrp->tmp
+      ui32 val = 0;       // stays 0 (zero fill) when no bytes are left
+      if (mrp->size > 3) // if there are more than 3 bytes left
+      { // (mrp->data - 3) move pointer back to read 32 bits at once
+        val = *(ui32*)(mrp->data - 3); // read 32 bits
+        mrp->data -= 4;                // move back pointer
+        mrp->size -= 4;                // reduce count
+      }
+      else if (mrp->size > 0)
+      { // 1 to 3 bytes left; they fill val from its top byte downward
+        int i = 24;
+        while (mrp->size > 0) {
+          ui32 v = *mrp->data--; // read one byte at a time
+          val |= (v << i);       // put byte in its correct location
+          --mrp->size;
+          i -= 8;
+        }
+      }
+
+      //accumulate in tmp, and keep count in bits
+      ui32 bits, tmp = val >> 24;
+
+      //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
+      bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
+      bool unstuff = (val >> 24) > 0x8F;
+
+      //process the next byte
+      tmp |= ((val >> 16) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 16) & 0xFF) > 0x8F;
+      //third byte, same steps
+      tmp |= ((val >> 8) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 8) & 0xFF) > 0x8F;
+      //fourth (lowest) byte, same steps
+      tmp |= (val & 0xFF) << bits;
+      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = (val & 0xFF) > 0x8F;
+
+      mrp->tmp |= (ui64)tmp << mrp->bits; // append above bits already stored
+      mrp->bits += bits;
+      mrp->unstuff = unstuff;             // carried over to the next byte
+    }
+
+    //************************************************************************/
+    /** @brief Initializes a rev_struct structure for the MRP segment, and
+     *         reads a number of bytes such that the next 32 bits read are
+     *         from an address that is a multiple of 4. Note this is designed
+     *         for an architecture in which the read size must be compatible
+     *         with the alignment of the read address
+     *
+     *  There is another similar subroutine rev_init.  This subroutine does
+     *  NOT skip the first 12 bits, and starts with unstuff set to true.
+     *
+     *  @param [in]  mrp is a pointer to rev_struct structure
+     *  @param [in]  data is a pointer to byte at the start of the cleanup pass
+     *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in]  len2 is the length of SPP+MRP segments
+     */
+    static inline
+    void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
+    {
+      mrp->data = data + lcup + len2 - 1; // last byte of the len2 bytes
+      mrp->size = len2;                   // bytes available for reading
+      mrp->unstuff = true;                // starts with unstuff set to true
+      mrp->bits = 0;                      // nothing stored in tmp yet
+      mrp->tmp = 0;
+
+      //This code is designed for an architecture that read address should
+      // align to the read size (address multiple of 4 if read size is 4)
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the MRP stream
+      int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
+      for (int i = 0; i < num; ++i) {
+        ui64 d;
+        //read a byte, 0 if no more data
+        d = (mrp->size-- > 0) ? *mrp->data-- : 0;
+        //check if unstuffing is needed
+        ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
+        mrp->tmp |= d << mrp->bits; // move data to mrp->tmp
+        mrp->bits += d_bits;
+        mrp->unstuff = d > 0x8F; // for next byte
+      }
+      rev_read_mrp(mrp); // then read up to 32 more bits
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves 32 bits from the head of a rev_struct structure
+     *
+     *  By the end of this call, mrp->tmp must have no less than 33 bits
+     *
+     *  @param [in]  mrp is a pointer to rev_struct structure
+     */
+    static inline
+    ui32 rev_fetch_mrp(rev_struct *mrp)
+    {
+      // At most two refills: the first may deliver fewer than 32 bits
+      // because of unstuffing, in which case one more read follows.
+      int refills_left = 2;
+      while (mrp->bits < 32 && refills_left-- > 0)
+        rev_read_mrp(mrp);
+
+      return static_cast<ui32>(mrp->tmp); // low 32 bits form the head
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct structure
+     *
+     *  @param [in]  mrp is a pointer to rev_struct structure
+     *  @param [in]  num_bits is the number of bits to be removed
+     */
+    static inline ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
+    { // static, like its siblings: keeps this file's copy out of other TUs
+      assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
+      mrp->tmp >>= num_bits;  // discard the lowest num_bits bits
+      mrp->bits -= num_bits;  // that many fewer bits remain stored
+      return (ui32)mrp->tmp;  // return data after consumption
+    }
+
+    //************************************************************************/
+    /** @brief State structure for reading and unstuffing of forward-growing
+     *         bitstreams; these are: MagSgn and SPP bitstreams
+     */
+    struct frwd_struct_avx2 {
+      const ui8* data;  //!<pointer to the next unread byte of the bitstream
+      ui8 tmp[48];      //!<temporary buffer of read data + 16 extra
+      ui32 bits;        //!<number of bits stored in tmp
+      ui32 unstuff;     //!<1 if a bit needs to be unstuffed from next byte
+      int size;         //!<number of bytes of data left to read
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs 16 bytes from forward-growing bitstream
+     *
+     *  A template is used to accommodate a different requirement for
+     *  MagSgn and SPP bitstreams; in particular, when MagSgn bitstream is
+     *  consumed, 0xFF's are fed, while when SPP is exhausted 0's are fed in.
+     *  X controls this value.
+     *
+     *  Unstuffing prevents sequences that are more than 0xFF7F from appearing
+     *  in the compressed sequence.  So whenever a value of 0xFF is coded, the
+     *  MSB of the next byte is set 0 and must be ignored during decoding.
+     *
+     *  Reading can go beyond the end of buffer by up to 16 bytes, since a
+     *  full 16-byte load is always issued; bytes past the end are set to X.
+     *  @tparam       X is the value fed in when the bitstream is exhausted;
+     *                only 0xFF and 0 are handled, other values hit assert(0)
+     *  @param  [in]  msp is a pointer to frwd_struct_avx2 structure
+     */
+    template<int X>
+    static inline
+    void frwd_read(frwd_struct_avx2 *msp)
+    {
+      assert(msp->bits <= 128); // at most 128 bits may already be buffered
+
+      __m128i offset, val, validity, all_xff;
+      val = _mm_loadu_si128((__m128i*)msp->data); // unconditional 16-byte load
+      int bytes = msp->size >= 16 ? 16 : msp->size; // bytes really available
+      validity = _mm_set1_epi8((char)bytes);
+      msp->data += bytes;
+      msp->size -= bytes;
+      int bits = 128; // bits delivered; one less for every unstuffed bit
+      offset = _mm_set_epi64x(0x0F0E0D0C0B0A0908,0x0706050403020100);
+      validity = _mm_cmpgt_epi8(validity, offset); // 0xFF where index < bytes
+      all_xff = _mm_set1_epi8(-1);
+      if (X == 0xFF) // the compiler should remove this if statement
+      {
+        __m128i t = _mm_xor_si128(validity, all_xff); // complement
+        val = _mm_or_si128(t, val); // fill with 0xFF
+      }
+      else if (X == 0)
+        val = _mm_and_si128(validity, val); // fill with zeros
+      else
+        assert(0);
+
+      __m128i ff_bytes;
+      ff_bytes = _mm_cmpeq_epi8(val, all_xff);
+      ff_bytes = _mm_and_si128(ff_bytes, validity); // ignore fill bytes
+      ui32 flags = (ui32)_mm_movemask_epi8(ff_bytes); // bit i: byte i is 0xFF
+      flags <<= 1; // unstuff following byte
+      ui32 next_unstuff = flags >> 16; // set when byte 15 is a valid 0xFF
+      flags |= msp->unstuff; // byte 0 follows an 0xFF from the previous read
+      flags &= 0xFFFF;
+      while (flags)
+      { // bit unstuffing occurs on average once every 256 bytes
+        // therefore it is not an issue if it is a bit slow
+        // here we process 16 bytes
+        --bits; // consuming one stuffing bit
+
+        ui32 loc = 31 - count_leading_zeros(flags); // highest flagged byte
+        flags ^= 1 << loc;
+
+        __m128i m, t, c;
+        t = _mm_set1_epi8((char)loc);
+        m = _mm_cmpgt_epi8(offset, t); // selects bytes with index above loc
+
+        t = _mm_and_si128(m, val);  // keep bits at locations larger than loc
+        c = _mm_srli_epi64(t, 1);   // each 64-bit lane shifted down one bit
+        t = _mm_srli_si128(t, 8);   // upper lane moved into the lower lane
+        t = _mm_slli_epi64(t, 63);  // its LSB becomes the lower lane's MSB
+        t = _mm_or_si128(t, c);     // combine the above 3 steps
+
+        val = _mm_or_si128(t, _mm_andnot_si128(m, val));
+      }
+
+      // combine with earlier data (bits is unsigned: only <= 128 can fail)
+      assert(msp->bits >= 0 && msp->bits <= 128);
+      int cur_bytes = msp->bits >> 3;
+      int cur_bits = msp->bits & 7;
+      __m128i b1, b2;
+      b1 = _mm_sll_epi64(val, _mm_set1_epi64x(cur_bits));
+      b2 = _mm_slli_si128(val, 8);  // lower lane moved into the upper lane
+      b2 = _mm_srl_epi64(b2, _mm_set1_epi64x(64-cur_bits));
+      b1 = _mm_or_si128(b1, b2);
+      b2 = _mm_loadu_si128((__m128i*)(msp->tmp + cur_bytes));
+      b2 = _mm_or_si128(b1, b2);
+      _mm_storeu_si128((__m128i*)(msp->tmp + cur_bytes), b2);
+
+      int consumed_bits = bits < 128 - cur_bits ? bits : 128 - cur_bits;
+      cur_bytes = (msp->bits + (ui32)consumed_bits + 7) >> 3; // round up
+      int upper = _mm_extract_epi16(val, 7);
+      upper >>= consumed_bits - 128 + 16;
+      msp->tmp[cur_bytes] = (ui8)upper; // copy byte
+
+      msp->bits += (ui32)bits;
+      msp->unstuff = next_unstuff;   // next unstuff
+      assert(msp->unstuff == 0 || msp->unstuff == 1);
+    }
+
+    //************************************************************************/
+    /** @brief Resets a frwd_struct_avx2 and primes it with a first read
+     *
+     *  @tparam      X is the fill value used once the bitstream runs out;
+     *               frwd_read describes the accepted values
+     *  @param [in]  msp is a pointer to the frwd_struct_avx2 to set up
+     *  @param [in]  data is a pointer to the first byte of the bitstream
+     *  @param [in]  size is the number of bytes in the bitstream
+     */
+    template<int X>
+    static inline
+    void frwd_init(frwd_struct_avx2 *msp, const ui8* data, int size)
+    {
+      msp->data = data;
+      msp->size = size;
+      msp->bits = 0;
+      msp->unstuff = 0;
+
+      // clear all 48 bytes of tmp, 16 bytes at a time
+      __m128i *slot = (__m128i *)msp->tmp;
+      for (int i = 0; i < 3; ++i)
+        _mm_storeu_si128(slot + i, _mm_setzero_si128());
+      frwd_read<X>(msp); // load the first batch of bits into tmp
+    }
+
+    //************************************************************************/
+    /** @brief Consume num_bits bits from the bitstream of frwd_struct_avx2
+     *         by shifting msp->tmp down; needs 0 < num_bits < 128 (asserted)
+     *  @param [in]  msp is a pointer to frwd_struct_avx2
+     *  @param [in]  num_bits is the number of bits to consume
+     */
+    static inline
+    void frwd_advance(frwd_struct_avx2 *msp, ui32 num_bits)
+    {
+      assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
+      msp->bits -= num_bits;
+      // whole 64-bit words are skipped by addressing: the offset is 0 or 8
+      __m128i *p = (__m128i*)(msp->tmp + ((num_bits >> 3) & 0x18));
+      num_bits &= 63; // what remains is a shift of 0..63 bits
+
+      __m128i v0, v1, c0, c1, t;
+      v0 = _mm_loadu_si128(p);
+      v1 = _mm_loadu_si128(p + 1);
+
+      // shift right by num_bits; c0 receives the low 128 bits of the result
+      c0 = _mm_srl_epi64(v0, _mm_set1_epi64x(num_bits));
+      t = _mm_srli_si128(v0, 8); // upper lane of v0 into the lower lane
+      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
+      c0 = _mm_or_si128(c0, t);
+      t = _mm_slli_si128(v1, 8); // lower lane of v1 into the upper lane
+      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
+      c0 = _mm_or_si128(c0, t);
+
+      _mm_storeu_si128((__m128i*)msp->tmp, c0);
+      // c1 receives the high 128 bits; zeros are shifted in at the top
+      c1 = _mm_srl_epi64(v1, _mm_set1_epi64x(num_bits));
+      t = _mm_srli_si128(v1, 8);
+      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
+      c1 = _mm_or_si128(c1, t);
+
+      _mm_storeu_si128((__m128i*)msp->tmp + 1, c1);
+    }
+
+    //************************************************************************/
+    /** @brief Returns the 128 bits at the head of a frwd_struct_avx2 buffer,
+     *         refilling the buffer first when it holds 128 bits or fewer
+     *
+     *  @tparam      X is the fill value used once the bitstream runs out;
+     *               frwd_read describes the accepted values
+     *  @param [in]  msp is a pointer to frwd_struct_avx2
+     *  @return      the first 16 bytes of msp->tmp
+     */
+    template<int X>
+    static inline
+    __m128i frwd_fetch(frwd_struct_avx2 *msp)
+    {
+      // up to two refills; the second only runs when the first one still
+      // left 128 bits or fewer (this path carries a "need to test" note)
+      for (int refill = 0; refill < 2 && msp->bits <= 128; ++refill)
+        frwd_read<X>(msp);
+
+      return _mm_loadu_si128((__m128i*)msp->tmp);
+    }
+
+    //************************************************************************/
+    /** @brief Decodes two consecutive quads (one octet), using 32 bit data
+     *
+     *  @param inf_u_q  decoded VLC code, with interleaved u values
+     *  @param U_q      U values
+     *  @param magsgn   structure for forward data buffer; bits are consumed
+     *  @param p        bitplane at which we are decoding
+     *  @param vn       in/out: v_n values for E handling are OR-ed into it
+     *  @return __m256i decoded two quads; zero if all samples insignificant
+     */
+    static inline __m256i decode_two_quad32_avx2(__m256i inf_u_q, __m256i U_q, frwd_struct_avx2* magsgn, ui32 p, __m128i& vn) {
+        __m256i row = _mm256_setzero_si256();
+
+        // we keep e_k, e_1, and rho in flags
+        __m256i flags = _mm256_and_si256(inf_u_q, _mm256_set_epi32(0x8880, 0x4440, 0x2220, 0x1110, 0x8880, 0x4440, 0x2220, 0x1110));
+        __m256i insig = _mm256_cmpeq_epi32(flags, _mm256_setzero_si256());
+        // insig lanes are all ones where a sample has none of its flag bits
+        if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF) //skip when all are insignificant
+        {
+            flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 1, 2, 2, 4, 4, 8, 8, 1, 1, 2, 2, 4, 4, 8, 8));
+
+            // U_q holds U_q for this quad
+            // flags has e_k, e_1, and rho such that e_k is sitting in the
+            // 0x8000, e_1 in 0x800, and rho in 0x80
+
+            // next e_k and m_n
+            __m256i m_n;
+            __m256i w0 = _mm256_srli_epi32(flags, 15); // e_k
+            m_n = _mm256_sub_epi32(U_q, w0);
+            m_n = _mm256_andnot_si256(insig, m_n);
+
+            // find cumulative sums
+            // to find at which bit in ms_vec the sample starts
+            __m256i inc_sum = m_n; // inclusive scan
+            inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
+            inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
+            int total_mn1 = _mm256_extract_epi16(inc_sum, 6);
+            int total_mn2 = _mm256_extract_epi16(inc_sum, 14);
+            // totals: low 16 bits of the last 32-bit lane of each 128-bit half
+            __m128i ms_vec0 = _mm_setzero_si128();
+            __m128i ms_vec1 = _mm_setzero_si128();
+            if (total_mn1) {
+                ms_vec0 = frwd_fetch<0xFF>(magsgn);
+                frwd_advance(magsgn, (ui32)total_mn1);
+            }
+            if (total_mn2) {
+                ms_vec1 = frwd_fetch<0xFF>(magsgn);
+                frwd_advance(magsgn, (ui32)total_mn2);
+            }
+
+            __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
+
+            __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 4); // exclusive scan
+
+            // find the starting byte and starting bit
+            __m256i byte_idx = _mm256_srli_epi32(ex_sum, 3);
+            __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi32(7));
+            byte_idx = _mm256_shuffle_epi8(byte_idx,
+                _mm256_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000, 0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000));
+            byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x03020100));
+            __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
+            byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x01010101));
+            __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
+
+            // shift samples values to correct location
+            bit_idx = _mm256_or_si256(bit_idx, _mm256_slli_epi32(bit_idx, 16));
+
+            __m128i a = _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1);
+            __m256i aa = _mm256_inserti128_si256(_mm256_castsi128_si256(a), a, 0x1);
+
+            __m256i bit_shift = _mm256_shuffle_epi8(aa, bit_idx);
+            bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
+            d0 = _mm256_mullo_epi16(d0, bit_shift);
+            d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB
+            d1 = _mm256_mullo_epi16(d1, bit_shift);
+            d1 = _mm256_and_si256(d1, _mm256_set1_epi32((si32)0xFF00FF00)); // 8 in MSB
+            d0 = _mm256_or_si256(d0, d1);
+
+            // find location of e_k and mask
+            __m256i shift;
+            __m256i ones = _mm256_set1_epi32(1);
+            __m256i twos = _mm256_set1_epi32(2);
+            __m256i U_q_m1 = _mm256_sub_epi32(U_q, ones);
+            U_q_m1 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F));
+            U_q_m1 = _mm256_shuffle_epi32(U_q_m1, 0); // lane 0 to all lanes of its half
+            w0 = _mm256_sub_epi32(twos, w0);
+            shift = _mm256_sllv_epi32(w0, U_q_m1); // U_q_m1 must be no more than 31
+            ms_vec = _mm256_and_si256(d0, _mm256_sub_epi32(shift, ones));
+
+            // next e_1
+            w0 = _mm256_and_si256(flags, _mm256_set1_epi32(0x800));
+            w0 = _mm256_cmpeq_epi32(w0, _mm256_setzero_si256());
+            w0 = _mm256_andnot_si256(w0, shift);  // e_1 in correct position
+            ms_vec = _mm256_or_si256(ms_vec, w0); // e_1
+            w0 = _mm256_slli_epi32(ms_vec, 31);   // sign
+            ms_vec = _mm256_or_si256(ms_vec, ones); // bin center
+            __m256i tvn = ms_vec;
+            ms_vec = _mm256_add_epi32(ms_vec, twos);// + 2
+            ms_vec = _mm256_slli_epi32(ms_vec, (si32)p - 1);
+            ms_vec = _mm256_or_si256(ms_vec, w0); // sign
+            row = _mm256_andnot_si256(insig, ms_vec); // significant only
+
+            ms_vec = _mm256_andnot_si256(insig, tvn); // significant only
+            // keep 32-bit lanes 1 and 3 of each half, placed for OR-ing into vn
+            tvn = _mm256_shuffle_epi8(ms_vec, _mm256_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1, -1, -1, 0x0F0E0D0C, 0x07060504));
+
+            vn = _mm_or_si128(vn, _mm256_castsi256_si128(tvn));
+            vn = _mm_or_si128(vn, _mm256_extracti128_si256(tvn, 0x1));
+        }
+        return row;
+    }
+
+
+    //************************************************************************/
+    /** @brief Decodes four consecutive quads, using 16 bit data
+     *
+     *  @param inf_u_q  decoded VLC code, with interleaved u values
+     *  @param U_q      U values
+     *  @param magsgn   structure for forward data buffer; bits are consumed
+     *  @param p        bitplane at which we are decoding
+     *  @param vn       in/out: v_n values for E handling are OR-ed into it
+     *  @return __m256i decoded samples in 16-bit lanes; zero if all insig.
+     */
+
+    static inline __m256i decode_four_quad16(const __m128i inf_u_q, __m128i U_q, frwd_struct_avx2* magsgn, ui32 p, __m128i& vn) {
+
+        __m256i w0;     // workers
+        __m256i insig;  // lanes hold FF's if samples are insignificant
+        __m256i flags;  // lanes hold e_k, e_1, and rho
+
+        __m256i row = _mm256_setzero_si256();
+        __m128i ddd = _mm_shuffle_epi8(inf_u_q,
+            _mm_set_epi16(0x0d0c, 0x0d0c, 0x0908, 0x908, 0x0504, 0x0504, 0x0100, 0x0100));
+        w0 = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
+            _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
+        // we keep e_k, e_1, and rho in flags
+        flags = _mm256_and_si256(w0,
+            _mm256_set_epi16((si16)0x8880, 0x4440, 0x2220, 0x1110,
+                             (si16)0x8880, 0x4440, 0x2220, 0x1110,
+                             (si16)0x8880, 0x4440, 0x2220, 0x1110,
+                             (si16)0x8880, 0x4440, 0x2220, 0x1110));
+        insig = _mm256_cmpeq_epi16(flags, _mm256_setzero_si256());
+        if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF) //skip when all are insignificant
+        {
+            ddd = _mm_or_si128(_mm_bslli_si128(U_q, 2), U_q);
+            __m256i U_q_avx = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
+                _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
+            flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8));
+
+            // U_q holds U_q for this quad
+            // flags has e_k, e_1, and rho such that e_k is sitting in the
+            // 0x8000, e_1 in 0x800, and rho in 0x80
+
+            // next e_k and m_n
+            __m256i m_n;
+            w0 = _mm256_srli_epi16(flags, 15); // e_k
+            m_n = _mm256_sub_epi16(U_q_avx, w0);
+            m_n = _mm256_andnot_si256(insig, m_n);
+
+            // find cumulative sums
+            // to find at which bit in ms_vec the sample starts
+            __m256i inc_sum = m_n; // inclusive scan
+            inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 2));
+            inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
+            inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
+            int total_mn1 = _mm256_extract_epi16(inc_sum, 7);  // low half total
+            int total_mn2 = _mm256_extract_epi16(inc_sum, 15); // high half total
+            __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 2); // exclusive scan
+
+            __m128i ms_vec0 = _mm_setzero_si128();
+            __m128i ms_vec1 = _mm_setzero_si128();
+            if (total_mn1) {
+                ms_vec0 = frwd_fetch<0xFF>(magsgn);
+                frwd_advance(magsgn, (ui32)total_mn1);
+            }
+            if (total_mn2) {
+                ms_vec1 = frwd_fetch<0xFF>(magsgn);
+                frwd_advance(magsgn, (ui32)total_mn2);
+            }
+
+            __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
+
+            // find the starting byte and starting bit
+            __m256i byte_idx = _mm256_srli_epi16(ex_sum, 3);
+            __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi16(7));
+            byte_idx = _mm256_shuffle_epi8(byte_idx,
+                _mm256_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
+                    0x0606, 0x0404, 0x0202, 0x0000, 0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
+                    0x0606, 0x0404, 0x0202, 0x0000));
+            byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0100));
+            __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
+            byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0101));
+            __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
+
+            // shift samples values to correct location
+            __m256i bit_shift = _mm256_shuffle_epi8(
+                _mm256_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
+                    1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1,
+                    1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
+            bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
+            d0 = _mm256_mullo_epi16(d0, bit_shift);
+            d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB
+            d1 = _mm256_mullo_epi16(d1, bit_shift);
+            d1 = _mm256_and_si256(d1, _mm256_set1_epi16((si16)0xFF00)); // 8 in MSB
+            d0 = _mm256_or_si256(d0, d1);
+
+            // find location of e_k and mask
+            __m256i shift, t0, t1, Uq0, Uq1;
+            __m256i ones = _mm256_set1_epi16(1);
+            __m256i twos = _mm256_set1_epi16(2);
+            __m256i U_q_m1 = _mm256_sub_epi32(U_q_avx, ones);
+            Uq0 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F));
+            Uq1 = _mm256_bsrli_epi128(U_q_m1, 14);
+            w0 = _mm256_sub_epi16(twos, w0);
+            t0 = _mm256_and_si256(w0, _mm256_set_epi64x(0, -1, 0, -1)); // low 64 bits of each half
+            t1 = _mm256_and_si256(w0, _mm256_set_epi64x(-1, 0, -1, 0)); // high 64 bits of each half
+            {//no _mm256_sllv_epi16 in avx2
+                __m128i t_0_sse = _mm256_castsi256_si128(t0);
+                t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq0));
+                __m128i t_1_sse = _mm256_extracti128_si256(t0 , 0x1);
+                t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq0, 0x1));
+                t0 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse), t_1_sse, 0x1);
+
+                t_0_sse = _mm256_castsi256_si128(t1);
+                t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq1));
+                t_1_sse = _mm256_extracti128_si256(t1, 0x1);
+                t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq1, 0x1));
+                t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse), t_1_sse, 0x1);
+            }
+            shift = _mm256_or_si256(t0, t1);
+            ms_vec = _mm256_and_si256(d0, _mm256_sub_epi16(shift, ones));
+
+            // next e_1
+            w0 = _mm256_and_si256(flags, _mm256_set1_epi16(0x800));
+            w0 = _mm256_cmpeq_epi16(w0, _mm256_setzero_si256());
+            w0 = _mm256_andnot_si256(w0, shift);  // e_1 in correct position
+            ms_vec = _mm256_or_si256(ms_vec, w0); // e_1
+            w0 = _mm256_slli_epi16(ms_vec, 15);   // sign
+            ms_vec = _mm256_or_si256(ms_vec, ones); // bin center
+            __m256i tvn = ms_vec;
+            ms_vec = _mm256_add_epi16(ms_vec, twos);// + 2
+            ms_vec = _mm256_slli_epi16(ms_vec, (si32)p - 1);
+            ms_vec = _mm256_or_si256(ms_vec, w0); // sign
+            row = _mm256_andnot_si256(insig, ms_vec); // significant only
+
+            ms_vec = _mm256_andnot_si256(insig, tvn); // significant only
+            // gather 16-bit lanes 1, 3, 5 and 7 of each half, placed for vn
+            __m256i ms_vec_shuffle1 = _mm256_shuffle_epi8(ms_vec,
+                _mm256_set_epi16(-1, -1, -1, -1, 0x0706, 0x0302, -1, -1,
+                                 -1, -1, -1, -1, -1, -1, 0x0706, 0x0302));
+            __m256i ms_vec_shuffle2 = _mm256_shuffle_epi8(ms_vec,
+                _mm256_set_epi16(-1, -1, -1, 0x0F0E, 0x0B0A, -1, -1, -1,
+                                 -1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1));
+            ms_vec = _mm256_or_si256(ms_vec_shuffle1, ms_vec_shuffle2);
+
+            vn = _mm_or_si128(vn, _mm256_castsi256_si128(ms_vec));
+            vn = _mm_or_si128(vn, _mm256_extracti128_si256(ms_vec, 0x1));
+        }
+        return row;
+    }
+
+    // Leading-zero count of each 32-bit lane (32 for a zero lane); based on
+    // https://stackoverflow.com/a/58827596
+    inline __m256i avx2_lzcnt_epi32(__m256i v) {
+        // drop low bits that could make the float conversion round up
+        const __m256i trimmed = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v);
+        // the float's biased exponent encodes the position of the top set bit
+        const __m256i as_float = _mm256_castps_si256(_mm256_cvtepi32_ps(trimmed));
+        const __m256i biased_exp = _mm256_srli_epi32(as_float, 23);
+        // 158 = 127 (bias) + 31; saturation yields 0 for lanes with bit 31 set
+        const __m256i count = _mm256_subs_epu16(_mm256_set1_epi32(158), biased_exp);
+        return _mm256_min_epi16(count, _mm256_set1_epi32(32)); // clamp at 32
+    }
+
+    //************************************************************************/
+    /** @brief Decodes one codeblock, processing the cleanup, significance
+     *         propagation, and magnitude refinement passes
+     *
+     *  @param [in]   coded_data is a pointer to bitstream
+     *  @param [in]   decoded_data is a pointer to decoded codeblock data buf.
+     *  @param [in]   missing_msbs is the number of missing MSBs
+     *  @param [in]   num_passes is the number of passes: 1 if CUP only,
+     *                2 for CUP+SPP, and 3 for CUP+SPP+MRP
+     *  @param [in]   lengths1 is the length of cleanup pass
+     *  @param [in]   lengths2 is the length of refinement passes (either SPP
+     *                only or SPP+MRP)
+     *  @param [in]   width is the decoded codeblock width
+     *  @param [in]   height is the decoded codeblock height
+     *  @param [in]   stride is the decoded codeblock buffer stride
+     *  @param [in]   stripe_causal is true for stripe causal mode
+     */
+    bool ojph_decode_codeblock_avx2(ui8* coded_data, ui32* decoded_data,
+                                    ui32 missing_msbs, ui32 num_passes,
+                                    ui32 lengths1, ui32 lengths2,
+                                    ui32 width, ui32 height, ui32 stride,
+                                    bool stripe_causal)
+    {
+      static bool insufficient_precision = false;
+      static bool modify_code = false;
+      static bool truncate_spp_mrp = false;
+
+      if (num_passes > 1 && lengths2 == 0)
+      {
+        OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
+                              "one coding pass, but zero length for "
+                              "2nd and potential 3rd pass.");
+        num_passes = 1;
+      }
+
+      if (num_passes > 3)
+      {
+        OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
+                              "This codeblocks has %d passes.",
+                              num_passes);
+        return false;
+      }
+
+      if (missing_msbs > 30) // p < 0
+      {
+        if (insufficient_precision == false)
+        {
+          insufficient_precision = true;
+          OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
+                                "codeblock. This message will not be "
+                                "displayed again.");
+        }
+        return false;
+      }
+      else if (missing_msbs == 30) // p == 0
+      { // not enough precision to decode and set the bin center to 1
+        if (modify_code == false) {
+          modify_code = true;
+          OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
+                                "pass. The code can be modified to support "
+                                "this case. This message will not be "
+                                "displayed again.");
+        }
+         return false;         // 32 bits are not enough to decode this
+       }
+      else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
+      {
+        if (num_passes > 1) {
+          num_passes = 1;
+          if (truncate_spp_mrp == false) {
+            truncate_spp_mrp = true;
+            OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
+                                  "nor MagRef passes; both will be skipped. "
+                                  "This message will not be displayed "
+                                  "again.");
+          }
+        }
+      }
+      ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP
+      // There is a way to handle the case of p == 0, but a different path
+      // is required
+
+      if (lengths1 < 2)
+      {
+        OJPH_WARN(0x00010006, "Wrong codeblock length.");
+        return false;
+      }
+
+      // read scup and fix the bytes there
+      int lcup, scup;
+      lcup = (int)lengths1;  // length of CUP
+      //scup is the length of MEL + VLC
+      scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
+      if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
+        return false;
+
+      // The temporary storage scratch holds two types of data in an
+      // interleaved fashion. The interleaving allows us to use one
+      // memory pointer.
+      // We have one entry for a decoded VLC code, and one entry for UVLC.
+      // Entries are 16 bits each, corresponding to one quad,
+      // but since we want to use XMM registers of the SSE family
+      // of SIMD; we allocated 16 bytes or more per quad row; that is,
+      // the width is no smaller than 16 bytes (or 8 entries), and the
+      // height is 512 quads
+      // Each VLC entry contains, in the following order, starting
+      // from MSB
+      // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
+      // Each entry in UVLC contains u_q
+      // One extra row to handle the case of SPP propagating downwards
+      // when codeblock width is 4
+      ui16 scratch[8 * 513] = {0};          // 8+ kB
+
+      // We need an extra two entries (one inf and one u_q) beyond
+      // the last column.
+      // If the block width is 4 (2 quads), then we use sstr of 8
+      // (enough for 4 quads). If width is 8 (4 quads) we use
+      // sstr is 16 (enough for 8 quads). For a width of 16 (8
+      // quads), we use 24 (enough for 12 quads).
+      ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
+
+      assert((stride & 0x3) == 0);
+
+      ui32 mmsbp2 = missing_msbs + 2;
+
+      // The cleanup pass is decoded in two steps; in step one,
+      // the VLC and MEL segments are decoded, generating a record that
+      // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k.
+      // This information should be sufficient for the next step.
+      // In step 2, we decode the MagSgn segment.
+
+      // step 1 decoding VLC and MEL segments
+      {
+        // init structures
+        dec_mel_st mel;
+        mel_init(&mel, coded_data, lcup, scup);
+        rev_struct vlc;
+        rev_init(&vlc, coded_data, lcup, scup);
+
+        int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
+                                     // data represented as runs of 0 events
+                                     // See mel_decode description
+
+        ui32 vlc_val;
+        ui32 c_q = 0;
+        ui16 *sp = scratch;
+        //initial quad row
+        for (ui32 x = 0; x < width; sp += 4)
+        {
+          // decode VLC
+          /////////////
+
+          // first quad
+          vlc_val = rev_fetch(&vlc);
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0) //zero context
+          {
+            run -= 2; //subtract 2, since events number if multiplied by 2
+
+            // Is the run terminated in 1? if so, use decoded VLC code,
+            // otherwise, discard decoded data, since we will decoded again
+            // using a different context
+            t0 = (run == -1) ? t0 : 0;
+
+            // is run -1 or -2? this means a run has been consumed
+            if (run < 0)
+              run = mel_get_run(&mel);  // get another run
+          }
+          //run -= (c_q == 0) ? 2 : 0;
+          //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel);  // get another run
+          sp[0] = t0;
+          x += 2;
+
+          // prepare context for the next quad; eqn. 1 in ITU T.814
+          c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
+
+          //remove data from vlc stream (0 bits are removed if vlc is not used)
+          vlc_val = rev_advance(&vlc, t0 & 0x7);
+
+          //second quad
+          ui16 t1 = 0;
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0 && x < width) //zero context
+          {
+            run -= 2; //subtract 2, since events number if multiplied by 2
+
+            // if event is 0, discard decoded t1
+            t1 = (run == -1) ? t1 : 0;
+
+            if (run < 0) // have we consumed all events in a run
+              run = mel_get_run(&mel); // if yes, then get another run
+          }
+          t1 = x < width ? t1 : 0;
+          //run -= (c_q == 0 && x < width) ? 2 : 0;
+          //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel);  // get another run
+          sp[2] = t1;
+          x += 2;
+
+          //prepare context for the next quad, eqn. 1 in ITU T.814
+          c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
+
+          //remove data from vlc stream, if qinf is not used, cwdlen is 0
+          vlc_val = rev_advance(&vlc, t1 & 0x7);
+
+          // decode u
+          /////////////
+          // uvlc_mode is made up of u_offset bits from the quad pair
+          ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+          if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
+          {                     // the MEL run of events
+            run -= 2; //subtract 2, since events number if multiplied by 2
+
+            uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
+                                                 // is 0x40
+
+            if (run < 0)//if run is consumed (run is -1 or -2), get another run
+              run = mel_get_run(&mel);
+          }
+          //run -= (uvlc_mode == 0xc0) ? 2 : 0;
+          //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel);  // get another run
+
+          //decode uvlc_mode to get u for both quads
+          ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
+          //remove total prefix length
+          vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
+          uvlc_entry >>= 3;
+          //extract suffixes for quad 0 and 1
+          ui32 len = uvlc_entry & 0xF;           //suffix length for 2 quads
+          ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
+          vlc_val = rev_advance(&vlc, len);
+          ojph_unused(vlc_val); //static code analysis: unused value
+          uvlc_entry >>= 4;
+          // quad 0 length
+          len = uvlc_entry & 0x7; // quad 0 suffix length
+          uvlc_entry >>= 3;
+          ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kap. 1
+          sp[1] = u_q;
+          u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len));  //kappa == 1
+          sp[3] = u_q;
+        }
+        sp[0] = sp[1] = 0;
+
+        //non initial quad rows
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          c_q = 0;                                // context
+          ui16 *sp = scratch + (y >> 1) * sstr;   // this row of quads
+
+          for (ui32 x = 0; x < width; sp += 4)
+          {
+            // decode VLC
+            /////////////
+
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
+
+            // first quad
+            vlc_val = rev_fetch(&vlc);
+
+            //decode VLC using the context c_q and the head of VLC bitstream
+            ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0) //zero context
+            {
+              run -= 2; //subtract 2, since events number is multiplied by 2
+
+              // Is the run terminated in 1? if so, use decoded VLC code,
+              // otherwise, discard decoded data, since we will decode again
+              // using a different context
+              t0 = (run == -1) ? t0 : 0;
+
+              // is run -1 or -2? this means a run has been consumed
+              if (run < 0)
+                run = mel_get_run(&mel);  // get another run
+            }
+            //run -= (c_q == 0) ? 2 : 0;
+            //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel);  // get another run
+            sp[0] = t0;
+            x += 2;
+
+            // prepare context for the next quad; eqn. 2 in ITU T.814
+            // sigma_q (w, sw)
+            c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[0 - (si32)sstr] & 0x80;
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
+
+            //remove data from vlc stream (0 bits are removed if vlc is unused)
+            vlc_val = rev_advance(&vlc, t0 & 0x7);
+
+            //second quad
+            ui16 t1 = 0;
+
+            //decode VLC using the context c_q and the head of VLC bitstream
+            t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0 && x < width) //zero context
+            {
+              run -= 2; //subtract 2, since events number if multiplied by 2
+
+              // if event is 0, discard decoded t1
+              t1 = (run == -1) ? t1 : 0;
+
+              if (run < 0) // have we consumed all events in a run
+                run = mel_get_run(&mel); // if yes, then get another run
+            }
+            t1 = x < width ? t1 : 0;
+            //run -= (c_q == 0 && x < width) ? 2 : 0;
+            //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel);  // get another run
+            sp[2] = t1;
+            x += 2;
+
+            // partial c_q, will be completed when we process the next quad
+            // sigma_q (w, sw)
+            c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[2 - (si32)sstr] & 0x80;
+
+            //remove data from vlc stream, if qinf is not used, cwdlen is 0
+            vlc_val = rev_advance(&vlc, t1 & 0x7);
+
+            // decode u
+            /////////////
+            // uvlc_mode is made up of u_offset bits from the quad pair
+            ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+            ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
+            //remove total prefix length
+            vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
+            uvlc_entry >>= 3;
+            //extract suffixes for quad 0 and 1
+            ui32 len = uvlc_entry & 0xF;           //suffix length for 2 quads
+            ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
+            vlc_val = rev_advance(&vlc, len);
+            ojph_unused(vlc_val); //static code analysis: unused value
+            uvlc_entry >>= 4;
+            // quad 0 length
+            len = uvlc_entry & 0x7; // quad 0 suffix length
+            uvlc_entry >>= 3;
+            ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
+            sp[1] = u_q;
+            u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
+            sp[3] = u_q;
+          }
+          sp[0] = sp[1] = 0;
+        }
+      }
+
+      // step2 we decode magsgn
+      // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit)
+      // The 32 bit path decodes 16 bit data, for which one would think
+      // 16 bits are enough, because we want to put the value in the
+      // center of the bin.
+      // If mmsbp2 equals 16 bits, coding is reversible, and no
+      // bitplanes are missing, then we could decode using the 16 bit
+      // path, but we are not doing this here.
+      if (mmsbp2 >= 16)
+      {
+        // We allocate a scratch row for storing v_n values.
+        // We have 512 quads horizontally.
+        // We may go beyond the last entry by up to 4 entries.
+        // Here we allocate additional 16 entries.
+        // There are two rows in this structure, the bottom
+        // row is used to store processed entries.
+        const int v_n_size = 512 + 16;
+        ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB
+
+        frwd_struct_avx2 magsgn;
+        frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
+
+        const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2);
+
+        {
+          ui16 *sp = scratch;
+          ui32 *vp = v_n_scratch;
+          ui32 *dp = decoded_data;
+          vp[0] = 2; // for easy calculation of emax
+
+          for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
+          {
+            __m128i vn = _mm_set1_epi32(2);
+
+            __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
+            inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
+
+            __m256i U_q = _mm256_srli_epi32(inf_u_q, 16);
+            __m256i w = _mm256_cmpgt_epi32(U_q, avx_mmsbp2);
+            if (!_mm256_testz_si256(w, w)) {
+                return false;
+            }
+
+            __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, &magsgn, p, vn);
+            row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
+            _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
+            _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
+
+            __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp);
+            w0 = _mm_or_si128(w0, vn);
+            _mm_storeu_si128((__m128i*)vp, w0);
+          }
+        }
+
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          {
+            // perform 31 - count_leading_zeros(*vp) here
+            ui32 *vp = v_n_scratch;
+            ui16* sp = scratch + (y >> 1) * sstr;
+
+            const __m256i avx_31 = _mm256_set1_epi32(31);
+            const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
+            const __m256i avx_1 = _mm256_set1_epi32(1);
+            const __m256i avx_0 = _mm256_setzero_si256();
+
+            for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16) {
+              __m256i v = _mm256_loadu_si256((__m256i*)vp);
+              __m256i v_p1 = _mm256_loadu_si256((__m256i*)(vp + 1));
+              v = _mm256_or_si256(v, v_p1);
+              v = avx2_lzcnt_epi32(v);
+              v = _mm256_sub_epi32(avx_31, v);
+
+              __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
+              __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
+              __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
+              gamma = _mm256_and_si256(gamma, w0);
+              gamma = _mm256_cmpeq_epi32(gamma, avx_0);
+
+              v = _mm256_andnot_si256(gamma, v);
+              v = _mm256_max_epi32(v, avx_1);
+
+              inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
+              v = _mm256_add_epi32(inf_u_q, v);
+
+              w0 = _mm256_cmpgt_epi32(v, avx_mmsbp2);
+              if (!_mm256_testz_si256(w0, w0)) {
+                  return false;
+              }
+
+              _mm256_storeu_si256((__m256i*)(vp + v_n_size), v);
+            }
+          }
+
+          ui32 *vp = v_n_scratch;
+          ui16 *sp = scratch + (y >> 1) * sstr;
+          ui32 *dp = decoded_data + y * stride;
+          vp[0] = 2; // for easy calculation of emax
+
+          for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) {
+            //process two quads
+            __m128i vn = _mm_set1_epi32(2);
+
+            __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
+            inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
+
+            __m256i U_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)(vp + v_n_size)));
+            U_q = _mm256_permutevar8x32_epi32(U_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
+
+            __m256i row = decode_two_quad32_avx2(inf_u_q, U_q,  &magsgn, p, vn);
+            row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
+            _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
+            _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
+
+            __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp);
+            w0 = _mm_or_si128(w0, vn);
+            _mm_storeu_si128((__m128i*)vp, w0);
+          }
+        }
+      }
+      else {
+
+        // reduce bitplane by 16 because we now have 16 bits instead of 32
+        p -= 16;
+
+        // We allocate a scratch row for storing v_n values.
+        // We have 512 quads horizontally.
+        // We may go beyond the last entry by up to 8 entries.
+        // Here we allocate additional 16 entries.
+        // Processed entries are stored in a separate array,
+        // v_n_scratch_32, rather than in a second row.
+        const int v_n_size = 512 + 16;
+        ui16 v_n_scratch[v_n_size] = {0}; // 1+ kB
+        ui32 v_n_scratch_32[v_n_size] = {0}; // 2+ kB
+
+        frwd_struct_avx2 magsgn;
+        frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
+
+        {
+          ui16 *sp = scratch;
+          ui16 *vp = v_n_scratch;
+          ui32 *dp = decoded_data;
+          vp[0] = 2; // for easy calculation of emax
+
+          for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8) {
+              ////process four quads
+              __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
+              __m128i U_q = _mm_srli_epi32(inf_u_q, 16);
+              __m128i w = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
+              if (!_mm_testz_si128(w, w)) {
+                  return false;
+              }
+
+              __m128i vn = _mm_set1_epi16(2);
+              __m256i row = decode_four_quad16(inf_u_q, U_q, &magsgn, p, vn);
+
+              w = _mm_cvtsi32_si128(*(unsigned short const*)(vp));
+              _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
+
+              __m256i  w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
+              __m256i  w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
+
+              _mm256_storeu_si256((__m256i*)dp, w0);
+              _mm256_storeu_si256((__m256i*)(dp + stride), w1);
+          }
+        }
+
+        for (ui32 y = 2; y < height; y += 2) {
+          {
+            // perform 15 - count_leading_zeros(*vp) here
+            ui16 *vp = v_n_scratch;
+            ui32 *vp_32 = v_n_scratch_32;
+
+            ui16* sp = scratch + (y >> 1) * sstr;
+            const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2);
+            const __m256i avx_31 = _mm256_set1_epi32(31);
+            const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
+            const __m256i avx_1 = _mm256_set1_epi32(1);
+            const __m256i avx_0 = _mm256_setzero_si256();
+
+            for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16, vp_32 += 8) {
+              __m128i v = _mm_loadu_si128((__m128i*)vp);
+              __m128i v_p1 = _mm_loadu_si128((__m128i*)(vp + 1));
+              v = _mm_or_si128(v, v_p1);
+
+              __m256i v_avx = _mm256_cvtepu16_epi32(v);
+              v_avx = avx2_lzcnt_epi32(v_avx);
+              v_avx = _mm256_sub_epi32(avx_31, v_avx);
+
+              __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
+              __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
+              __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
+              gamma = _mm256_and_si256(gamma, w0);
+              gamma = _mm256_cmpeq_epi32(gamma, avx_0);
+
+              v_avx = _mm256_andnot_si256(gamma, v_avx);
+              v_avx = _mm256_max_epi32(v_avx, avx_1);
+
+              inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
+              v_avx = _mm256_add_epi32(inf_u_q, v_avx);
+
+              w0 = _mm256_cmpgt_epi32(v_avx, avx_mmsbp2);
+              if (!_mm256_testz_si256(w0, w0)) {
+                  return false;
+              }
+
+              _mm256_storeu_si256((__m256i*)vp_32, v_avx);
+            }
+          }
+
+          ui16 *vp = v_n_scratch;
+          ui32* vp_32 = v_n_scratch_32;
+          ui16 *sp = scratch + (y >> 1) * sstr;
+          ui32 *dp = decoded_data + y * stride;
+          vp[0] = 2; // for easy calculation of emax
+
+          for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8, vp_32 += 4) {
+            ////process four quads
+              __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
+              __m128i U_q = _mm_loadu_si128((__m128i*)vp_32);
+
+            __m128i vn = _mm_set1_epi16(2);
+            __m256i row = decode_four_quad16(inf_u_q, U_q, &magsgn, p, vn);
+
+            __m128i w = _mm_cvtsi32_si128(*(unsigned short const*)(vp));
+            _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
+
+            __m256i  w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
+            __m256i  w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
+
+            _mm256_storeu_si256((__m256i*)dp, w0);
+            _mm256_storeu_si256((__m256i*)(dp + stride), w1);
+          }
+        }
+
+        // increase bitplane back by 16 because we need to process 32 bits
+        p += 16;
+      }
+
+      if (num_passes > 1)
+      {
+        // We use scratch again, we can divide it into multiple regions
+        // sigma holds all the significant samples, and it cannot
+        // be modified after it is set.  it will be used during the
+        // Magnitude Refinement Pass
+        ui16* const sigma = scratch;
+
+        ui32 mstr = (width + 3u) >> 2;   // divide by 4, since each
+                                         // ui16 contains 4 columns
+        mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
+
+        // We re-arrange quad significance, where each 4 consecutive
+        // bits represent one quad, into column significance, where,
+        // each 4 consecutive bits represent one column of 4 rows
+        {
+          ui32 y;
+
+          const __m128i mask_3 = _mm_set1_epi32(0x30);
+          const __m128i mask_C = _mm_set1_epi32(0xC0);
+          const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
+          for (y = 0; y < height; y += 4)
+          {
+            ui16* sp = scratch + (y >> 1) * sstr;
+            ui16* dp = sigma + (y >> 2) * mstr;
+            for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
+            {
+              __m128i s0, s1, u3, uC, t0, t1;
+
+              s0 = _mm_loadu_si128((__m128i*)(sp));
+              u3 = _mm_and_si128(s0, mask_3);
+              u3 = _mm_srli_epi32(u3, 4);
+              uC = _mm_and_si128(s0, mask_C);
+              uC = _mm_srli_epi32(uC, 2);
+              t0 = _mm_or_si128(u3, uC);
+
+              s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
+              u3 = _mm_and_si128(s1, mask_3);
+              u3 = _mm_srli_epi32(u3, 2);
+              uC = _mm_and_si128(s1, mask_C);
+              t1 = _mm_or_si128(u3, uC);
+
+              __m128i r = _mm_or_si128(t0, t1);
+              r = _mm_shuffle_epi8(r, shuffle_mask);
+
+              *(ui32*)dp = (ui32)_mm_extract_epi32(r, 0);
+            }
+            dp[0] = 0; // set an extra entry on the right with 0
+          }
+          {
+            // reset one row after the codeblock
+            ui16* dp = sigma + (y >> 2) * mstr;
+            __m128i zero = _mm_setzero_si128();
+            for (ui32 x = 0; x < width; x += 32, dp += 8)
+              _mm_storeu_si128((__m128i*)dp, zero);
+            dp[0] = 0; // set an extra entry on the right with 0
+          }
+        }
+
+        // We perform Significance Propagation Pass here
+        {
+          // This stores significance information of the previous
+          // 4 rows.  Significance information in this array includes
+          // all significant samples in bitplane p - 1; that is,
+          // significant samples for bitplane p (discovered during the
+          // cleanup pass and stored in sigma) and samples that have recently
+          // become significant (during the SPP) in bitplane p-1.
+          // We store enough for the widest row, containing 1024 columns,
+          // which is equivalent to 256 of ui16, since each stores 4 columns.
+          // We add an extra 8 entries, just in case we need more
+          ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
+
+          frwd_struct_avx2 sigprop;
+          frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
+
+          for (ui32 y = 0; y < height; y += 4)
+          {
+            ui32 pattern = 0xFFFFu; // a pattern needed samples
+            if (height - y < 4) {
+              pattern = 0x7777u;
+              if (height - y < 3) {
+                pattern = 0x3333u;
+                if (height - y < 2)
+                  pattern = 0x1111u;
+              }
+            }
+
+            // prev holds sign. info. for the previous quad, together
+            // with the rows on top of it and below it.
+            ui32 prev = 0;
+            ui16 *prev_sig = prev_row_sig;
+            ui16 *cur_sig = sigma + (y >> 2) * mstr;
+            ui32 *dpp = decoded_data + y * stride;
+            for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
+            {
+              // only rows and columns inside the stripe are included
+              si32 s = (si32)x + 4 - (si32)width;
+              s = ojph_max(s, 0);
+              pattern = pattern >> (s * 4);
+
+              // We first find locations that need to be tested (potential
+              // SPP members); these location will end up in mbr
+              // In each iteration, we produce 16 bits because cwd can have
+              // up to 16 bits of significance information, followed by the
+              // corresponding 16 bits of sign information; therefore, it is
+              // sufficient to fetch 32 bit data per loop.
+
+              // Although we are interested in 16 bits only, we load 32 bits.
+              // For the 16 bits we are producing, we need the next 4 bits --
+              // We need data for at least 5 columns out of 8.
+              // Therefore loading 32 bits is easier than loading 16 bits
+              // twice.
+              ui32 ps = *(ui32*)prev_sig;
+              ui32 ns = *(ui32*)(cur_sig + mstr);
+              ui32 u = (ps & 0x88888888) >> 3; // the row on top
+              if (!stripe_causal)
+                u |= (ns & 0x11111111) << 3;   // the row below
+
+              ui32 cs = *(ui32*)cur_sig;
+              // vertical integration
+              ui32 mbr =  cs;                // this sig. info.
+              mbr |= (cs & 0x77777777) << 1; //above neighbors
+              mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors
+              mbr |= u;
+              // horizontal integration
+              ui32 t = mbr;
+              mbr |= t << 4;      // neighbors on the left
+              mbr |= t >> 4;      // neighbors on the right
+              mbr |= prev >> 12;  // significance of previous group
+
+              // remove outside samples, and already significant samples
+              mbr &= pattern;
+              mbr &= ~cs;
+
+              // find samples that become significant during the SPP
+              ui32 new_sig = mbr;
+              if (new_sig)
+              {
+                __m128i cwd_vec = frwd_fetch<0>(&sigprop);
+                ui32 cwd = (ui32)_mm_extract_epi16(cwd_vec, 0);
+
+                ui32 cnt = 0;
+                ui32 col_mask = 0xFu;
+                ui32 inv_sig = ~cs & pattern;
+                for (int i = 0; i < 16; i += 4, col_mask <<= 4)
+                {
+                  if ((col_mask & new_sig) == 0)
+                    continue;
+
+                  //scan one column
+                  ui32 sample_mask = 0x1111u & col_mask;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0x33u << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+
+                  sample_mask <<= 1;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0x76u << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+
+                  sample_mask <<= 1;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0xECu << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+
+                  sample_mask <<= 1;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0xC8u << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+                }
+
+                if (new_sig)
+                {
+                  cwd |= (ui32)_mm_extract_epi16(cwd_vec, 1) << (16 - cnt);
+
+                  // Spread new_sig, such that each bit is in one byte with a
+                  // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1
+                  __m128i new_sig_vec = _mm_set1_epi16((si16)new_sig);
+                  new_sig_vec = _mm_shuffle_epi8(new_sig_vec,
+                    _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
+                  new_sig_vec = _mm_and_si128(new_sig_vec,
+                    _mm_set1_epi64x((si64)0x8040201008040201));
+                  new_sig_vec = _mm_cmpeq_epi8(new_sig_vec,
+                    _mm_set1_epi64x((si64)0x8040201008040201));
+
+                  // find cumulative sums
+                  // to find which bit in cwd we should extract
+                  __m128i inc_sum = new_sig_vec; // inclusive scan
+                  inc_sum = _mm_abs_epi8(inc_sum); // cvrt to 0 or 1
+                  inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
+                  inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
+                  inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
+                  inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
+                  cnt += (ui32)_mm_extract_epi16(inc_sum, 7) >> 8;
+                  // exclusive scan
+                  __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
+
+                  // Spread cwd, such that each bit is in one byte
+                  // with a value of 0 or 1.
+                  cwd_vec = _mm_set1_epi16((si16)cwd);
+                  cwd_vec = _mm_shuffle_epi8(cwd_vec,
+                    _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
+                  cwd_vec = _mm_and_si128(cwd_vec,
+                    _mm_set1_epi64x((si64)0x8040201008040201));
+                  cwd_vec = _mm_cmpeq_epi8(cwd_vec,
+                    _mm_set1_epi64x((si64)0x8040201008040201));
+                  cwd_vec = _mm_abs_epi8(cwd_vec);
+
+                  // Obtain bit from cwd_vec corresponding to ex_sum
+                  // Basically, collect needed bits from cwd_vec
+                  __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum);
+
+                  // load data and set spp coefficients
+                  __m128i m =
+                    _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
+                  __m128i val = _mm_set1_epi32(3 << (p - 2));
+                  ui32 *dp = dpp;
+                  for (int c = 0; c < 4; ++ c) {
+                    __m128i s0, s0_ns, s0_val;
+                    // load coefficients
+                    s0 = _mm_load_si128((__m128i*)dp);
+
+                    // epi32 is -1 only for coefficient that
+                    // are changed during the SPP
+                    s0_ns = _mm_shuffle_epi8(new_sig_vec, m);
+                    s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF));
+
+                    // obtain sign for coefficients in SPP
+                    s0_val = _mm_shuffle_epi8(v, m);
+                    s0_val = _mm_slli_epi32(s0_val, 31);
+                    s0_val = _mm_or_si128(s0_val, val);
+                    s0_val = _mm_and_si128(s0_val, s0_ns);
+
+                    // update vector
+                    s0 = _mm_or_si128(s0, s0_val);
+                    // store coefficients
+                    _mm_store_si128((__m128i*)dp, s0);
+                    // prepare for next row
+                    dp += stride;
+                    m = _mm_add_epi32(m, _mm_set1_epi32(1));
+                  }
+                }
+                frwd_advance(&sigprop, cnt);
+              }
+
+              new_sig |= cs;
+              *prev_sig = (ui16)(new_sig);
+
+              // vertical integration for the new sig. info.
+              t = new_sig;
+              new_sig |= (t & 0x7777) << 1; //above neighbors
+              new_sig |= (t & 0xEEEE) >> 1; //below neighbors
+              // add sig. info. from the row on top and below
+              prev = new_sig | u;
+              // we need only the bits in 0xF000
+              prev &= 0xF000;
+            }
+          }
+        }
+
+        // We perform Magnitude Refinement Pass here
+        if (num_passes > 2)
+        {
+          rev_struct magref;
+          rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
+
+          for (ui32 y = 0; y < height; y += 4)
+          {
+            ui16 *cur_sig = sigma + (y >> 2) * mstr;
+            ui32 *dpp = decoded_data + y * stride;
+            for (ui32 i = 0; i < width; i += 4, dpp += 4)
+            {
+              //Process one entry from sigma array at a time
+              // Each nibble (4 bits) in the sigma array represents 4 rows,
+              ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
+              ui16 sig = *cur_sig++; // 16 bit that will be processed now
+              int total_bits = 0;
+              if (sig) // if any of the 32 bits are set
+              {
+                // We work on 4 rows, with 4 samples each, since
+                // data is 32 bit (4 bytes)
+
+                // spread the 16 bits in sig to 0 or 1 bytes in sig_vec
+                __m128i sig_vec = _mm_set1_epi16((si16)sig);
+                sig_vec = _mm_shuffle_epi8(sig_vec,
+                  _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
+                sig_vec = _mm_and_si128(sig_vec,
+                  _mm_set1_epi64x((si64)0x8040201008040201));
+                sig_vec = _mm_cmpeq_epi8(sig_vec,
+                  _mm_set1_epi64x((si64)0x8040201008040201));
+                sig_vec = _mm_abs_epi8(sig_vec);
+
+                // find cumulative sums
+                // to find which bit in cwd we should extract
+                __m128i inc_sum = sig_vec; // inclusive scan
+                inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
+                inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
+                inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
+                inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
+                total_bits = _mm_extract_epi16(inc_sum, 7) >> 8;
+                __m128i ex_sum = _mm_bslli_si128(inc_sum, 1); // exclusive scan
+
+                // Spread the 16 bits in cwd to inverted 0 or 1 bytes in
+                // cwd_vec. Then, convert these to a form suitable
+                // for coefficient modifications; in particular, a value
+                // of 0 is presented as binary 11, and a value of 1 is
+                // represented as binary 01
+                __m128i cwd_vec = _mm_set1_epi16((si16)cwd);
+                cwd_vec = _mm_shuffle_epi8(cwd_vec,
+                  _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
+                cwd_vec = _mm_and_si128(cwd_vec,
+                  _mm_set1_epi64x((si64)0x8040201008040201));
+                cwd_vec = _mm_cmpeq_epi8(cwd_vec,
+                  _mm_set1_epi64x((si64)0x8040201008040201));
+                cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1));
+                cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec);
+                cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1));
+
+                // load data and insert the mrp bit
+                __m128i m =
+                  _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
+                ui32 *dp = dpp;
+                for (int c = 0; c < 4; ++c) {
+                  __m128i s0, s0_sig, s0_idx, s0_val;
+                  // load coefficients
+                  s0 = _mm_load_si128((__m128i*)dp);
+                  // find significant samples in this row
+                  s0_sig = _mm_shuffle_epi8(sig_vec, m);
+                  s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128());
+                  // get MRP bit index, and MRP pattern
+                  s0_idx = _mm_shuffle_epi8(ex_sum, m);
+                  s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx);
+                  // keep data from significant samples only
+                  s0_val = _mm_andnot_si128(s0_sig, s0_val);
+                  // move mrp bits to correct position, and apply them
+                  s0_val = _mm_slli_epi32(s0_val, (si32)p - 2);
+                  s0 = _mm_xor_si128(s0, s0_val);
+                  // store coefficients
+                  _mm_store_si128((__m128i*)dp, s0);
+                  // prepare for next row
+                  dp += stride;
+                  m = _mm_add_epi32(m, _mm_set1_epi32(1));
+                }
+              }
+              // consume data according to the number of bits set
+              rev_advance_mrp(&magref, (ui32)total_bits);
+            }
+          }
+        }
+      }
+
+      return true;
+    }
+  }
+}
+
+#endif
diff --git a/src/core/coding/ojph_block_decoder_ssse3.cpp b/src/core/coding/ojph_block_decoder_ssse3.cpp
index 3873fd01..3e3e00e4 100644
--- a/src/core/coding/ojph_block_decoder_ssse3.cpp
+++ b/src/core/coding/ojph_block_decoder_ssse3.cpp
@@ -40,6 +40,9 @@
  *  @brief implements a faster HTJ2K block decoder using ssse3
  */
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
+
 #include <string>
 #include <iostream>
 
@@ -47,7 +50,6 @@
 #include <cstring>
 #include "ojph_block_common.h"
 #include "ojph_block_decoder.h"
-#include "ojph_arch.h"
 #include "ojph_message.h"
 
 #include <immintrin.h>
@@ -66,7 +68,7 @@ namespace ojph {
       dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
         k(0), num_runs(0), runs(0)
       {}
-      // data decoding machinary
+      // data decoding machinery
       ui8* data;    //!<the address of data (or bitstream)
       ui64 tmp;     //!<temporary buffer for read data
       int bits;     //!<number of bits stored in tmp
@@ -507,7 +509,7 @@ namespace ojph {
      *         an architecture that read size must be compatible with the
      *         alignment of the read address
      *
-     *  There is another simiar subroutine rev_init.  This subroutine does 
+     *  There is another similar subroutine rev_init.  This subroutine does 
      *  NOT skip the first 12 bits, and starts with unstuff set to true.
      *
      *  @param [in]  mrp is a pointer to rev_struct structure
@@ -579,7 +581,7 @@ namespace ojph {
     /** @brief State structure for reading and unstuffing of forward-growing 
      *         bitstreams; these are: MagSgn and SPP bitstreams
      */
-    struct frwd_struct {
+    struct frwd_struct_ssse3 {
       const ui8* data;  //!<pointer to bitstream
       ui8 tmp[48];      //!<temporary buffer of read data + 16 extra
       ui32 bits;        //!<number of bits stored in tmp
@@ -596,18 +598,18 @@ namespace ojph {
      *  X controls this value.
      *
      *  Unstuffing prevent sequences that are more than 0xFF7F from appearing
-     *  in the conpressed sequence.  So whenever a value of 0xFF is coded, the
+     *  in the compressed sequence.  So whenever a value of 0xFF is coded, the
      *  MSB of the next byte is set 0 and must be ignored during decoding.
      *
      *  Reading can go beyond the end of buffer by up to 16 bytes.
      *
      *  @tparam       X is the value fed in when the bitstream is exhausted
-     *  @param  [in]  msp is a pointer to frwd_struct structure
+     *  @param  [in]  msp is a pointer to frwd_struct_ssse3 structure
      *
      */
     template<int X>
     static inline 
-    void frwd_read(frwd_struct *msp)
+    void frwd_read(frwd_struct_ssse3 *msp)
     {
       assert(msp->bits <= 128);
 
@@ -686,17 +688,17 @@ namespace ojph {
     }
 
     //************************************************************************/
-    /** @brief Initialize frwd_struct struct and reads some bytes
+    /** @brief Initialize frwd_struct_ssse3 struct and reads some bytes
      *  
      *  @tparam      X is the value fed in when the bitstream is exhausted.
      *               See frwd_read regarding the template
-     *  @param [in]  msp is a pointer to frwd_struct
+     *  @param [in]  msp is a pointer to frwd_struct_ssse3
      *  @param [in]  data is a pointer to the start of data
      *  @param [in]  size is the number of byte in the bitstream
      */
     template<int X>
     static inline 
-    void frwd_init(frwd_struct *msp, const ui8* data, int size)
+    void frwd_init(frwd_struct_ssse3 *msp, const ui8* data, int size)
     {
       msp->data = data;
       _mm_storeu_si128((__m128i *)msp->tmp, _mm_setzero_si128());
@@ -711,13 +713,13 @@ namespace ojph {
     }
 
     //************************************************************************/
-    /** @brief Consume num_bits bits from the bitstream of frwd_struct
+    /** @brief Consume num_bits bits from the bitstream of frwd_struct_ssse3
      *
-     *  @param [in]  msp is a pointer to frwd_struct
+     *  @param [in]  msp is a pointer to frwd_struct_ssse3
      *  @param [in]  num_bits is the number of bit to consume
      */
     static inline 
-    void frwd_advance(frwd_struct *msp, ui32 num_bits)
+    void frwd_advance(frwd_struct_ssse3 *msp, ui32 num_bits)
     {
       assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
       msp->bits -= num_bits;
@@ -749,15 +751,15 @@ namespace ojph {
     }
 
     //************************************************************************/
-    /** @brief Fetches 32 bits from the frwd_struct bitstream
+    /** @brief Fetches 32 bits from the frwd_struct_ssse3 bitstream
      *
      *  @tparam      X is the value fed in when the bitstream is exhausted.
      *               See frwd_read regarding the template
-     *  @param [in]  msp is a pointer to frwd_struct
+     *  @param [in]  msp is a pointer to frwd_struct_ssse3
      */
     template<int X>
     static inline
-    __m128i frwd_fetch(frwd_struct *msp)
+    __m128i frwd_fetch(frwd_struct_ssse3 *msp)
     {
       if (msp->bits <= 128)
       {
@@ -784,7 +786,7 @@ namespace ojph {
     template <int N>
     static inline 
     __m128i decode_one_quad32(const __m128i inf_u_q, __m128i U_q,
-                              frwd_struct* magsgn, ui32 p, __m128i& vn)
+                              frwd_struct_ssse3* magsgn, ui32 p, __m128i& vn)
     {
       __m128i w0;    // workers
       __m128i insig; // lanes hold FF's if samples are insignificant
@@ -894,7 +896,7 @@ namespace ojph {
      */
     static inline 
     __m128i decode_two_quad16(const __m128i inf_u_q, __m128i U_q, 
-                              frwd_struct* magsgn, ui32 p, __m128i& vn)
+                              frwd_struct_ssse3* magsgn, ui32 p, __m128i& vn)
     {
       __m128i w0;     // workers
       __m128i insig;  // lanes hold FF's if samples are insignificant
@@ -1033,14 +1035,14 @@ namespace ojph {
       {
         OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
                               "one coding pass, but zero length for "
-                              "2nd and potential 3rd pass.\n");
+                              "2nd and potential 3rd pass.");
         num_passes = 1;
       }
 
       if (num_passes > 3)
       {
         OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
-                              "This codeblocks has %d passes.\n",
+                              "This codeblocks has %d passes.",
                               num_passes);
         return false;
       }
@@ -1052,7 +1054,7 @@ namespace ojph {
           insufficient_precision = true;
           OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
                                 "codeblock. This message will not be "
-                                "displayed again.\n");
+                                "displayed again.");
         }
         return false;
       }       
@@ -1063,7 +1065,7 @@ namespace ojph {
           OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
                                 "pass. The code can be modified to support "
                                 "this case. This message will not be "
-                                "displayed again.\n");
+                                "displayed again.");
         }
          return false;         // 32 bits are not enough to decode this
        }
@@ -1076,7 +1078,7 @@ namespace ojph {
             OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
                                   "nor MagRef passes; both will be skipped. "
                                   "This message will not be displayed "
-                                  "again.\n");
+                                  "again.");
           }
         }
       }
@@ -1086,7 +1088,7 @@ namespace ojph {
 
       if (lengths1 < 2)
       {
-        OJPH_WARN(0x00010006, "Wrong codeblock length.\n");
+        OJPH_WARN(0x00010006, "Wrong codeblock length.");
         return false;
       }
 
@@ -1361,7 +1363,7 @@ namespace ojph {
             // quad 0 length
             len = uvlc_entry & 0x7; // quad 0 suffix length
             uvlc_entry >>= 3;
-            ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q
+            ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
             sp[1] = u_q;
             u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
             sp[3] = u_q;
@@ -1389,7 +1391,7 @@ namespace ojph {
         const int v_n_size = 512 + 8;
         ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB
 
-        frwd_struct magsgn;
+        frwd_struct_ssse3 magsgn;
         frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
 
         {
@@ -1540,7 +1542,7 @@ namespace ojph {
         const int v_n_size = 512 + 8;
         ui16 v_n_scratch[2 * v_n_size] = {0}; // 2+ kB
 
-        frwd_struct magsgn;
+        frwd_struct_ssse3 magsgn;
         frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
 
         {
@@ -1725,8 +1727,8 @@ namespace ojph {
               __m128i r = _mm_or_si128(t0, t1);
               r = _mm_shuffle_epi8(r, shuffle_mask);
 
-              // _mm_storeu_si32 is not defined, so we use this workaround
-              _mm_store_ss((float*)dp, _mm_castsi128_ps(r));
+              dp[0] = (ui16)_mm_extract_epi16(r, 0);
+              dp[1] = (ui16)_mm_extract_epi16(r, 1);
             }
             dp[0] = 0; // set an extra entry on the right with 0
           }
@@ -1735,7 +1737,7 @@ namespace ojph {
             ui16* dp = sigma + (y >> 2) * mstr;
             __m128i zero = _mm_setzero_si128();
             for (ui32 x = 0; x < width; x += 32, dp += 8)
-              _mm_store_si128((__m128i*)dp, zero);
+              _mm_storeu_si128((__m128i*)dp, zero);
             dp[0] = 0; // set an extra entry on the right with 0
           }
         }
@@ -1753,7 +1755,7 @@ namespace ojph {
           // We add an extra 8 entries, just in case we need more
           ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
 
-          frwd_struct sigprop;
+          frwd_struct_ssse3 sigprop;
           frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
 
           for (ui32 y = 0; y < height; y += 4)
@@ -2064,3 +2066,5 @@ namespace ojph {
     }
   }
 }
+
+#endif
diff --git a/src/core/coding/ojph_block_decoder_wasm.cpp b/src/core/coding/ojph_block_decoder_wasm.cpp
index 6b30cdb3..42a04b58 100644
--- a/src/core/coding/ojph_block_decoder_wasm.cpp
+++ b/src/core/coding/ojph_block_decoder_wasm.cpp
@@ -74,7 +74,7 @@ namespace ojph {
       dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
         k(0), num_runs(0), runs(0)
       {}
-      // data decoding machinary
+      // data decoding machinery
       ui8* data;    //!<the address of data (or bitstream)
       ui64 tmp;     //!<temporary buffer for read data
       int bits;     //!<number of bits stored in tmp
@@ -515,7 +515,7 @@ namespace ojph {
      *         an architecture that read size must be compatible with the
      *         alignment of the read address
      *
-     *  There is another simiar subroutine rev_init.  This subroutine does 
+     *  There is another similar subroutine rev_init.  This subroutine does 
      *  NOT skip the first 12 bits, and starts with unstuff set to true.
      *
      *  @param [in]  mrp is a pointer to rev_struct structure
diff --git a/src/core/coding/ojph_block_encoder.cpp b/src/core/coding/ojph_block_encoder.cpp
index 2023ef19..019f4f1f 100644
--- a/src/core/coding/ojph_block_encoder.cpp
+++ b/src/core/coding/ojph_block_encoder.cpp
@@ -44,6 +44,7 @@
 #include <cstring>
 #include <cstdint>
 #include <climits>
+#include <mutex>
 
 #include "ojph_mem.h"
 #include "ojph_arch.h"
@@ -65,11 +66,12 @@ namespace ojph {
     static ui16 vlc_tbl1[2048] = { 0 };
 
     //UVLC encoding
-    static int ulvc_cwd_pre[33];
-    static int ulvc_cwd_pre_len[33];
-    static int ulvc_cwd_suf[33];
-    static int ulvc_cwd_suf_len[33];
-
+    const int num_uvlc_entries = 75;
+    struct uvlc_tbl_struct {
+      ui8 pre, pre_len, suf, suf_len, ext, ext_len;
+    };
+    static uvlc_tbl_struct uvlc_tbl[num_uvlc_entries];
+    
     /////////////////////////////////////////////////////////////////////////
     static bool vlc_init_tables()
     {
@@ -194,29 +196,76 @@ namespace ojph {
     static bool uvlc_init_tables()
     {
       //code goes from 0 to 31, extension and 32 are not supported here
-      ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2;
-      ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4;
-      ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1;
-      ulvc_cwd_pre_len[2] = 2;
-      ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3;
-      ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0;
-      ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1;
-      ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0;
-      ulvc_cwd_suf_len[2] = 0;
-      ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1;
+      uvlc_tbl[0].pre = 0;       // entry 0: empty codeword, all lengths 0
+      uvlc_tbl[0].pre_len = 0;
+      uvlc_tbl[0].suf = 0;
+      uvlc_tbl[0].suf_len = 0;
+      uvlc_tbl[0].ext = 0;
+      uvlc_tbl[0].ext_len = 0;
+
+      uvlc_tbl[1].pre = 1;       // entry 1: 1-bit prefix only
+      uvlc_tbl[1].pre_len = 1;
+      uvlc_tbl[1].suf = 0;
+      uvlc_tbl[1].suf_len = 0;
+      uvlc_tbl[1].ext = 0;
+      uvlc_tbl[1].ext_len = 0;
+
+      uvlc_tbl[2].pre = 2;       // entry 2: 2-bit prefix only
+      uvlc_tbl[2].pre_len = 2;
+      uvlc_tbl[2].suf = 0;
+      uvlc_tbl[2].suf_len = 0;
+      uvlc_tbl[2].ext = 0;
+      uvlc_tbl[2].ext_len = 0;
+
+      uvlc_tbl[3].pre = 4;       // entries 3 and 4: 3-bit prefix + 1-bit suffix
+      uvlc_tbl[3].pre_len = 3;
+      uvlc_tbl[3].suf = 0;
+      uvlc_tbl[3].suf_len = 1;
+      uvlc_tbl[3].ext = 0;
+      uvlc_tbl[3].ext_len = 0;
+
+      uvlc_tbl[4].pre = 4;
+      uvlc_tbl[4].pre_len = 3;
+      uvlc_tbl[4].suf = 1;
+      uvlc_tbl[4].suf_len = 1;
+      uvlc_tbl[4].ext = 0;
+      uvlc_tbl[4].ext_len = 0;
+
       for (int i = 5; i < 33; ++i)
       {
-        ulvc_cwd_pre[i] = 0;
-        ulvc_cwd_pre_len[i] = 3;
-        ulvc_cwd_suf[i] = i-5;
-        ulvc_cwd_suf_len[i] = 5;
+        uvlc_tbl[i].pre = 0;     // entries 5..32: 3-bit prefix + 5-bit suffix
+        uvlc_tbl[i].pre_len = 3;
+        uvlc_tbl[i].suf = (ui8)(i - 5);           // suffix values 0..27
+        uvlc_tbl[i].suf_len = 5;
+        uvlc_tbl[i].ext = 0;     // no extension bits for these entries
+        uvlc_tbl[i].ext_len = 0;
       }
+
+      for (int i = 33; i < num_uvlc_entries; ++i) // entries with extension
+      {
+        uvlc_tbl[i].pre = 0;     // same 3-bit prefix as entries 5..32
+        uvlc_tbl[i].pre_len = 3;
+        uvlc_tbl[i].suf = (ui8)(28 + (i - 33) % 4);  // suffix values 28..31
+        uvlc_tbl[i].suf_len = 5;
+        uvlc_tbl[i].ext = (ui8)((i - 33) / 4);    // advances every 4 entries
+        uvlc_tbl[i].ext_len = 4; // 4-bit extension follows the suffix
+      }
+
       return true;
     }
 
     /////////////////////////////////////////////////////////////////////////
-    static bool vlc_tables_initialized = vlc_init_tables();
-    static bool uvlc_tables_initialized = uvlc_init_tables();
+    bool initialize_block_encoder_tables() { // one-time VLC/UVLC table setup
+      static bool tables_initialized = false;  // result of the single init
+      static std::once_flag tables_initialized_flag;
+      std::call_once(tables_initialized_flag, []() { // lambda runs only once
+        memset(vlc_tbl0, 0, 2048 * sizeof(ui16));    // clear before filling
+        memset(vlc_tbl1, 0, 2048 * sizeof(ui16));
+        tables_initialized = vlc_init_tables();
+        tables_initialized = tables_initialized && uvlc_init_tables();
+      });
+      return tables_initialized; // true only if both init routines succeeded
+    }
 
     /////////////////////////////////////////////////////////////////////////
     //
@@ -440,6 +489,29 @@ namespace ojph {
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline void    // appends the low cwd_len bits of cwd to msp->buf
+    ms_encode64(ms_struct* msp, ui64 cwd, int cwd_len)
+    {
+      while (cwd_len > 0) // LSBs first; each pass fills at most one byte
+      {
+        if (msp->pos >= msp->buf_size)     // no room left for another byte
+          OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full");
+        int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len);
+        msp->tmp |= (ui32)((cwd & ((1ULL << t) - 1)) << msp->used_bits);
+        msp->used_bits += t;  // t is the bit count that fit in this byte
+        cwd >>= t;            // drop the bits just placed in tmp
+        cwd_len -= t;
+        if (msp->used_bits >= msp->max_bits)  // current byte is complete
+        { // flush it; a byte equal to 0xFF limits the next one to 7 bits
+          msp->buf[msp->pos++] = (ui8)msp->tmp;
+          msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8;
+          msp->tmp = 0;
+          msp->used_bits = 0;
+        }
+      }
+    }    
+
     //////////////////////////////////////////////////////////////////////////
     static inline void
     ms_terminate(ms_struct* msp)
@@ -467,11 +539,11 @@ namespace ojph {
     //
     //
     //////////////////////////////////////////////////////////////////////////
-    void ojph_encode_codeblock(ui32* buf, ui32 missing_msbs, ui32 num_passes,
-                               ui32 width, ui32 height, ui32 stride,
-                               ui32* lengths,
-                               ojph::mem_elastic_allocator *elastic,
-                               ojph::coded_lists *& coded)
+    void ojph_encode_codeblock32(ui32* buf, ui32 missing_msbs, ui32 num_passes,
+                                 ui32 width, ui32 height, ui32 stride,
+                                 ui32* lengths,
+                                 ojph::mem_elastic_allocator *elastic,
+                                 ojph::coded_lists *& coded)
     {
       assert(num_passes == 1);
       (void)num_passes;                      //currently not used
@@ -693,23 +765,23 @@ namespace ojph {
 
         if (u_q0 > 2 && u_q1 > 2)
         {
-          vlc_encode(&vlc, ulvc_cwd_pre[u_q0-2], ulvc_cwd_pre_len[u_q0-2]);
-          vlc_encode(&vlc, ulvc_cwd_pre[u_q1-2], ulvc_cwd_pre_len[u_q1-2]);
-          vlc_encode(&vlc, ulvc_cwd_suf[u_q0-2], ulvc_cwd_suf_len[u_q0-2]);
-          vlc_encode(&vlc, ulvc_cwd_suf[u_q1-2], ulvc_cwd_suf_len[u_q1-2]);
+          vlc_encode(&vlc, uvlc_tbl[u_q0-2].pre, uvlc_tbl[u_q0-2].pre_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1-2].pre, uvlc_tbl[u_q1-2].pre_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q0-2].suf, uvlc_tbl[u_q0-2].suf_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1-2].suf, uvlc_tbl[u_q1-2].suf_len);
         }
         else if (u_q0 > 2 && u_q1 > 0)
         {
-          vlc_encode(&vlc, ulvc_cwd_pre[u_q0], ulvc_cwd_pre_len[u_q0]);
+          vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len);
           vlc_encode(&vlc, u_q1 - 1, 1);
-          vlc_encode(&vlc, ulvc_cwd_suf[u_q0], ulvc_cwd_suf_len[u_q0]);
+          vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len);
         }
         else
         {
-          vlc_encode(&vlc, ulvc_cwd_pre[u_q0], ulvc_cwd_pre_len[u_q0]);
-          vlc_encode(&vlc, ulvc_cwd_pre[u_q1], ulvc_cwd_pre_len[u_q1]);
-          vlc_encode(&vlc, ulvc_cwd_suf[u_q0], ulvc_cwd_suf_len[u_q0]);
-          vlc_encode(&vlc, ulvc_cwd_suf[u_q1], ulvc_cwd_suf_len[u_q1]);
+          vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len);
         }
 
         //prepare for next iteration
@@ -910,10 +982,514 @@ namespace ojph {
             ms_encode(&ms, s[7] & ((1U<<m)-1), m);
           }
 
-          vlc_encode(&vlc, ulvc_cwd_pre[u_q0], ulvc_cwd_pre_len[u_q0]);
-          vlc_encode(&vlc, ulvc_cwd_pre[u_q1], ulvc_cwd_pre_len[u_q1]);
-          vlc_encode(&vlc, ulvc_cwd_suf[u_q0], ulvc_cwd_suf_len[u_q0]);
-          vlc_encode(&vlc, ulvc_cwd_suf[u_q1], ulvc_cwd_suf_len[u_q1]);
+          vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len);
+
+          //prepare for next iteration
+          c_q0 |= ((rho[1] & 4) >> 1) | ((rho[1] & 8) >> 2);
+          s[0] = s[1] = s[2] = s[3] = s[4] = s[5] = s[6] = s[7] = 0;
+          e_q[0]=e_q[1]=e_q[2]=e_q[3]=e_q[4]=e_q[5]=e_q[6]=e_q[7]=0;
+          rho[0] = rho[1] = 0; e_qmax[0] = e_qmax[1] = 0;
+        }
+      }
+
+
+      terminate_mel_vlc(&mel, &vlc);
+      ms_terminate(&ms);
+
+      //copy to elastic
+      lengths[0] = mel.pos + vlc.pos + ms.pos;
+      elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded);
+      memcpy(coded->buf, ms.buf, ms.pos);
+      memcpy(coded->buf + ms.pos, mel.buf, mel.pos);
+      memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
+
+      // put in the interface locator word
+      ui32 num_bytes = mel.pos + vlc.pos;
+      coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4);
+      coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0;
+      coded->buf[lengths[0]-2] = 
+        (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF));
+
+      coded->avail_size -= lengths[0];
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //
+    //
+    //
+    //////////////////////////////////////////////////////////////////////////
+    void ojph_encode_codeblock64(ui64* buf, ui32 missing_msbs, ui32 num_passes,
+                                 ui32 width, ui32 height, ui32 stride,
+                                 ui32* lengths,
+                                 ojph::mem_elastic_allocator *elastic,
+                                 ojph::coded_lists *& coded)
+    {
+      assert(num_passes == 1);
+      (void)num_passes;                      //currently not used
+      // 38 bits/sample + 1 color + 4 wavelet = 43 bits per sample.
+      // * 4096 samples / 8 bits per byte = 22016; then rounded up to the 
+      // nearest 1 kB, giving 22528.  This is expanded further to take into
+      // consideration stuffing at a max rate of 16 bits per 15 bits 
+      // (1 bit for every 15 bits of data); in reality, it is much smaller
+      // than this.
+      const int ms_size = (22528 * 16 + 14) / 15;  //more than enough
+      ui8 ms_buf[ms_size];
+      // For each quad, we need at most, 7 bits for VLC and 12 bits for UVLC.
+      // So we have 1024 quads * 19 / 8, which is 2432.  This must be 
+      // multiplied by 16 / 15 to accommodate stuffing.  
+      // The mel is at most around 1 bit/quad, giving around 128 byte -- in
+      // practice there was one case where it got to 132 bytes.  Even 
+      // accounting for stuffing, it is smaller than 192.  Therefore,
+      // 3072 is more than enough
+      const int mel_vlc_size = 3072;         //more than enough
+      ui8 mel_vlc_buf[mel_vlc_size];
+      const int mel_size = 192;
+      ui8 *mel_buf = mel_vlc_buf;
+      const int vlc_size = mel_vlc_size - mel_size;
+      ui8 *vlc_buf = mel_vlc_buf + mel_size;
+
+      mel_struct mel;
+      mel_init(&mel, mel_size, mel_buf);
+      vlc_struct vlc;
+      vlc_init(&vlc, vlc_size, vlc_buf);
+      ms_struct ms;
+      ms_init(&ms, ms_size, ms_buf);
+
+      ui32 p = 62 - missing_msbs;
+
+      //e_val: E values for a line (these are the highest set bit)
+      //cx_val: is the context values
+      //Each byte stores the info for the 2 sample. For E, it is maximum
+      // of the two samples, while for cx, it is the OR of these two samples.
+      //The maximum is between the pixel at the bottom left of one quad
+      // and the bottom right of the earlier quad. The same is true for cx.
+      //For a 1024 pixels, we need 512 bytes, the 2 extra,
+      // one for the non-existing earlier quad, and one for beyond
+      // the end
+      ui8 e_val[513];
+      ui8 cx_val[513];
+      ui8* lep = e_val;     lep[0] = 0;
+      ui8* lcxp = cx_val;   lcxp[0] = 0;
+
+      //initial row of quads
+      int e_qmax[2] = {0,0}, e_q[8] = {0,0,0,0,0,0,0,0};
+      int rho[2] = {0,0};
+      int c_q0 = 0;
+      ui64 s[8] = {0,0,0,0,0,0,0,0}, val, t;
+      ui32 y = 0;
+      ui64 *sp = buf;
+      for (ui32 x = 0; x < width; x += 4)
+      {
+        //prepare two quads
+        t = sp[0];
+        val = t + t; //multiply by 2 and get rid of sign
+        val >>= p;  // 2 \mu_p + x
+        val &= ~1ULL; // 2 \mu_p
+        if (val)
+        {
+          rho[0] = 1;
+          e_q[0] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+          e_qmax[0] = e_q[0];
+          s[0] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+        }
+
+        t = height > 1 ? sp[stride] : 0;
+        ++sp;
+        val = t + t; //multiply by 2 and get rid of sign
+        val >>= p; // 2 \mu_p + x
+        val &= ~1ULL;// 2 \mu_p
+        if (val)
+        {
+          rho[0] += 2;
+          e_q[1] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+          e_qmax[0] = ojph_max(e_qmax[0], e_q[1]);
+          s[1] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+        }
+
+        if (x + 1 < width)
+        {
+          t = sp[0];
+          val = t + t; //multiply by 2 and get rid of sign
+          val >>= p; // 2 \mu_p + x
+          val &= ~1ULL;// 2 \mu_p
+          if (val)
+          {
+            rho[0] += 4;
+            e_q[2] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+            e_qmax[0] = ojph_max(e_qmax[0], e_q[2]);
+            s[2] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+          }
+
+          t = height > 1 ? sp[stride] : 0;
+          ++sp;
+          val = t + t; //multiply by 2 and get rid of sign
+          val >>= p; // 2 \mu_p + x
+          val &= ~1ULL;// 2 \mu_p
+          if (val)
+          {
+            rho[0] += 8;
+            e_q[3] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+            e_qmax[0] = ojph_max(e_qmax[0], e_q[3]);
+            s[3] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+          }
+        }
+
+        int Uq0 = ojph_max(e_qmax[0], 1); //kappa_q = 1
+        int u_q0 = Uq0 - 1, u_q1 = 0; //kappa_q = 1
+
+        int eps0 = 0;
+        if (u_q0 > 0)
+        {
+          eps0 |= (e_q[0] == e_qmax[0]);
+          eps0 |= (e_q[1] == e_qmax[0]) << 1;
+          eps0 |= (e_q[2] == e_qmax[0]) << 2;
+          eps0 |= (e_q[3] == e_qmax[0]) << 3;
+        }
+        lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++;
+        lep[0] = (ui8)e_q[3];
+        lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++;
+        lcxp[0] = (ui8)((rho[0] & 8) >> 3);
+
+        ui16 tuple0 = vlc_tbl0[(c_q0 << 8) + (rho[0] << 4) + eps0];
+        vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7);
+
+        if (c_q0 == 0)
+          mel_encode(&mel, rho[0] != 0);
+
+        int m = (rho[0] & 1) ? Uq0 - (tuple0 & 1) : 0;
+        ms_encode64(&ms, s[0] & ((1ULL << m) - 1), m);
+        m = (rho[0] & 2) ? Uq0 - ((tuple0 & 2) >> 1) : 0;
+        ms_encode64(&ms, s[1] & ((1ULL << m) - 1), m);
+        m = (rho[0] & 4) ? Uq0 - ((tuple0 & 4) >> 2) : 0;
+        ms_encode64(&ms, s[2] & ((1ULL << m) - 1), m);
+        m = (rho[0] & 8) ? Uq0 - ((tuple0 & 8) >> 3) : 0;
+        ms_encode64(&ms, s[3] & ((1ULL << m) - 1), m);
+
+        if (x + 2 < width)
+        {
+          t = sp[0];
+          val = t + t; //multiply by 2 and get rid of sign
+          val >>= p; // 2 \mu_p + x
+          val &= ~1ULL;// 2 \mu_p
+          if (val)
+          {
+            rho[1] = 1;
+            e_q[4] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+            e_qmax[1] = e_q[4];
+            s[4] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+          }
+
+          t = height > 1 ? sp[stride] : 0;
+          ++sp;
+          val = t + t; //multiply by 2 and get rid of sign
+          val >>= p; // 2 \mu_p + x
+          val &= ~1ULL;// 2 \mu_p
+          if (val)
+          {
+            rho[1] += 2;
+            e_q[5] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+            e_qmax[1] = ojph_max(e_qmax[1], e_q[5]);
+            s[5] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+          }
+
+          if (x + 3 < width)
+          {
+            t = sp[0];
+            val = t + t; //multiply by 2 and get rid of sign
+            val >>= p; // 2 \mu_p + x
+            val &= ~1ULL;// 2 \mu_p
+            if (val)
+            {
+              rho[1] += 4;
+              e_q[6] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+              e_qmax[1] = ojph_max(e_qmax[1], e_q[6]);
+              s[6] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+            }
+
+            t = height > 1 ? sp[stride] : 0;
+            ++sp;
+            val = t + t; //multiply by 2 and get rid of sign
+            val >>= p; // 2 \mu_p + x
+            val &= ~1ULL;// 2 \mu_p
+            if (val)
+            {
+              rho[1] += 8;
+              e_q[7] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+              e_qmax[1] = ojph_max(e_qmax[1], e_q[7]);
+              s[7] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+            }
+          }
+
+          int c_q1 = (rho[0] >> 1) | (rho[0] & 1);
+          int Uq1 = ojph_max(e_qmax[1], 1); //kappa_q = 1
+          u_q1 = Uq1 - 1; //kappa_q = 1
+
+          int eps1 = 0;
+          if (u_q1 > 0)
+          {
+            eps1 |= (e_q[4] == e_qmax[1]);
+            eps1 |= (e_q[5] == e_qmax[1]) << 1;
+            eps1 |= (e_q[6] == e_qmax[1]) << 2;
+            eps1 |= (e_q[7] == e_qmax[1]) << 3;
+          }
+          lep[0] = ojph_max(lep[0], (ui8)e_q[5]); lep++;
+          lep[0] = (ui8)e_q[7];
+          lcxp[0] |= (ui8)(lcxp[0] | (ui8)((rho[1] & 2) >> 1)); lcxp++;
+          lcxp[0] = (ui8)((rho[1] & 8) >> 3);
+          ui16 tuple1 = vlc_tbl0[(c_q1 << 8) + (rho[1] << 4) + eps1];
+          vlc_encode(&vlc, tuple1 >> 8, (tuple1 >> 4) & 7);
+
+          if (c_q1 == 0)
+            mel_encode(&mel, rho[1] != 0);
+
+          int m = (rho[1] & 1) ? Uq1 - (tuple1 & 1) : 0;
+          ms_encode64(&ms, s[4] & ((1ULL << m) - 1), m);
+          m = (rho[1] & 2) ? Uq1 - ((tuple1 & 2) >> 1) : 0;
+          ms_encode64(&ms, s[5] & ((1ULL << m) - 1), m);
+          m = (rho[1] & 4) ? Uq1 - ((tuple1 & 4) >> 2) : 0;
+          ms_encode64(&ms, s[6] & ((1ULL << m) - 1), m);
+          m = (rho[1] & 8) ? Uq1 - ((tuple1 & 8) >> 3) : 0;
+          ms_encode64(&ms, s[7] & ((1ULL << m) - 1), m);
+        }
+
+        if (u_q0 > 0 && u_q1 > 0)
+          mel_encode(&mel, ojph_min(u_q0, u_q1) > 2);
+
+        if (u_q0 > 2 && u_q1 > 2)
+        {
+          vlc_encode(&vlc, uvlc_tbl[u_q0-2].pre, uvlc_tbl[u_q0-2].pre_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1-2].pre, uvlc_tbl[u_q1-2].pre_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q0-2].suf, uvlc_tbl[u_q0-2].suf_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1-2].suf, uvlc_tbl[u_q1-2].suf_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q0-2].ext, uvlc_tbl[u_q0-2].ext_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1-2].ext, uvlc_tbl[u_q1-2].ext_len);
+        }
+        else if (u_q0 > 2 && u_q1 > 0)
+        {
+          vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len);
+          vlc_encode(&vlc, u_q1 - 1, 1);
+          vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len);
+        }
+        else
+        {
+          vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1].ext, uvlc_tbl[u_q1].ext_len);
+        }
+
+        //prepare for next iteration
+        c_q0 = (rho[1] >> 1) | (rho[1] & 1);
+        s[0] = s[1] = s[2] = s[3] = s[4] = s[5] = s[6] = s[7] = 0;
+        e_q[0]=e_q[1]=e_q[2]=e_q[3]=e_q[4]=e_q[5]=e_q[6]=e_q[7]=0;
+        rho[0] = rho[1] = 0; e_qmax[0] = e_qmax[1] = 0;
+      }
+
+      lep[1] = 0;
+
+      for (y = 2; y < height; y += 2)
+      {
+        lep = e_val;
+        int max_e = ojph_max(lep[0], lep[1]) - 1;
+        lep[0] = 0;
+        lcxp = cx_val;
+        c_q0 = lcxp[0] + (lcxp[1] << 2);
+        lcxp[0] = 0;
+
+        sp = buf + y * stride;
+        for (ui32 x = 0; x < width; x += 4)
+        {
+          //prepare two quads
+          t = sp[0];
+          val = t + t; //multiply by 2 and get rid of sign
+          val >>= p; // 2 \mu_p + x
+          val &= ~1ULL;// 2 \mu_p
+          if (val)
+          {
+            rho[0] = 1;
+            e_q[0] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+            e_qmax[0] = e_q[0];
+            s[0] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+          }
+
+          t = y + 1 < height ? sp[stride] : 0;
+          ++sp;
+          val = t + t; //multiply by 2 and get rid of sign
+          val >>= p; // 2 \mu_p + x
+          val &= ~1ULL;// 2 \mu_p
+          if (val)
+          {
+            rho[0] += 2;
+            e_q[1] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+            e_qmax[0] = ojph_max(e_qmax[0], e_q[1]);
+            s[1] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+          }
+
+          if (x + 1 < width)
+          {
+            t = sp[0];
+            val = t + t; //multiply by 2 and get rid of sign
+            val >>= p; // 2 \mu_p + x
+            val &= ~1ULL;// 2 \mu_p
+            if (val)
+            {
+              rho[0] += 4;
+              e_q[2] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+              e_qmax[0] = ojph_max(e_qmax[0], e_q[2]);
+              s[2] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+            }
+
+            t = y + 1 < height ? sp[stride] : 0;
+            ++sp;
+            val = t + t; //multiply by 2 and get rid of sign
+            val >>= p; // 2 \mu_p + x
+            val &= ~1ULL;// 2 \mu_p
+            if (val)
+            {
+              rho[0] += 8;
+              e_q[3] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+              e_qmax[0] = ojph_max(e_qmax[0], e_q[3]);
+              s[3] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+            }
+          }
+
+          int kappa = (rho[0] & (rho[0]-1)) ? ojph_max(1,max_e) : 1;
+          int Uq0 = ojph_max(e_qmax[0], kappa);
+          int u_q0 = Uq0 - kappa, u_q1 = 0;
+
+          int eps0 = 0;
+          if (u_q0 > 0)
+          {
+            eps0 |= (e_q[0] == e_qmax[0]);
+            eps0 |= (e_q[1] == e_qmax[0]) << 1;
+            eps0 |= (e_q[2] == e_qmax[0]) << 2;
+            eps0 |= (e_q[3] == e_qmax[0]) << 3;
+          }
+          lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++;
+          max_e = ojph_max(lep[0], lep[1]) - 1;
+          lep[0] = (ui8)e_q[3];
+          lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++;
+          int c_q1 = lcxp[0] + (lcxp[1] << 2);
+          lcxp[0] = (ui8)((rho[0] & 8) >> 3);
+          ui16 tuple0 = vlc_tbl1[(c_q0 << 8) + (rho[0] << 4) + eps0];
+          vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7);
+
+          if (c_q0 == 0)
+              mel_encode(&mel, rho[0] != 0);
+
+          int m = (rho[0] & 1) ? Uq0 - (tuple0 & 1) : 0;
+          ms_encode64(&ms, s[0] & ((1ULL << m) - 1), m);
+          m = (rho[0] & 2) ? Uq0 - ((tuple0 & 2) >> 1) : 0;
+          ms_encode64(&ms, s[1] & ((1ULL << m) - 1), m);
+          m = (rho[0] & 4) ? Uq0 - ((tuple0 & 4) >> 2) : 0;
+          ms_encode64(&ms, s[2] & ((1ULL << m) - 1), m);
+          m = (rho[0] & 8) ? Uq0 - ((tuple0 & 8) >> 3) : 0;
+          ms_encode64(&ms, s[3] & ((1ULL << m) - 1), m);
+
+          if (x + 2 < width)
+          {
+            t = sp[0];
+            val = t + t; //multiply by 2 and get rid of sign
+            val >>= p; // 2 \mu_p + x
+            val &= ~1ULL;// 2 \mu_p
+            if (val)
+            {
+              rho[1] = 1;
+              e_q[4] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+              e_qmax[1] = e_q[4];
+              s[4] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+            }
+
+            t = y + 1 < height ? sp[stride] : 0;
+            ++sp;
+            val = t + t; //multiply by 2 and get rid of sign
+            val >>= p; // 2 \mu_p + x
+            val &= ~1ULL;// 2 \mu_p
+            if (val)
+            {
+              rho[1] += 2;
+              e_q[5] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+              e_qmax[1] = ojph_max(e_qmax[1], e_q[5]);
+              s[5] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+            }
+
+            if (x + 3 < width)
+            {
+              t = sp[0];
+              val = t + t; //multiply by 2 and get rid of sign
+              val >>= p; // 2 \mu_p + x
+              val &= ~1ULL;// 2 \mu_p
+              if (val)
+              {
+                rho[1] += 4;
+                e_q[6] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+                e_qmax[1] = ojph_max(e_qmax[1], e_q[6]);
+                s[6] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+              }
+
+              t = y + 1 < height ? sp[stride] : 0;
+              ++sp;
+              val = t + t; //multiply by 2 and get rid of sign
+              val >>= p; // 2 \mu_p + x
+              val &= ~1ULL;// 2 \mu_p
+              if (val)
+              {
+                rho[1] += 8;
+                e_q[7] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1
+                e_qmax[1] = ojph_max(e_qmax[1], e_q[7]);
+                s[7] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n
+              }
+            }
+
+            kappa = (rho[1] & (rho[1]-1)) ? ojph_max(1,max_e) : 1;
+            c_q1 |= ((rho[0] & 4) >> 1) | ((rho[0] & 8) >> 2);
+            int Uq1 = ojph_max(e_qmax[1], kappa);
+            u_q1 = Uq1 - kappa;
+
+            int eps1 = 0;
+            if (u_q1 > 0)
+            {
+              eps1 |= (e_q[4] == e_qmax[1]);
+              eps1 |= (e_q[5] == e_qmax[1]) << 1;
+              eps1 |= (e_q[6] == e_qmax[1]) << 2;
+              eps1 |= (e_q[7] == e_qmax[1]) << 3;
+            }
+            lep[0] = ojph_max(lep[0], (ui8)e_q[5]); lep++;
+            max_e = ojph_max(lep[0], lep[1]) - 1;
+            lep[0] = (ui8)e_q[7];
+            lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[1] & 2) >> 1)); lcxp++;
+            c_q0 = lcxp[0] + (lcxp[1] << 2);
+            lcxp[0] = (ui8)((rho[1] & 8) >> 3);
+            ui16 tuple1 = vlc_tbl1[(c_q1 << 8) + (rho[1] << 4) + eps1];
+            vlc_encode(&vlc, tuple1 >> 8, (tuple1 >> 4) & 7);
+
+            if (c_q1 == 0)
+              mel_encode(&mel, rho[1] != 0);
+
+            int m = (rho[1] & 1) ? Uq1 - (tuple1 & 1) : 0;
+            ms_encode64(&ms, s[4] & ((1ULL << m) - 1), m);
+            m = (rho[1] & 2) ? Uq1 - ((tuple1 & 2) >> 1) : 0;
+            ms_encode64(&ms, s[5] & ((1ULL << m) - 1), m);
+            m = (rho[1] & 4) ? Uq1 - ((tuple1 & 4) >> 2) : 0;
+            ms_encode64(&ms, s[6] & ((1ULL << m) - 1), m);
+            m = (rho[1] & 8) ? Uq1 - ((tuple1 & 8) >> 3) : 0;
+            ms_encode64(&ms, s[7] & ((1ULL << m) - 1), m);
+          }
+
+          vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1].ext, uvlc_tbl[u_q1].ext_len);
 
           //prepare for next iteration
           c_q0 |= ((rho[1] & 4) >> 1) | ((rho[1] & 8) >> 2);
diff --git a/src/core/coding/ojph_block_encoder.h b/src/core/coding/ojph_block_encoder.h
index 0c4b9267..c0af8927 100644
--- a/src/core/coding/ojph_block_encoder.h
+++ b/src/core/coding/ojph_block_encoder.h
@@ -52,11 +52,25 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void
-      ojph_encode_codeblock(ui32* buf, ui32 missing_msbs, ui32 num_passes,
-                            ui32 width, ui32 height, ui32 stride,
-                            ui32* lengths, 
-                            ojph::mem_elastic_allocator *elastic,
-                            ojph::coded_lists *& coded);
+      ojph_encode_codeblock32(ui32* buf, ui32 missing_msbs, ui32 num_passes,
+                              ui32 width, ui32 height, ui32 stride,
+                              ui32* lengths, 
+                              ojph::mem_elastic_allocator *elastic,
+                              ojph::coded_lists *& coded);
+
+    void
+      ojph_encode_codeblock64(ui64* buf, ui32 missing_msbs, ui32 num_passes,
+                              ui32 width, ui32 height, ui32 stride,
+                              ui32* lengths, 
+                              ojph::mem_elastic_allocator *elastic,
+                              ojph::coded_lists *& coded);
+
+    void
+      ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs,
+                                 ui32 num_passes, ui32 width, ui32 height,
+                                 ui32 stride, ui32* lengths,
+                                 ojph::mem_elastic_allocator* elastic,
+                                 ojph::coded_lists*& coded);
 
     void
       ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs, 
@@ -64,6 +78,10 @@ namespace ojph {
                                    ui32 stride, ui32* lengths,
                                    ojph::mem_elastic_allocator *elastic,
                                    ojph::coded_lists *& coded);
+
+    bool initialize_block_encoder_tables();
+    bool initialize_block_encoder_tables_avx2();
+    bool initialize_block_encoder_tables_avx512();
   }
 }
 
diff --git a/src/core/coding/ojph_block_encoder_avx2.cpp b/src/core/coding/ojph_block_encoder_avx2.cpp
new file mode 100644
index 00000000..91d0195d
--- /dev/null
+++ b/src/core/coding/ojph_block_encoder_avx2.cpp
@@ -0,0 +1,1229 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2019, Aous Naman
+// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2019, The University of New South Wales, Australia
+// Copyright (c) 2024, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_block_encoder_avx2.cpp
+//***************************************************************************/
+
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
+
+#include <cassert>
+#include <cstring>
+#include <cstdint>
+#include <climits>
+#include <immintrin.h>
+#include <mutex>
+
+#include "ojph_mem.h"
+#include "ojph_arch.h"
+#include "ojph_block_encoder.h"
+#include "ojph_message.h"
+
+#ifdef OJPH_COMPILER_MSVC
+  #define likely(x)       (x)
+  #define unlikely(x)     (x)
+#else
+  #define likely(x)       __builtin_expect((x), 1)
+  #define unlikely(x)     __builtin_expect((x), 0)
+#endif
+
+namespace ojph {
+  namespace local {
+
+    /////////////////////////////////////////////////////////////////////////
+    // tables
+    /////////////////////////////////////////////////////////////////////////
+
+    //VLC encoding
+    // index is (c_q << 8) + (rho << 4) + eps
+    // data is  (cwd << 8) + (cwd_len << 4) + e_k
+    // table 0 is for the initial line of quads
+    static ui32 vlc_tbl0[2048];
+    static ui32 vlc_tbl1[2048];
+
+    //UVLC encoding
+    static ui32 ulvc_cwd_pre[33];
+    static int ulvc_cwd_pre_len[33];
+    static ui32 ulvc_cwd_suf[33];
+    static int ulvc_cwd_suf_len[33];
+
+    /////////////////////////////////////////////////////////////////////////
+    static bool vlc_init_tables() // fills vlc_tbl0 and vlc_tbl1; returns true
+    {
+      struct vlc_src_table { int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
+      vlc_src_table tbl0[] = {
+    #include "table0.h"
+      };
+      size_t tbl0_size = sizeof(tbl0) / sizeof(vlc_src_table); // row count
+
+      si32 pattern_popcnt[16]; // number of set bits in each 4-bit pattern
+      for (ui32 i = 0; i < 16; ++i)
+        pattern_popcnt[i] = (si32)population_count(i);
+
+      vlc_src_table* src_tbl = tbl0;
+      ui32 *tgt_tbl = vlc_tbl0;
+      size_t tbl_size = tbl0_size;
+      for (int i = 0; i < 2048; ++i) // i = (c_q << 8) + (rho << 4) + emb
+      {
+        int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
+        if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
+          tgt_tbl[i] = 0; // emb has bits outside rho, or c_q == rho == 0
+        else
+        {
+          vlc_src_table *best_entry = NULL;
+          if (emb) // u_off = 1
+          {
+            int best_e_k = -1; // highest popcount of e_k seen so far
+            for (size_t j = 0; j < tbl_size; ++j)
+            {
+              if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
+                if (src_tbl[j].u_off == 1)
+                  if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
+                  {
+                    //keep the matching row whose e_k has the most bits set;
+                    // on a tie the later row wins (note the >= below)
+                    int ones_count = pattern_popcnt[src_tbl[j].e_k];
+                    if (ones_count >= best_e_k)
+                    {
+                      best_entry = src_tbl + j;
+                      best_e_k = ones_count;
+                    }
+                  }
+            }
+          }
+          else // u_off = 0
+          {
+            for (size_t j = 0; j < tbl_size; ++j)
+            {
+              if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
+                if (src_tbl[j].u_off == 0)
+                {
+                  best_entry = src_tbl + j;
+                  break; // the first matching row is used
+                }
+            }
+          }
+          assert(best_entry); // debug builds only; dereferenced just below
+          tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
+                             + best_entry->e_k);
+        }
+      }
+
+      vlc_src_table tbl1[] = {
+    #include "table1.h"
+      };
+      size_t tbl1_size = sizeof(tbl1) / sizeof(vlc_src_table); // row count
+
+      src_tbl = tbl1; // same construction again, from tbl1 into vlc_tbl1
+      tgt_tbl = vlc_tbl1;
+      tbl_size = tbl1_size;
+      for (int i = 0; i < 2048; ++i) // i = (c_q << 8) + (rho << 4) + emb
+      {
+        int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
+        if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
+          tgt_tbl[i] = 0; // emb has bits outside rho, or c_q == rho == 0
+        else
+        {
+          vlc_src_table *best_entry = NULL;
+          if (emb) // u_off = 1
+          {
+            int best_e_k = -1; // highest popcount of e_k seen so far
+            for (size_t j = 0; j < tbl_size; ++j)
+            {
+              if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
+                if (src_tbl[j].u_off == 1)
+                  if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
+                  {
+                    //keep the matching row whose e_k has the most bits set;
+                    // on a tie the later row wins (note the >= below)
+                    int ones_count = pattern_popcnt[src_tbl[j].e_k];
+                    if (ones_count >= best_e_k)
+                    {
+                      best_entry = src_tbl + j;
+                      best_e_k = ones_count;
+                    }
+                  }
+            }
+          }
+          else // u_off = 0
+          {
+            for (size_t j = 0; j < tbl_size; ++j)
+            {
+              if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
+                if (src_tbl[j].u_off == 0)
+                {
+                  best_entry = src_tbl + j;
+                  break; // the first matching row is used
+                }
+            }
+          }
+          assert(best_entry); // debug builds only; dereferenced just below
+          tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
+                             + best_entry->e_k);
+        }
+      }
+
+
+      return true;
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static bool uvlc_init_tables() //codes 0..31; extension and 32 unsupported
+    {
+      static const ui32 head_pre[5] = { 0, 1, 2, 4, 4 };
+      static const int head_pre_len[5] = { 0, 1, 2, 3, 3 };
+      static const ui32 head_suf[5] = { 0, 0, 0, 0, 1 };
+      static const int head_suf_len[5] = { 0, 0, 0, 1, 1 };
+      for (int k = 0; k < 5; ++k) //entries 0..4 are irregular
+      {
+        ulvc_cwd_pre[k] = head_pre[k];
+        ulvc_cwd_pre_len[k] = head_pre_len[k];
+        ulvc_cwd_suf[k] = head_suf[k];
+        ulvc_cwd_suf_len[k] = head_suf_len[k];
+      }
+      for (int k = 5; k < 33; ++k) //3-bit zero prefix, 5-bit suffix k - 5
+      {
+        ulvc_cwd_pre[k] = 0;
+        ulvc_cwd_pre_len[k] = 3;
+        ulvc_cwd_suf[k] = (ui32)(k - 5);
+        ulvc_cwd_suf_len[k] = 5;
+      }
+      return true;
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    bool initialize_block_encoder_tables_avx2() {
+      static bool init_ok = false;       // outcome of the one-time build
+      static std::once_flag build_guard;
+      // call_once runs the builder a single time, however many callers race
+      std::call_once(build_guard, []() {
+        memset(vlc_tbl0, 0, sizeof(vlc_tbl0));
+        memset(vlc_tbl1, 0, sizeof(vlc_tbl1));
+        init_ok = vlc_init_tables() && uvlc_init_tables();
+      });
+      return init_ok;
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    //
+    /////////////////////////////////////////////////////////////////////////
+    struct mel_struct { // state of the MEL bit writer
+      //storage
+      ui8* buf;      //pointer to data buffer
+      ui32 pos;      //position of next writing within buf
+      ui32 buf_size; //size of buffer, which we must not exceed
+
+      // all these can be replaced by bytes
+      int remaining_bits; //number of empty bits in tmp
+      int tmp;            //temporary storage of coded bits
+      int run;            //length of the current run of 0 symbols
+      int k;              //state; indexes mel_exp in mel_encode
+      int threshold;      //run length at which a bit is coded (1<<mel_exp[k])
+    };
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline void
+    mel_init(mel_struct* melp, ui32 buffer_size, ui8* data)
+    {
+      mel_struct& m = *melp;
+      m.buf = data;          //bytes are written from the start of data
+      m.buf_size = buffer_size;
+      m.pos = 0;
+      m.tmp = 0;             //no coded bits yet
+      m.remaining_bits = 8;  //a whole byte is free
+      m.run = m.k = 0;       //no run in progress, initial state
+      m.threshold = 1;       //equals 1 << mel_exp[m.k], see mel_encode
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline void
+    mel_emit_bit(mel_struct* melp, int v)
+    {
+      melp->tmp = (melp->tmp << 1) + v;
+      if (--melp->remaining_bits != 0)
+        return; // byte not complete yet
+      // NOTE(review): pos is not checked against buf_size before this write
+      melp->buf[melp->pos++] = (ui8)melp->tmp;
+      melp->remaining_bits = (melp->tmp == 0xFF) ? 7 : 8; // 7 bits after 0xFF
+      melp->tmp = 0;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline void
+    mel_encode(mel_struct* melp, bool bit)
+    {
+      //exponent of the run-length threshold for each of the 13 states
+      static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5};
+
+      if (bit) {
+        //a 1 ends the run: emit 0, then the run length in mel_exp[k] bits,
+        // most significant bit first, and step the state down
+        mel_emit_bit(melp, 0);
+        for (int shift = mel_exp[melp->k] - 1; shift >= 0; --shift)
+          mel_emit_bit(melp, (melp->run >> shift) & 1);
+        melp->k = ojph_max(0, melp->k - 1);
+      } else {
+        //a 0 extends the run; only a run that reaches the threshold emits
+        // a 1 and steps the state up
+        if (++melp->run < melp->threshold)
+          return;
+        mel_emit_bit(melp, 1);
+        melp->k = ojph_min(12, melp->k + 1);
+      }
+
+      melp->run = 0; //both emitting paths restart the run
+      melp->threshold = 1 << mel_exp[melp->k];
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    //
+    /////////////////////////////////////////////////////////////////////////
+    struct vlc_struct_avx2 { // state of the VLC bit writer
+      //storage
+      ui8* buf;      //pointer to the last byte of the data buffer (vlc_init)
+      ui32 pos;      //next byte is written at buf - pos
+      ui32 buf_size; //size of buffer, which we must not exceed
+
+      int used_bits; //number of occupied bits in tmp
+      ui64 tmp;       //temporary storage of coded bits
+      bool last_greater_than_8F; //true if last byte is greater than 0x8F
+    };
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline void
+    vlc_init(vlc_struct_avx2* vlcp, ui32 buffer_size, ui8* data)
+    {
+      ui8* const last_byte = data + buffer_size - 1; //stream grows backwards
+      *last_byte = 0xFF;
+      vlcp->buf = last_byte;
+      vlcp->buf_size = buffer_size;
+      vlcp->pos = 1;        //the next byte goes to last_byte - 1
+      vlcp->tmp = 0xF;      //four 1 bits are already pending
+      vlcp->used_bits = 4;
+      vlcp->last_greater_than_8F = true; //the byte just written is 0xFF
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline void
+    vlc_encode(vlc_struct_avx2* vlcp, ui32 cwd, int cwd_len) //append cwd_len bits
+    {
+      vlcp->tmp |= (ui64)cwd << vlcp->used_bits; //place cwd above pending bits
+      vlcp->used_bits += cwd_len;
+
+      while (vlcp->used_bits >= 8) { //flush while a whole byte is pending
+          ui8 tmp;
+
+          if (unlikely(vlcp->last_greater_than_8F)) {
+              tmp = vlcp->tmp & 0x7F; //look at the low 7 bits only
+
+              if (likely(tmp != 0x7F)) {
+                  tmp = vlcp->tmp & 0xFF;
+                  *(vlcp->buf - vlcp->pos) = tmp; //NOTE(review): no buf_size check
+                  vlcp->last_greater_than_8F = tmp > 0x8F;
+                  vlcp->tmp >>= 8;
+                  vlcp->used_bits -= 8;
+              } else {
+                  *(vlcp->buf - vlcp->pos) = tmp; //store 0x7F, consume 7 bits
+                  vlcp->last_greater_than_8F = false;
+                  vlcp->tmp >>= 7;
+                  vlcp->used_bits -= 7;
+              }
+
+          } else {
+              tmp = vlcp->tmp & 0xFF; //ordinary case: a full 8-bit byte
+              *(vlcp->buf - vlcp->pos) = tmp;
+              vlcp->last_greater_than_8F = tmp > 0x8F;
+              vlcp->tmp >>= 8;
+              vlcp->used_bits -= 8;
+          }
+
+          vlcp->pos++; //the stream advances towards lower addresses
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    //
+    //////////////////////////////////////////////////////////////////////////
+    static inline void
+    terminate_mel_vlc(mel_struct* melp, vlc_struct_avx2* vlcp) //flush both
+    {
+      if (melp->run > 0) //close a run of zeros that is still open
+        mel_emit_bit(melp, 1);
+
+      if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) {
+        *(vlcp->buf - vlcp->pos) = 0x7f; //same 7-bit case as in vlc_encode
+        vlcp->pos++;
+        vlcp->tmp >>= 7;
+        vlcp->used_bits -= 7;
+      }
+
+      melp->tmp = melp->tmp << melp->remaining_bits; //left-align pending bits
+      int mel_mask = (0xFF << melp->remaining_bits) & 0xFF; //bits MEL uses
+      int vlc_mask = 0xFF >> (8 - vlcp->used_bits); //bits VLC uses
+      if ((mel_mask | vlc_mask) == 0)
+        return;  //last mel byte cannot be 0xFF, since then
+                 //melp->remaining_bits would be < 8
+      if (melp->pos >= melp->buf_size)
+        OJPH_ERROR(0x00020003, "mel encoder's buffer is full");
+      ui8 vlcp_tmp = (ui8)vlcp->tmp;
+      int fuse = melp->tmp | vlcp_tmp; //candidate byte shared by both streams
+      if ( ( ((fuse ^ melp->tmp) & mel_mask)
+           | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0
+          && (fuse != 0xFF) && vlcp->pos > 1)
+      { //sharing alters no used bit of either stream: write one byte only
+        melp->buf[melp->pos++] = (ui8)fuse;
+      }
+      else
+      { //otherwise each stream stores its own last byte
+        if (vlcp->pos >= vlcp->buf_size)
+          OJPH_ERROR(0x00020004, "vlc encoder's buffer is full");
+        melp->buf[melp->pos++] = (ui8)melp->tmp; //melp->tmp cannot be 0xFF
+        *(vlcp->buf - vlcp->pos) = (ui8)vlcp_tmp;
+        vlcp->pos++;
+      }
+    }
+
+/////////////////////////////////////////////////////////////////////////
+//
+/////////////////////////////////////////////////////////////////////////
+    struct ms_struct { // state of the magnitude-sign bit writer
+      //storage
+      ui8* buf;      //pointer to data buffer
+      ui32 pos;      //position of next writing within buf
+      ui32 buf_size; //size of buffer, which we must not exceed
+
+      int max_bits;  //maximum number of bits that can be stored in tmp
+      int used_bits; //number of occupied bits in tmp
+      ui32 tmp;      //temporary storage of coded bits
+    };
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline void
+    ms_init(ms_struct* msp, ui32 buffer_size, ui8* data)
+    {
+      ms_struct& s = *msp;
+      s.buf = data;          //bytes are appended from the start of data
+      s.buf_size = buffer_size;
+      s.tmp = s.pos = 0;     //nothing pending, nothing written
+      s.used_bits = 0;
+      s.max_bits = 8;        //the first byte may take a full 8 bits
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline void
+    ms_encode(ms_struct* msp, ui64 cwd, int cwd_len) //append cwd_len bits
+    {
+      while (cwd_len > 0)
+      {
+        if (msp->pos >= msp->buf_size)
+          OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full");
+        int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len); //bits now
+        msp->tmp |= ((ui32)(cwd & ((1U << t) - 1))) << msp->used_bits;
+        msp->used_bits += t;
+        cwd >>= t; //drop the bits just consumed
+        cwd_len -= t;
+        if (msp->used_bits >= msp->max_bits) //byte complete: store it
+        {
+          msp->buf[msp->pos++] = (ui8)msp->tmp;
+          msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8; //7 bits after 0xFF
+          msp->tmp = 0;
+          msp->used_bits = 0;
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline void
+    ms_terminate(ms_struct* msp)
+    {
+      if (msp->used_bits == 0)
+      { // nothing pending; after a stored 0xFF (max_bits == 7) drop that byte
+        if (msp->max_bits == 7)
+          msp->pos--;
+        return;
+      }
+      const int pad = msp->max_bits - msp->used_bits; // unused bits, set to 1
+      msp->tmp |= (0xFF & ((1U << pad) - 1)) << msp->used_bits;
+      msp->used_bits = msp->max_bits;
+      if (msp->tmp == 0xFF)
+        return; // an all-ones final byte is not stored
+      if (msp->pos >= msp->buf_size)
+        OJPH_ERROR(0x00020006, "magnitude sign encoder's buffer is full");
+      msp->buf[msp->pos++] = (ui8)msp->tmp;
+    }
+
+#define ZERO _mm256_setzero_si256()
+#define ONE  _mm256_set1_epi32(1)
+
+// Leading-zero count of each 32-bit lane (32 for a zero lane), taken from the
+// exponent of a float conversion; see https://stackoverflow.com/a/58827596
+static inline __m256i avx2_lzcnt_epi32(__m256i v) {
+    // prevent value from being rounded up to the next power of two
+    v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v);  // keep 8 MSB
+
+    v = _mm256_castps_si256(_mm256_cvtepi32_ps(v));    // convert an integer to float
+    v = _mm256_srli_epi32(v, 23);                   // shift down the exponent
+    v = _mm256_subs_epu16(_mm256_set1_epi32(158), v);  // undo bias
+    v = _mm256_min_epi16(v, _mm256_set1_epi32(32));    // clamp at 32
+    return v;
+}
+
+static inline __m256i avx2_cmpneq_epi32(__m256i v, __m256i v2) { // v != v2
+    return _mm256_xor_si256(_mm256_cmpeq_epi32(v, v2), _mm256_set1_epi32(-1));
+} // each 32-bit lane is all ones where the inputs differ, zero where equal
+
+static void proc_pixel(__m256i *src_vec, ui32 p, // src_vec: 4 input registers
+                       __m256i *eq_vec, __m256i *s_vec, // out: 4 registers each
+                       __m256i &rho_vec, __m256i &e_qmax_vec) // out: one each
+{
+    __m256i val_vec[4];
+    __m256i _eq_vec[4];
+    __m256i _s_vec[4];
+    __m256i _rho_vec[4];
+
+    for (ui32 i = 0; i < 4; ++i) {
+        /* val = t + t; //multiply by 2 and get rid of sign */
+        val_vec[i] = _mm256_add_epi32(src_vec[i], src_vec[i]);
+
+        /* val >>= p;  // 2 \mu_p + x */
+        val_vec[i] = _mm256_srli_epi32(val_vec[i], (int)p);
+
+        /* val &= ~1u; // 2 \mu_p */
+        val_vec[i] = _mm256_and_si256(val_vec[i], _mm256_set1_epi32((int)~1u));
+
+        /* if (val) { */
+        const __m256i val_notmask = avx2_cmpneq_epi32(val_vec[i], ZERO);
+
+        /*   rho[i] = 1 << i;
+         *   rho is processed below.
+         */
+
+        /*   e_q[i] = 32 - (int)count_leading_zeros(--val); //2\mu_p - 1 */
+        val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
+        _eq_vec[i] = avx2_lzcnt_epi32(val_vec[i]);
+        _eq_vec[i] = _mm256_sub_epi32(_mm256_set1_epi32(32), _eq_vec[i]);
+
+        /*   e_qmax[i] = ojph_max(e_qmax[i], e_q[j]);
+         *   e_qmax is processed below
+         */
+
+        /*   s[i] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */
+        val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
+        _s_vec[i] = _mm256_srli_epi32(src_vec[i], 31);
+        _s_vec[i] = _mm256_add_epi32(_s_vec[i], val_vec[i]);
+
+        _eq_vec[i] = _mm256_and_si256(_eq_vec[i], val_notmask); // 0 if val == 0
+        _s_vec[i] = _mm256_and_si256(_s_vec[i], val_notmask);   // 0 if val == 0
+        val_vec[i] = _mm256_srli_epi32(val_notmask, 31); // 1 where val != 0
+        /* } */
+    }
+
+    const __m256i idx = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); // evens, odds
+
+    /* Reorder from
+     * *_vec[0]:[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7]
+     * *_vec[1]:[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6], [1, 7]
+     * *_vec[2]:[0, 8], [0, 9], [0,10], [0,11], [0,12], [0,13], [0,14], [0,15]
+     * *_vec[3]:[1, 8], [1, 9], [1,10], [1,11], [1,12], [1,13], [1,14], [1,15]
+     * to
+     * *_vec[0]:[0, 0], [0, 2], [0, 4], [0, 6], [0, 8], [0,10], [0,12], [0,14]
+     * *_vec[1]:[1, 0], [1, 2], [1, 4], [1, 6], [1, 8], [1,10], [1,12], [1,14]
+     * *_vec[2]:[0, 1], [0, 3], [0, 5], [0, 7], [0, 9], [0,11], [0,13], [0,15]
+     * *_vec[3]:[1, 1], [1, 3], [1, 5], [1, 7], [1, 9], [1,11], [1,13], [1,15]
+     */
+    __m256i tmp1, tmp2;
+    for (ui32 i = 0; i < 2; ++i) {
+        tmp1 = _mm256_permutevar8x32_epi32(_eq_vec[0 + i], idx);
+        tmp2 = _mm256_permutevar8x32_epi32(_eq_vec[2 + i], idx);
+        eq_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
+        eq_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
+
+        tmp1 = _mm256_permutevar8x32_epi32(_s_vec[0 + i], idx);
+        tmp2 = _mm256_permutevar8x32_epi32(_s_vec[2 + i], idx);
+        s_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
+        s_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
+
+        tmp1 = _mm256_permutevar8x32_epi32(val_vec[0 + i], idx);
+        tmp2 = _mm256_permutevar8x32_epi32(val_vec[2 + i], idx);
+        _rho_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
+        _rho_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
+    }
+
+    e_qmax_vec = _mm256_max_epi32(eq_vec[0], eq_vec[1]); // max over the 4 e_q
+    e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[2]);
+    e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[3]);
+    _rho_vec[1] = _mm256_slli_epi32(_rho_vec[1], 1); // flag k moves to bit k
+    _rho_vec[2] = _mm256_slli_epi32(_rho_vec[2], 2);
+    _rho_vec[3] = _mm256_slli_epi32(_rho_vec[3], 3);
+    rho_vec = _mm256_or_si256(_rho_vec[0], _rho_vec[1]); // 4-bit pattern per lane
+    rho_vec = _mm256_or_si256(rho_vec, _rho_vec[2]);
+    rho_vec = _mm256_or_si256(rho_vec, _rho_vec[3]);
+}
+
+/* In-place transpose of a 4 x 8 block of 32-bit values.
+ *
+ * from matrix[0]: [0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07]
+ *      matrix[1]: [0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17]
+ *      matrix[2]: [0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27]
+ *      matrix[3]: [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37]
+ *
+ * to   matrix[0]: [0x00, 0x10, 0x20, 0x30, 0x01, 0x11, 0x21, 0x31]
+ *      matrix[1]: [0x02, 0x12, 0x22, 0x32, 0x03, 0x13, 0x23, 0x33]
+ *      matrix[2]: [0x04, 0x14, 0x24, 0x34, 0x05, 0x15, 0x25, 0x35]
+ *      matrix[3]: [0x06, 0x16, 0x26, 0x36, 0x07, 0x17, 0x27, 0x37]
+ * so the four registers, read in order, hold the columns one after another.
+ */
+static void rotate_matrix(__m256i *matrix)
+{
+    __m256i tmp1 = _mm256_unpacklo_epi32(matrix[0], matrix[1]); // rows 0/1 mixed
+    __m256i tmp2 = _mm256_unpacklo_epi32(matrix[2], matrix[3]); // rows 2/3 mixed
+    __m256i tmp3 = _mm256_unpackhi_epi32(matrix[0], matrix[1]);
+    __m256i tmp4 = _mm256_unpackhi_epi32(matrix[2], matrix[3]);
+
+    matrix[0] = _mm256_unpacklo_epi64(tmp1, tmp2); // columns 0 | 4
+    matrix[1] = _mm256_unpacklo_epi64(tmp3, tmp4); // columns 2 | 6
+    matrix[2] = _mm256_unpackhi_epi64(tmp1, tmp2); // columns 1 | 5
+    matrix[3] = _mm256_unpackhi_epi64(tmp3, tmp4); // columns 3 | 7
+
+    tmp1 = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x20); // columns 0, 1
+    matrix[2] = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x31); // 4, 5
+    matrix[0] = tmp1;
+
+    tmp1 = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x20); // columns 2, 3
+    matrix[3] = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x31); // 6, 7
+    matrix[1] = tmp1;
+}
+/*
+ * Computes the per-sample codewords of the 8 quads held in the input
+ * vectors and passes them to ms_encode().
+ *
+ * For sample k (0..3) of each quad the codeword length is
+ *   m = (rho & (1 << k)) ? Uq - ((tuple >> k) & 1) : 0
+ * and the codeword is the low m bits of the matching s value. The
+ * codewords of two neighbouring lanes are merged into one 64-bit word,
+ * so ms_encode() is called 16 times per invocation.
+ *
+ * msp:       state handed to ms_encode().
+ * tuple_vec: one tuple per lane; only its low 4 bits are used here.
+ * uq_vec:    Uq value per lane.
+ * rho_vec:   rho value per lane; bit k selects sample k.
+ * s_vec:     4 vectors of s values; rotated in place by this call.
+ */
+static void proc_ms_encode(ms_struct *msp,
+                           __m256i &tuple_vec,
+                           __m256i &uq_vec,
+                           __m256i &rho_vec,
+                           __m256i *s_vec)
+{
+    __m256i m_vec[4];
+
+    /* Prepare parameters for ms_encode */
+    /* m = (rho[i] & 1) ? Uq[i] - ((tuple[i] & 1) >> 0) : 0; */
+    auto tmp = _mm256_and_si256(tuple_vec, ONE);
+    tmp = _mm256_sub_epi32(uq_vec, tmp);
+    auto tmp1 = _mm256_and_si256(rho_vec, ONE);
+    auto mask = avx2_cmpneq_epi32(tmp1, ZERO);
+    m_vec[0] = _mm256_and_si256(mask, tmp);
+
+    /* m = (rho[i] & 2) ? Uq[i] - ((tuple[i] & 2) >> 1) : 0; */
+    tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(2));
+    tmp = _mm256_srli_epi32(tmp, 1);
+    tmp = _mm256_sub_epi32(uq_vec, tmp);
+    tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
+    mask = avx2_cmpneq_epi32(tmp1, ZERO);
+    m_vec[1] = _mm256_and_si256(mask, tmp);
+
+    /* m = (rho[i] & 4) ? Uq[i] - ((tuple[i] & 4) >> 2) : 0; */
+    tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(4));
+    tmp = _mm256_srli_epi32(tmp, 2);
+    tmp = _mm256_sub_epi32(uq_vec, tmp);
+    tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
+    mask = avx2_cmpneq_epi32(tmp1, ZERO);
+    m_vec[2] = _mm256_and_si256(mask, tmp);
+
+    /* m = (rho[i] & 8) ? Uq[i] - ((tuple[i] & 8) >> 3) : 0; */
+    tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(8));
+    tmp = _mm256_srli_epi32(tmp, 3);
+    tmp = _mm256_sub_epi32(uq_vec, tmp);
+    tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
+    mask = avx2_cmpneq_epi32(tmp1, ZERO);
+    m_vec[3] = _mm256_and_si256(mask, tmp);
+
+    rotate_matrix(m_vec);
+    /* s_vec from
+     * s_vec[0]:[0, 0], [0, 2] ... [0,14], [0, 16], [0, 18] ... [0,30]
+     * s_vec[1]:[1, 0], [1, 2] ... [1,14], [1, 16], [1, 18] ... [1,30]
+     * s_vec[2]:[0, 1], [0, 3] ... [0,15], [0, 17], [0, 19] ... [0,31]
+     * s_vec[3]:[1, 1], [1, 3] ... [1,15], [1, 17], [1, 19] ... [1,31]
+     * to
+     * s_vec[0]:[0, 0], [1, 0], [0, 1], [1, 1], [0, 2], [1, 2]...[0, 7], [1, 7]
+     * s_vec[1]:[0, 8], [1, 8], [0, 9], [1, 9], [0,10], [1,10]...[0,15], [1,15]
+     * s_vec[2]:[0,16], [1,16], [0,17], [1,17], [0,18], [1,18]...[0,23], [1,23]
+     * s_vec[3]:[0,24], [1,24], [0,25], [1,25], [0,26], [1,26]...[0,31], [1,31]
+     *
+     * NOTE(review): the column indices above run up to 31, i.e. 16 lanes
+     * per register, while a __m256i has 8 lanes; presumably each register
+     * here covers half of the listed range - confirm against proc_pixel.
+     */
+    rotate_matrix(s_vec);
+
+    ui32 cwd[8];
+    int cwd_len[8];
+    ui64 _cwd = 0;
+    int _cwd_len = 0;
+
+    /* Each iteration handles one rotated register (8 codewords). */
+    for (ui32 i = 0; i < 4; ++i) {
+        /* cwd = s[i * 4 + 0] & ((1U << m) - 1)
+         * cwd_len = m
+         * Both are spilled to arrays so that lanes can be paired below.
+         */
+        _mm256_storeu_si256((__m256i *)cwd_len, m_vec[i]);
+        tmp = _mm256_sllv_epi32(ONE, m_vec[i]);
+        tmp = _mm256_sub_epi32(tmp, ONE);
+        tmp = _mm256_and_si256(tmp, s_vec[i]);
+        _mm256_storeu_si256((__m256i*)cwd, tmp);
+        /* Merge lanes 2j and 2j + 1 into one word: the second codeword
+         * is placed above the cwd_len bits of the first. */
+        for (ui32 j = 0; j < 4; ++j) {
+            ui32 idx = j * 2;
+            _cwd     = cwd[idx];
+            _cwd_len = cwd_len[idx];
+            _cwd     |= ((ui64)cwd[idx + 1]) << _cwd_len;
+            _cwd_len += cwd_len[idx + 1];
+            ms_encode(msp, _cwd, _cwd_len);
+        }
+    }
+}
+
+static __m256i cal_eps_vec(__m256i *eq_vec, __m256i &u_q_vec,
+                           __m256i &e_qmax_vec)
+{
+    /* Builds the 4-bit eps value of every lane: bit k is set when
+     * eq_vec[k] equals e_qmax_vec in that lane, and the whole value is
+     * cleared for lanes whose u_q is not positive. Scalar equivalent:
+     *
+     * if (u_q[i] > 0) {
+     *     eps[i] |= (e_q[i * 4 + 0] == e_qmax[i]);
+     *     eps[i] |= (e_q[i * 4 + 1] == e_qmax[i]) << 1;
+     *     eps[i] |= (e_q[i * 4 + 2] == e_qmax[i]) << 2;
+     *     eps[i] |= (e_q[i * 4 + 3] == e_qmax[i]) << 3;
+     * }
+     */
+    auto u_q_mask = _mm256_cmpgt_epi32(u_q_vec, ZERO);
+    /* cmpeq yields all-ones on a match; >> 31 turns that into 0 or 1. */
+    auto mask = _mm256_cmpeq_epi32(eq_vec[0], e_qmax_vec);
+    auto eps_vec = _mm256_srli_epi32(mask, 31);
+
+    mask = _mm256_cmpeq_epi32(eq_vec[1], e_qmax_vec);
+    auto tmp = _mm256_srli_epi32(mask, 31);
+    tmp = _mm256_slli_epi32(tmp, 1);
+    eps_vec = _mm256_or_si256(eps_vec, tmp);
+
+    mask = _mm256_cmpeq_epi32(eq_vec[2], e_qmax_vec);
+    tmp = _mm256_srli_epi32(mask, 31);
+    tmp = _mm256_slli_epi32(tmp, 2);
+    eps_vec = _mm256_or_si256(eps_vec, tmp);
+
+    mask = _mm256_cmpeq_epi32(eq_vec[3], e_qmax_vec);
+    tmp = _mm256_srli_epi32(mask, 31);
+    tmp = _mm256_slli_epi32(tmp, 3);
+    eps_vec = _mm256_or_si256(eps_vec, tmp);
+    /* u_q_mask is all-ones or zero per lane, so this keeps or clears eps. */
+    return  _mm256_and_si256(u_q_mask, eps_vec);
+}
+
+static void update_lep(ui32 x, __m256i &prev_e_val_vec,
+                       __m256i *eq_vec, __m256i *e_val_vec,
+                       const __m256i left_shift)
+{
+    /* Stores into e_val_vec[x] the lane-wise maximum of eq_vec[1] and
+     * eq_vec[3] permuted by left_shift, with lane 0 of the permuted
+     * vector taken from lane 0 of prev_e_val_vec. prev_e_val_vec is then
+     * replaced by a vector carrying lane 7 of eq_vec[3] in lane 0 (other
+     * lanes zero) for the next call. With the left_shift vector built in
+     * ojph_encode_codeblock_avx2, lane i receives lane i - 1.
+     * Scalar equivalent:
+     *
+     * lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++;
+     * lep[0] = (ui8)e_q[3];
+     * Compare e_q[1] with e_q[3] of the previous round.
+     */
+    auto tmp = _mm256_permutevar8x32_epi32(eq_vec[3], left_shift);
+    tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_e_val_vec)), 0);
+    prev_e_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(eq_vec[3], 7), 0);
+    e_val_vec[x] = _mm256_max_epi32(eq_vec[1], tmp);
+}
+
+
+static void update_lcxp(ui32 x, __m256i &prev_cx_val_vec,
+                        __m256i &rho_vec, __m256i *cx_val_vec,
+                        const __m256i left_shift)
+{
+    /* Stores into cx_val_vec[x], per lane, bit 1 of rho OR-ed with bit 3
+     * of rho permuted by left_shift, where lane 0 of the permuted vector
+     * comes from lane 0 of prev_cx_val_vec. prev_cx_val_vec is then
+     * replaced by a vector carrying lane 7 of rho_vec in lane 0 (other
+     * lanes zero) for the next call. Scalar equivalent:
+     *
+     * lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++;
+     * lcxp[0] = (ui8)((rho[0] & 8) >> 3);
+     * Or (rho[0] & 2) and (rho[0] of the previous round & 8).
+     */
+    auto tmp = _mm256_permutevar8x32_epi32(rho_vec, left_shift);
+    tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_cx_val_vec)), 0);
+    prev_cx_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(rho_vec, 7), 0);
+
+    tmp = _mm256_and_si256(tmp, _mm256_set1_epi32(8));
+    tmp = _mm256_srli_epi32(tmp, 3);
+
+    auto tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
+    tmp1 = _mm256_srli_epi32(tmp1, 1);
+    cx_val_vec[x] = _mm256_or_si256(tmp, tmp1);
+}
+
+static __m256i cal_tuple(__m256i &cq_vec, __m256i &rho_vec,
+                         __m256i &eps_vec, ui32 *vlc_tbl)
+{
+    /* Gathers one 32-bit entry of vlc_tbl per lane (gather scale 4, i.e.
+     * indices count 32-bit words) at index
+     * (c_q << 8) + (rho << 4) + eps. The caller chooses which table is
+     * passed in. Scalar equivalent:
+     *
+     * tuple[i] = vlc_tbl1[(c_q[i] << 8) + (rho[i] << 4) + eps[i]]; */
+    auto tmp = _mm256_slli_epi32(cq_vec, 8);
+    auto tmp1 = _mm256_slli_epi32(rho_vec, 4);
+    tmp = _mm256_add_epi32(tmp, tmp1);
+    tmp = _mm256_add_epi32(tmp, eps_vec);
+    return _mm256_i32gather_epi32((const int *)vlc_tbl, tmp, 4);
+}
+
+static __m256i proc_cq1(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
+                        const __m256i right_shift)
+{
+    ojph_unused(x);
+    ojph_unused(cx_val_vec);
+    ojph_unused(right_shift);
+
+    /* Variant selected by ojph_encode_codeblock_avx2 for the first pair
+     * of rows. It depends on rho only; x, cx_val_vec and right_shift are
+     * accepted solely to match fn_proc_cq. Scalar equivalent:
+     *
+     * c_q[i + 1] = (rho[i] >> 1) | (rho[i] & 1); */
+    auto tmp = _mm256_srli_epi32(rho_vec, 1);
+    auto tmp1 = _mm256_and_si256(rho_vec, ONE);
+    return _mm256_or_si256(tmp, tmp1);
+}
+/*
+ * Variant selected by ojph_encode_codeblock_avx2 once the first pair of
+ * rows is done. Combines the cx_val_vec entries of group x, carrying in
+ * the first lanes of group x + 1, with bits 2 and 3 of rho.
+ *
+ * x:           index of the current group in cx_val_vec.
+ * cx_val_vec:  per-group values; entries x and x + 1 are read.
+ * rho_vec:     rho value per lane.
+ * right_shift: permutation indices; with the vector built by the
+ *              caller, lane i receives lane i + 1 and lane 7 wraps to
+ *              lane 0.
+ * Returns the combined value per lane.
+ */
+static __m256i proc_cq2(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
+                        const __m256i right_shift)
+{
+    // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
+    //            | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
+    auto lcxp1_vec = _mm256_permutevar8x32_epi32(cx_val_vec[x], right_shift);
+    auto tmp = _mm256_permutevar8x32_epi32(lcxp1_vec, right_shift);
+    /* The two top lanes of tmp wrapped around; overwrite them with
+     * lanes 0 and 1 of the next group. */
+#ifdef OJPH_ARCH_X86_64
+    tmp = _mm256_insert_epi64(tmp, 
+      _mm_cvtsi128_si64(_mm256_castsi256_si128(cx_val_vec[x + 1])), 3);
+#elif (defined OJPH_ARCH_I386)
+    int lsb = _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1]));
+    tmp = _mm256_insert_epi32(tmp, lsb, 6);
+    int msb = _mm_extract_epi32(_mm256_castsi256_si128(cx_val_vec[x + 1]), 1);
+    tmp = _mm256_insert_epi32(tmp, msb, 7);
+#else
+    #error Error unsupport compiler
+#endif
+    tmp = _mm256_slli_epi32(tmp, 2);
+    auto tmp1 = _mm256_insert_epi32(lcxp1_vec, 
+      _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1])), 7);
+    tmp = _mm256_add_epi32(tmp1, tmp);
+
+    tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
+    tmp1 = _mm256_srli_epi32(tmp1, 1);
+    tmp = _mm256_or_si256(tmp, tmp1);
+
+    tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
+    tmp1 = _mm256_srli_epi32(tmp1, 2);
+
+    return _mm256_or_si256(tmp, tmp1);
+}
+/* Signature shared by proc_cq1 and proc_cq2 so that the encoder can
+ * switch between them through a function pointer. */
+using fn_proc_cq = __m256i (*)(ui32, __m256i *, __m256i &, const __m256i);
+/*
+ * MEL step initially selected by ojph_encode_codeblock_avx2 (used for
+ * the first pair of rows). Quads are handled in pairs (i, i + 1):
+ *   - for each quad of the pair with c_q == 0: mel_encode(rho != 0);
+ *   - if u_q[i] > 0 and u_q[i + 1] > 0:
+ *     mel_encode(min(u_q[i], u_q[i + 1]) > 2).
+ *
+ * melp:        state handed to mel_encode().
+ * cq_vec:      c_q per lane.
+ * rho_vec:     rho per lane.
+ * u_q_vec:     u_q per lane.
+ * ignore:      the caller passes the number of padding samples of this
+ *              group; only the first 8 - ignore / 2 lanes are visited.
+ * right_shift: permutation indices used to line up u_q[i + 1] with
+ *              u_q[i].
+ */
+static void proc_mel_encode1(mel_struct *melp, __m256i &cq_vec,
+                             __m256i &rho_vec, __m256i u_q_vec, ui32 ignore,
+                             const __m256i right_shift)
+{
+    int32_t mel_need_encode[8];
+    int32_t mel_need_encode2[8];
+    int32_t mel_bit[8];
+    int32_t mel_bit2[8];
+    /* Prepare mel_encode params: the need_encode arrays hold compare
+     * masks (non-zero means "encode"); the bit arrays hold masks shifted
+     * right by 31. avx2_cmpneq_epi32 is defined elsewhere - presumably an
+     * all-ones-on-difference mask, which the shift turns into 0 or 1. */
+    /* if (c_q[i] == 0) { */
+    _mm256_storeu_si256((__m256i *)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO));
+    /*   mel_encode(&mel, rho[i] != 0); */
+    _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
+    /* } */
+
+    /*   mel_encode(&mel, ojph_min(u_q[i], u_q[i + 1]) > 2); */
+    auto tmp = _mm256_permutevar8x32_epi32(u_q_vec, right_shift);
+    auto tmp1 = _mm256_min_epi32(u_q_vec, tmp);
+    _mm256_storeu_si256((__m256i*)mel_bit2, _mm256_srli_epi32(_mm256_cmpgt_epi32(tmp1, _mm256_set1_epi32(2)), 31));
+
+    /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */
+    auto need_encode2 = _mm256_cmpgt_epi32(u_q_vec, ZERO);
+    _mm256_storeu_si256((__m256i*)mel_need_encode2, _mm256_and_si256(need_encode2, _mm256_cmpgt_epi32(tmp, ZERO)));
+    /* Lanes at or beyond i_max are skipped. */
+    ui32 i_max = 8 - (ignore / 2);
+
+    for (ui32 i = 0; i < i_max; i += 2) {
+        if (mel_need_encode[i]) {
+            mel_encode(melp, mel_bit[i]);
+        }
+
+        if (i + 1 < i_max) {
+            if (mel_need_encode[i + 1]) {
+                mel_encode(melp, mel_bit[i + 1]);
+            }
+        }
+
+        if (mel_need_encode2[i]) {
+            mel_encode(melp, mel_bit2[i]);
+        }
+    }
+}
+/*
+ * MEL step selected by ojph_encode_codeblock_avx2 after the first pair
+ * of rows: for every visited lane with c_q == 0 it calls
+ * mel_encode(rho != 0). u_q_vec and right_shift are accepted only to
+ * match fn_proc_mel_encode.
+ *
+ * melp:    state handed to mel_encode().
+ * cq_vec:  c_q per lane.
+ * rho_vec: rho per lane.
+ * ignore:  the caller passes the number of padding samples of this
+ *          group; only the first 8 - ignore / 2 lanes are visited.
+ */
+static void proc_mel_encode2(mel_struct *melp, __m256i &cq_vec,
+                             __m256i &rho_vec, __m256i u_q_vec, ui32 ignore,
+                             const __m256i right_shift)
+{
+    ojph_unused(u_q_vec);
+    ojph_unused(right_shift);
+    int32_t mel_need_encode[8];
+    int32_t mel_bit[8];
+
+    /* Prepare mel_encode params: mel_need_encode holds the compare mask
+     * (non-zero means "encode"), mel_bit the value to encode. */
+    /* if (c_q[i] == 0) { */
+    _mm256_storeu_si256((__m256i*)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO));
+    /*   mel_encode(&mel, rho[i] != 0); */
+    _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
+    /* } */
+
+    ui32 i_max = 8 - (ignore / 2);
+
+    for (ui32 i = 0; i < i_max; ++i) {
+        if (mel_need_encode[i]) {
+            mel_encode(melp, mel_bit[i]);
+        }
+    }
+}
+/* Signature shared by proc_mel_encode1 and proc_mel_encode2 so that the
+ * encoder can switch between them through a function pointer. */
+using fn_proc_mel_encode = void (*)(mel_struct *, __m256i &, __m256i &,
+                                    __m256i, ui32, const __m256i);
+/*
+ * VLC step initially selected by ojph_encode_codeblock_avx2 (used for
+ * the first pair of rows). For every pair of quads (i, i + 1) one word
+ * is assembled, lowest bits first, from
+ *   - the tuple codewords (value tuple >> 4, length tuple & 7), and
+ *   - u_q prefix/suffix entries of the ulvc_cwd_* tables, chosen by one
+ *     of three cases on (u_q[i], u_q[i + 1]),
+ * and handed to vlc_encode() in a single call.
+ *
+ * vlcp:   state handed to vlc_encode().
+ * tuple:  8 values; the caller has already shifted them right by 4.
+ * u_q:    8 u_q values.
+ * ignore: the caller passes the number of padding samples of this
+ *         group; only the first 8 - ignore / 2 quads are visited.
+ *
+ * Note that u_q[i + 1] is read even when quad i + 1 is not visited.
+ */
+static void proc_vlc_encode1(vlc_struct_avx2 *vlcp, ui32 *tuple,
+                             ui32 *u_q, ui32 ignore)
+{
+    ui32 i_max = 8 - (ignore / 2);
+
+    for (ui32 i = 0; i < i_max; i += 2) {
+        /* 7 bits */
+        ui32 val = tuple[i + 0] >> 4;
+        int size = tuple[i + 0] & 7;
+
+        if (i + 1 < i_max) {
+            /* 7 bits */
+            val |= (tuple[i + 1] >> 4) << size;
+            size += tuple[i + 1] & 7;
+        }
+        /* Case 1: both u_q > 2; the tables are indexed with u_q - 2. */
+        if (u_q[i] > 2 && u_q[i + 1] > 2) {
+            /* 3 bits */
+            val |= (ulvc_cwd_pre[u_q[i] - 2]) << size;
+            size += ulvc_cwd_pre_len[u_q[i] - 2];
+
+            /* 3 bits */
+            val |= (ulvc_cwd_pre[u_q[i + 1] - 2]) << size;
+            size += ulvc_cwd_pre_len[u_q[i + 1] - 2];
+
+            /* 5 bits */
+            val |= (ulvc_cwd_suf[u_q[i] - 2]) << size;
+            size += ulvc_cwd_suf_len[u_q[i] - 2];
+
+            /* 5 bits */
+            val |= (ulvc_cwd_suf[u_q[i + 1] - 2]) << size;
+            size += ulvc_cwd_suf_len[u_q[i + 1] - 2];
+            /* Case 2: u_q[i] > 2 and u_q[i + 1] is 1 or 2; the second
+             * quad contributes the single bit u_q[i + 1] - 1. */
+        } else if (u_q[i] > 2 && u_q[i + 1] > 0) {
+            /* 3 bits */
+            val |= (ulvc_cwd_pre[u_q[i]]) << size;
+            size += ulvc_cwd_pre_len[u_q[i]];
+
+            /* 1 bit */
+            val |= (u_q[i + 1] - 1) << size;
+            size += 1;
+
+            /* 5 bits */
+            val |= (ulvc_cwd_suf[u_q[i]]) << size;
+            size += ulvc_cwd_suf_len[u_q[i]];
+            /* Case 3: everything else; tables indexed with u_q itself. */
+        } else {
+            /* 3 bits */
+            val |= (ulvc_cwd_pre[u_q[i]]) << size;
+            size += ulvc_cwd_pre_len[u_q[i]];
+
+            /* 3 bits */
+            val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
+            size += ulvc_cwd_pre_len[u_q[i + 1]];
+
+            /* 5 bits */
+            val |= (ulvc_cwd_suf[u_q[i]]) << size;
+            size += ulvc_cwd_suf_len[u_q[i]];
+
+            /* 5 bits */
+            val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
+            size += ulvc_cwd_suf_len[u_q[i + 1]];
+        }
+
+        vlc_encode(vlcp, val, size);
+    }
+}
+/*
+ * VLC step selected by ojph_encode_codeblock_avx2 after the first pair
+ * of rows. For every pair of quads (i, i + 1) one word is assembled,
+ * lowest bits first, from the tuple codewords (value tuple >> 4, length
+ * tuple & 7), then both u_q prefixes and both u_q suffixes from the
+ * ulvc_cwd_* tables, and handed to vlc_encode() in a single call.
+ *
+ * vlcp:   state handed to vlc_encode().
+ * tuple:  8 values; the caller has already shifted them right by 4.
+ * u_q:    8 u_q values, used directly as table indices.
+ * ignore: the caller passes the number of padding samples of this
+ *         group; only the first 8 - ignore / 2 quads are visited.
+ *
+ * Note that u_q[i + 1] is read even when quad i + 1 is not visited.
+ */
+static void proc_vlc_encode2(vlc_struct_avx2 *vlcp, ui32 *tuple,
+                             ui32 *u_q, ui32 ignore)
+{
+    ui32 i_max = 8 - (ignore / 2);
+
+    for (ui32 i = 0; i < i_max; i += 2) {
+        /* 7 bits */
+        ui32 val = tuple[i + 0] >> 4;
+        int size = tuple[i + 0] & 7;
+
+        if (i + 1 < i_max) {
+            /* 7 bits */
+            val |= (tuple[i + 1] >> 4) << size;
+            size += tuple[i + 1] & 7;
+        }
+
+        /* 3 bits */
+        val |= ulvc_cwd_pre[u_q[i]] << size;
+        size += ulvc_cwd_pre_len[u_q[i]];
+
+        /* 3 bits */
+        val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
+        size += ulvc_cwd_pre_len[u_q[i + 1]];
+
+        /* 5 bits */
+        val |= (ulvc_cwd_suf[u_q[i + 0]]) << size;
+        size += ulvc_cwd_suf_len[u_q[i + 0]];
+
+        /* 5 bits */
+        val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
+        size += ulvc_cwd_suf_len[u_q[i + 1]];
+
+        vlc_encode(vlcp, val, size);
+    }
+}
+/* Signature shared by proc_vlc_encode1 and proc_vlc_encode2 so that the
+ * encoder can switch between them through a function pointer. */
+using fn_proc_vlc_encode = void (*)(vlc_struct_avx2 *, ui32 *, ui32 *, ui32);
+/*
+ * Encodes one codeblock with AVX2, handling two rows and 16 columns per
+ * inner iteration, and stores the result in a buffer obtained from the
+ * elastic allocator.
+ *
+ * buf:          samples; row y starts at buf + y * stride.
+ * missing_msbs: used to derive p = 30 - missing_msbs for proc_pixel.
+ * num_passes:   currently unused.
+ * _width:       number of valid samples per row.
+ * height:       number of rows; an odd last row is paired with zeros.
+ * stride:       distance between consecutive rows, in samples.
+ * lengths:      lengths[0] receives the total number of bytes produced.
+ * elastic:      allocator that provides the output buffer.
+ * coded:        receives the buffer, filled with the ms bytes followed
+ *               by the mel bytes and the vlc bytes.
+ */
+void ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs,
+                                ui32 num_passes, ui32 _width, ui32 height,
+                                ui32 stride, ui32* lengths,
+                                ojph::mem_elastic_allocator *elastic,
+                                ojph::coded_lists *& coded)
+{
+    ojph_unused(num_passes);                      //currently not used
+    /* Round the width up to a multiple of 16; ignore = padding samples. */
+    ui32 width = (_width + 15) & ~15u;
+    ui32 ignore = width - _width;
+    const int ms_size = (16384 * 16 + 14) / 15; //more than enough
+    const int mel_vlc_size = 3072;              //more than enough
+    const int mel_size = 192;
+    const int vlc_size = mel_vlc_size - mel_size;
+
+    ui8 ms_buf[ms_size];
+    ui8 mel_vlc_buf[mel_vlc_size];
+    ui8 *mel_buf = mel_vlc_buf;                 //first mel_size bytes
+    ui8 *vlc_buf = mel_vlc_buf + mel_size;      //remaining vlc_size bytes
+
+    mel_struct mel;
+    mel_init(&mel, mel_size, mel_buf);
+    vlc_struct_avx2 vlc;
+    vlc_init(&vlc, vlc_size, vlc_buf);
+    ms_struct ms;
+    ms_init(&ms, ms_size, ms_buf);
+
+    const ui32 p = 30 - missing_msbs;
+
+    //e_val: E values for a line (these are the highest set bit)
+    //cx_val: is the context values
+    //Each byte stores the info for the 2 sample. For E, it is maximum
+    // of the two samples, while for cx, it is the OR of these two samples.
+    //The maximum is between the pixel at the bottom left of one quad
+    // and the bottom right of the earlier quad. The same is true for cx.
+    //For a 1024 pixels, we need 512 bytes, the 2 extra,
+    // one for the non-existing earlier quad, and one for beyond
+    // the end
+    const __m256i right_shift = _mm256_set_epi32(
+        0, 7, 6, 5, 4, 3, 2, 1
+    );  // lane i <- lane i + 1, lane 7 <- lane 0
+
+    const __m256i left_shift = _mm256_set_epi32(
+        6, 5, 4, 3, 2, 1, 0, 7
+    );  // lane i <- lane i - 1, lane 0 <- lane 7
+
+    ui32 n_loop = (width + 15) / 16;  // number of 16-column groups
+    /* NOTE(review): both arrays below have 65 entries and index n_loop is
+     * written, so n_loop must not exceed 64 - confirm callers ensure it. */
+    __m256i e_val_vec[65];
+    for (ui32 i = 0; i <ojph_min(64, n_loop); ++i) {
+        e_val_vec[i] = ZERO;
+    }
+    __m256i prev_e_val_vec = ZERO;
+
+    __m256i cx_val_vec[65];
+    __m256i prev_cx_val_vec = ZERO;
+
+    ui32 prev_cq = 0;
+
+    __m256i eq_vec[4];
+    __m256i s_vec[4];
+    __m256i src_vec[4];
+    /* Table and helpers for the first pair of rows; replaced after it. */
+    ui32 *vlc_tbl = vlc_tbl0;
+    fn_proc_cq proc_cq = proc_cq1;
+    fn_proc_mel_encode proc_mel_encode = proc_mel_encode1;
+    fn_proc_vlc_encode proc_vlc_encode = proc_vlc_encode1;
+
+    /* 2 lines per iteration */
+    for (ui32 y = 0; y < height; y += 2)
+    {
+        e_val_vec[n_loop] = prev_e_val_vec;
+        /* lcxp[0] = (ui8)((rho[0] & 8) >> 3); */
+        __m256i tmp = _mm256_and_si256(prev_cx_val_vec, _mm256_set1_epi32(8));
+        cx_val_vec[n_loop] = _mm256_srli_epi32(tmp, 3);
+
+        prev_e_val_vec = ZERO;
+        prev_cx_val_vec = ZERO;
+
+        ui32 *sp = buf + y * stride;
+
+        /* 16 samples (columns) of each of the two rows per iteration */
+        for (ui32 x = 0; x < n_loop; ++x) {
+
+            /* t = sp[i];
+             * A partial last group is staged in a zero-filled copy so
+             * that only the _width % 16 valid samples are read. */
+            if ((x == (n_loop - 1)) && (_width % 16)) {
+                ui32 tmp_buf[16] = { 0 };
+                memcpy(tmp_buf, sp, (_width % 16) * sizeof(ui32));
+                src_vec[0] = _mm256_loadu_si256((__m256i*)(tmp_buf));
+                src_vec[2] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
+                if (y + 1 < height) {
+                    memcpy(tmp_buf, sp + stride, (_width % 16) * sizeof(ui32));
+                    src_vec[1] = _mm256_loadu_si256((__m256i*)(tmp_buf));
+                    src_vec[3] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
+                }
+                else {
+                    src_vec[1] = ZERO;
+                    src_vec[3] = ZERO;
+                }
+            }
+            else {
+                src_vec[0] = _mm256_loadu_si256((__m256i*)(sp));
+                src_vec[2] = _mm256_loadu_si256((__m256i*)(sp + 8));
+
+                if (y + 1 < height) {
+                    src_vec[1] = _mm256_loadu_si256((__m256i*)(sp + stride));
+                    src_vec[3] = _mm256_loadu_si256((__m256i*)(sp + 8 + stride));
+                }
+                else {
+                    src_vec[1] = ZERO;
+                    src_vec[3] = ZERO;
+                }
+                sp += 16;
+            }
+
+            /* src_vec layout:
+             * src_vec[0]:[0, 0],[0, 1],[0, 2],[0, 3],[0, 4],[0, 5],.[0, 6],.[0, 7]
+             * src_vec[1]:[1, 0],[1, 1],[1, 2],[1, 3],[1, 4],[1, 5],.[1, 6],.[1, 7]
+             * src_vec[2]:[0, 8],[0, 9],[0,10],[0,11],[0,12],[0,13],.[0,14], [0,15]
+             * src_vec[3]:[1, 8],[1, 9],[1,10],[1,11],[1,12],[1,13],.[1,14], [1,15]
+             */
+            __m256i rho_vec, e_qmax_vec;
+            proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);
+
+            // max_e[(i + 1) % num] = ojph_max(lep[i + 1], lep[i + 2]) - 1;
+            tmp = _mm256_permutevar8x32_epi32(e_val_vec[x], right_shift);
+            tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(e_val_vec[x + 1])), 7);
+
+            auto max_e_vec = _mm256_max_epi32(tmp, e_val_vec[x]);
+            max_e_vec = _mm256_sub_epi32(max_e_vec, ONE);
+
+            // kappa[i] = (rho[i] & (rho[i] - 1)) ? ojph_max(1, max_e[i]) : 1;
+            tmp = _mm256_max_epi32(max_e_vec, ONE);
+            __m256i tmp1 = _mm256_sub_epi32(rho_vec, ONE);
+            tmp1 = _mm256_and_si256(rho_vec, tmp1);
+
+            auto cmp = _mm256_cmpeq_epi32(tmp1, ZERO);
+            auto kappa_vec1_ = _mm256_and_si256(cmp, ONE);
+            auto kappa_vec2_ = _mm256_and_si256(_mm256_xor_si256(cmp, _mm256_set1_epi32((int32_t)0xffffffff)), tmp);
+            const __m256i kappa_vec = _mm256_max_epi32(kappa_vec1_, kappa_vec2_);
+
+            /* cq[1 - 16] = cq_vec
+             * cq[0] = prev_cq_vec[0]
+             * i.e. the proc_cq result is moved up by one lane, lane 0
+             * comes from prev_cq, and the top lane becomes the new prev_cq.
+             */
+            tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
+
+            auto cq_vec = _mm256_permutevar8x32_epi32(tmp, left_shift);
+            cq_vec = _mm256_insert_epi32(cq_vec, prev_cq, 0);
+            prev_cq = (ui32)_mm256_extract_epi32(tmp, 7);
+
+            update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
+            update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);
+
+            /* Uq[i] = ojph_max(e_qmax[i], kappa[i]); */
+            /* u_q[i] = Uq[i] - kappa[i]; */
+            auto uq_vec = _mm256_max_epi32(kappa_vec, e_qmax_vec);
+            auto u_q_vec = _mm256_sub_epi32(uq_vec, kappa_vec);
+
+            auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
+            __m256i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
+            ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;  // padding exists only in the last group
+
+            proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
+                            right_shift);
+
+            proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
+
+            // vlc_encode(&vlc, tuple[i*2+0] >> 8, (tuple[i*2+0] >> 4) & 7);
+            // vlc_encode(&vlc, tuple[i*2+1] >> 8, (tuple[i*2+1] >> 4) & 7);
+            ui32 u_q[8];
+            ui32 tuple[8];
+            /* The tuple is scaled by 4 due to:
+             * vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7, true);
+             * So in the vlc_encode, the tuple will only be scaled by 2.
+             */
+            tuple_vec = _mm256_srli_epi32(tuple_vec, 4);
+            _mm256_storeu_si256((__m256i*)tuple, tuple_vec);
+            _mm256_storeu_si256((__m256i*)u_q, u_q_vec);
+
+            proc_vlc_encode(&vlc, tuple, u_q, _ignore);
+        }
+        /* Seed prev_cq for the next pair of rows from group 0:
+         * lane 0 + (lane 1 << 2) of cx_val_vec[0]. */
+        tmp = _mm256_permutevar8x32_epi32(cx_val_vec[0], right_shift);
+        tmp = _mm256_slli_epi32(tmp, 2);
+        tmp = _mm256_add_epi32(tmp, cx_val_vec[0]);
+        prev_cq = (ui32)_mm_cvtsi128_si32(_mm256_castsi256_si128(tmp));
+        /* Every later pair of rows uses the second table and helper set. */
+        proc_cq = proc_cq2;
+        vlc_tbl = vlc_tbl1;
+        proc_mel_encode = proc_mel_encode2;
+        proc_vlc_encode = proc_vlc_encode2;
+    }
+
+    ms_terminate(&ms);
+    terminate_mel_vlc(&mel, &vlc);
+
+    //copy to elastic: ms bytes first, then mel bytes, then vlc bytes
+    lengths[0] = mel.pos + vlc.pos + ms.pos;
+    elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded);
+    memcpy(coded->buf, ms.buf, ms.pos);
+    memcpy(coded->buf + ms.pos, mel.buf, mel.pos);
+    memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
+
+    // put in the interface locator word: last byte = num_bytes >> 4,
+    ui32 num_bytes = mel.pos + vlc.pos;
+    coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4);
+    coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0;
+    coded->buf[lengths[0]-2] =   // low nibble of the byte before it = num_bytes & 0xF
+        (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF));
+
+    coded->avail_size -= lengths[0];
+}
+
+} /* namespace local */
+} /* namespace ojph */
+
+#endif
diff --git a/src/core/coding/ojph_block_encoder_avx512.cpp b/src/core/coding/ojph_block_encoder_avx512.cpp
index 5912b09f..3ae76841 100644
--- a/src/core/coding/ojph_block_encoder_avx512.cpp
+++ b/src/core/coding/ojph_block_encoder_avx512.cpp
@@ -34,14 +34,17 @@
 // File: ojph_block_encoder_avx512.cpp
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_X86_64)
+
 #include <cassert>
 #include <cstring>
 #include <cstdint>
 #include <climits>
 #include <immintrin.h>
+#include <mutex>
 
 #include "ojph_mem.h"
-#include "ojph_arch.h"
 #include "ojph_block_encoder.h"
 #include "ojph_message.h"
 
@@ -64,8 +67,8 @@ namespace ojph {
     // index is (c_q << 8) + (rho << 4) + eps
     // data is  (cwd << 8) + (cwd_len << 4) + eps
     // table 0 is for the initial line of quads
-    static ui32 vlc_tbl0[2048] = { 0 };
-    static ui32 vlc_tbl1[2048] = { 0 };
+    static ui32 vlc_tbl0[2048];
+    static ui32 vlc_tbl1[2048];
 
     //UVLC encoding
     static ui32 ulvc_cwd_pre[33];
@@ -218,19 +221,18 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    bool initialize_tables() {
-      if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) {
-        bool result;
-        result = vlc_init_tables();
-        result = result && uvlc_init_tables();
-        return result;
-      }
-      return false;
+    bool initialize_block_encoder_tables_avx512() {
+      static bool tables_initialized = false;  // outcome of the one-time init
+      static std::once_flag tables_initialized_flag;
+      std::call_once(tables_initialized_flag, []() {  // body runs only once
+        memset(vlc_tbl0, 0, 2048 * sizeof(ui32));
+        memset(vlc_tbl1, 0, 2048 * sizeof(ui32));
+        tables_initialized = vlc_init_tables();
+        tables_initialized = tables_initialized && uvlc_init_tables();
+      });
+      return tables_initialized;  // true only if both builders returned true
     }
 
-    /////////////////////////////////////////////////////////////////////////
-    static bool tables_initialized = initialize_tables();
-
     /////////////////////////////////////////////////////////////////////////
     //
     /////////////////////////////////////////////////////////////////////////
@@ -305,7 +307,7 @@ namespace ojph {
     /////////////////////////////////////////////////////////////////////////
     //
     /////////////////////////////////////////////////////////////////////////
-    struct vlc_struct {
+    struct vlc_struct_avx512 {
       //storage
       ui8* buf;      //pointer to data buffer
       ui32 pos;      //position of next writing within buf
@@ -318,7 +320,7 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     static inline void
-    vlc_init(vlc_struct* vlcp, ui32 buffer_size, ui8* data)
+    vlc_init(vlc_struct_avx512* vlcp, ui32 buffer_size, ui8* data)
     {
       vlcp->buf = data + buffer_size - 1; //points to last byte
       vlcp->pos = 1;                      //locations will be all -pos
@@ -332,7 +334,7 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     static inline void
-    vlc_encode(vlc_struct* vlcp, ui32 cwd, int cwd_len)
+    vlc_encode(vlc_struct_avx512* vlcp, ui32 cwd, int cwd_len)
     {
       vlcp->tmp |= (ui64)cwd << vlcp->used_bits;
       vlcp->used_bits += cwd_len;
@@ -372,11 +374,18 @@ namespace ojph {
     //
     //////////////////////////////////////////////////////////////////////////
     static inline void
-    terminate_mel_vlc(mel_struct* melp, vlc_struct* vlcp)
+    terminate_mel_vlc(mel_struct* melp, vlc_struct_avx512* vlcp)
     {
       if (melp->run > 0)
         mel_emit_bit(melp, 1);
 
+      if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) {
+        *(vlcp->buf - vlcp->pos) = 0x7f;
+        vlcp->pos++;
+        vlcp->tmp >>= 7;
+        vlcp->used_bits -= 7;
+      }
+
       melp->tmp = melp->tmp << melp->remaining_bits;
       int mel_mask = (0xFF << melp->remaining_bits) & 0xFF;
       int vlc_mask = 0xFF >> (8 - vlcp->used_bits);
@@ -530,7 +539,7 @@ static void proc_pixel(__m512i *src_vec, ui32 p,
         /*   s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */
         val_vec[i] = _mm512_mask_sub_epi32(ZERO, val_mask[i], val_vec[i], ONE);
         _s_vec[i] = _mm512_mask_srli_epi32(ZERO, val_mask[i], src_vec[i], 31);
-        _s_vec[i] = 
+        _s_vec[i] =
           _mm512_mask_add_epi32(ZERO, val_mask[i], _s_vec[i], val_vec[i]);
         /* } */
     }
@@ -562,18 +571,18 @@ static void proc_pixel(__m512i *src_vec, ui32 p,
         ui32 o_idx = i & 0x1;
 
         eq_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _eq_vec[o_idx]);
-        eq_vec[i] = _mm512_mask_permutexvar_epi32(eq_vec[i], 0xFF00, 
-                                                  idx[e_idx], 
+        eq_vec[i] = _mm512_mask_permutexvar_epi32(eq_vec[i], 0xFF00,
+                                                  idx[e_idx],
                                                   _eq_vec[o_idx + 2]);
 
         s_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], _s_vec[o_idx]);
         s_vec[i] = _mm512_mask_permutexvar_epi32(s_vec[i], 0xFF00,
-                                                 idx[e_idx], 
+                                                 idx[e_idx],
                                                  _s_vec[o_idx + 2]);
 
         _rho_vec[i] = _mm512_permutexvar_epi32(idx[e_idx], val_vec[o_idx]);
         _rho_vec[i] = _mm512_mask_permutexvar_epi32(_rho_vec[i], 0xFF00,
-                                                    idx[e_idx], 
+                                                    idx[e_idx],
                                                     val_vec[o_idx + 2]);
         _rho_vec[i] = _mm512_slli_epi32(_rho_vec[i], i);
 
@@ -686,11 +695,11 @@ static void proc_ms_encode(ms_struct *msp,
         /* cwd = s[i * 4 + 0] & ((1U << m) - 1)
          * cwd_len = m
          */
-        _mm512_store_epi32(cwd_len, m_vec[i]);
+        _mm512_storeu_si512(cwd_len, m_vec[i]);
         tmp = _mm512_sllv_epi32(ONE, m_vec[i]);
         tmp = _mm512_sub_epi32(tmp, ONE);
         tmp = _mm512_and_epi32(tmp, s_vec[i]);
-        _mm512_store_epi32(cwd, tmp);
+        _mm512_storeu_si512(cwd, tmp);
 
         for (ui32 j = 0; j < 8; ++j) {
             ui32 idx = j * 2;
@@ -703,7 +712,7 @@ static void proc_ms_encode(ms_struct *msp,
     }
 }
 
-static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec, 
+static __m512i cal_eps_vec(__m512i *eq_vec, __m512i &u_q_vec,
                            __m512i &e_qmax_vec)
 {
     /* if (u_q[i] > 0) {
@@ -746,7 +755,7 @@ static void update_lep(ui32 x, __m512i &prev_e_val_vec,
      */
     auto tmp = _mm512_mask_permutexvar_epi32(prev_e_val_vec, 0xFFFE,
                                              left_shift, eq_vec[3]);
-    prev_e_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift, 
+    prev_e_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
                                                    eq_vec[3]);
     e_val_vec[x] = _mm512_max_epi32(eq_vec[1], tmp);
 }
@@ -760,9 +769,9 @@ static void update_lcxp(ui32 x, __m512i &prev_cx_val_vec,
      * lcxp[0] = (ui8)((rho[0] & 8) >> 3);
      * Or (rho[0] & 2) and (rho[0] of the previous round & 8).
      */
-    auto tmp = _mm512_mask_permutexvar_epi32(prev_cx_val_vec, 0xFFFE, 
+    auto tmp = _mm512_mask_permutexvar_epi32(prev_cx_val_vec, 0xFFFE,
                                              left_shift, rho_vec);
-    prev_cx_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift, 
+    prev_cx_val_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
                                                     rho_vec);
 
     tmp = _mm512_and_epi32(tmp, _mm512_set1_epi32(8));
@@ -784,7 +793,7 @@ static __m512i cal_tuple(__m512i &cq_vec, __m512i &rho_vec,
     return _mm512_i32gather_epi32(tmp, vlc_tbl, 4);
 }
 
-static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec, 
+static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
                         const __m512i right_shift)
 {
     ojph_unused(x);
@@ -800,8 +809,8 @@ static __m512i proc_cq1(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
 static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
                         const __m512i right_shift)
 {
-    // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2)) 
-    //            | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2)); 
+    // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
+    //            | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
     auto lcxp1_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x]);
     auto lcxp2_vec = _mm512_permutexvar_epi32(right_shift, cx_val_vec[x + 1]);
     auto tmp = _mm512_permutexvar_epi32(right_shift, lcxp1_vec);
@@ -822,7 +831,7 @@ static __m512i proc_cq2(ui32 x, __m512i *cx_val_vec, __m512i &rho_vec,
 
 using fn_proc_cq = __m512i (*)(ui32, __m512i *, __m512i &, const __m512i);
 
-static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec, 
+static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
                              __m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
                              const __m512i right_shift)
 {
@@ -840,7 +849,7 @@ static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
 
     /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */
     auto mel_need_encode2 = (ui16)_mm512_cmpgt_epi32_mask(u_q_vec, ZERO);
-    mel_need_encode2 = 
+    mel_need_encode2 =
       mel_need_encode2 & (ui16)_mm512_cmpgt_epi32_mask(tmp, ZERO);
 
     ui32 i_max = 16 - (ignore / 2);
@@ -864,7 +873,7 @@ static void proc_mel_encode1(mel_struct *melp, __m512i &cq_vec,
     }
 }
 
-static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec, 
+static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
                              __m512i &rho_vec, __m512i u_q_vec, ui32 ignore,
                              const __m512i right_shift)
 {
@@ -888,10 +897,10 @@ static void proc_mel_encode2(mel_struct *melp, __m512i &cq_vec,
     }
 }
 
-using fn_proc_mel_encode = void (*)(mel_struct *, __m512i &, __m512i &, 
+using fn_proc_mel_encode = void (*)(mel_struct *, __m512i &, __m512i &,
                                     __m512i, ui32, const __m512i);
 
-static void proc_vlc_encode1(vlc_struct *vlcp, ui32 *tuple,
+static void proc_vlc_encode1(vlc_struct_avx512 *vlcp, ui32 *tuple,
                              ui32 *u_q, ui32 ignore)
 {
     ui32 i_max = 16 - (ignore / 2);
@@ -959,7 +968,7 @@ static void proc_vlc_encode1(vlc_struct *vlcp, ui32 *tuple,
     }
 }
 
-static void proc_vlc_encode2(vlc_struct *vlcp, ui32 *tuple,
+static void proc_vlc_encode2(vlc_struct_avx512 *vlcp, ui32 *tuple,
                              ui32 *u_q, ui32 ignore)
 {
     ui32 i_max = 16 - (ignore / 2);
@@ -995,10 +1004,10 @@ static void proc_vlc_encode2(vlc_struct *vlcp, ui32 *tuple,
     }
 }
 
-using fn_proc_vlc_encode = void (*)(vlc_struct *, ui32 *, ui32 *, ui32);
+using fn_proc_vlc_encode = void (*)(vlc_struct_avx512 *, ui32 *, ui32 *, ui32);
 
-void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs, 
-                                  ui32 num_passes, ui32 _width, ui32 height, 
+void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
+                                  ui32 num_passes, ui32 _width, ui32 height,
                                   ui32 stride, ui32* lengths,
                                   ojph::mem_elastic_allocator *elastic,
                                   ojph::coded_lists *& coded)
@@ -1019,7 +1028,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
 
     mel_struct mel;
     mel_init(&mel, mel_size, mel_buf);
-    vlc_struct vlc;
+    vlc_struct_avx512 vlc;
     vlc_init(&vlc, vlc_size, vlc_buf);
     ms_struct ms;
     ms_init(&ms, ms_size, ms_buf);
@@ -1102,7 +1111,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
 
             if (y + 1 < height) {
                 src_vec[1] = _mm512_maskz_loadu_epi32(load_mask0, sp + stride);
-                src_vec[3] = 
+                src_vec[3] =
                   _mm512_maskz_loadu_epi32(load_mask1, sp + 16 + stride);
             } else {
                 src_vec[1] = ZERO;
@@ -1139,7 +1148,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
             tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
             auto cq_vec = _mm512_mask_permutexvar_epi32(prev_cq_vec, 0xFFFE,
                                                         left_shift, tmp);
-            prev_cq_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift, 
+            prev_cq_vec = _mm512_mask_permutexvar_epi32(ZERO, 0x1, left_shift,
                                                         tmp);
 
             update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
@@ -1154,7 +1163,7 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
             __m512i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
             ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
 
-            proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore, 
+            proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
                             right_shift);
 
             proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
@@ -1168,8 +1177,8 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
              * So in the vlc_encode, the tuple will only be scaled by 2.
              */
             tuple_vec = _mm512_srli_epi32(tuple_vec, 4);
-            _mm512_store_epi32(tuple, tuple_vec);
-            _mm512_store_epi32(u_q, u_q_vec);
+            _mm512_storeu_si512(tuple, tuple_vec);
+            _mm512_storeu_si512(u_q, u_q_vec);
             proc_vlc_encode(&vlc, tuple, u_q, _ignore);
         }
 
@@ -1206,3 +1215,4 @@ void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs,
 } /* namespace local */
 } /* namespace ojph */
 
+#endif
diff --git a/src/core/common/ojph_codestream.h b/src/core/common/ojph_codestream.h
deleted file mode 100644
index 042fe03c..00000000
--- a/src/core/common/ojph_codestream.h
+++ /dev/null
@@ -1,123 +0,0 @@
-//***************************************************************************/
-// This software is released under the 2-Clause BSD license, included
-// below.
-//
-// Copyright (c) 2019, Aous Naman 
-// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
-// Copyright (c) 2019, The University of New South Wales, Australia
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-// 
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// 
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-// 
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//***************************************************************************/
-// This file is part of the OpenJPH software implementation.
-// File: ojph_codestream.h
-// Author: Aous Naman
-// Date: 28 August 2019
-//***************************************************************************/
-
-
-#ifndef OJPH_CODESTREAM_H
-#define OJPH_CODESTREAM_H
-
-#include <cstdlib>
-
-#include "ojph_defs.h"
-
-namespace ojph {
-
-  ////////////////////////////////////////////////////////////////////////////
-  //local prototyping
-  namespace local {
-    class codestream;
-  };
-
-  ////////////////////////////////////////////////////////////////////////////
-  //defined elsewhere
-  class param_siz;
-  class param_cod;
-  class param_qcd;
-  class comment_exchange;
-  class mem_fixed_allocator;
-  struct point;
-  struct line_buf;
-  class outfile_base;
-  class infile_base;
-
-  ////////////////////////////////////////////////////////////////////////////
-  class codestream
-  {
-  public:
-    OJPH_EXPORT
-    codestream();
-    OJPH_EXPORT
-    ~codestream();
-
-    OJPH_EXPORT
-    void set_planar(bool planar);
-    OJPH_EXPORT
-    void set_profile(const char* s);
-    OJPH_EXPORT    
-    void set_tilepart_divisions(bool at_resolutions, bool at_components);
-    OJPH_EXPORT    
-    void request_tlm_marker(bool needed);    
-
-    OJPH_EXPORT
-    void write_headers(outfile_base *file, 
-                       const comment_exchange* comments = NULL, 
-                       ui32 num_comments = 0);
-    OJPH_EXPORT
-    line_buf* exchange(line_buf* line, ui32& next_component);
-    OJPH_EXPORT
-    void flush();
-
-    OJPH_EXPORT
-    void enable_resilience();             // before read_headers
-    OJPH_EXPORT
-    void read_headers(infile_base *file); // before resolution restrictions
-    OJPH_EXPORT
-    void restrict_input_resolution(ui32 skipped_res_for_data,
-      ui32 skipped_res_for_recon);         // before create
-    OJPH_EXPORT
-    void create(); 
-    OJPH_EXPORT
-    line_buf* pull(ui32 &comp_num);
-
-    OJPH_EXPORT
-    void close();
-
-    OJPH_EXPORT
-    param_siz access_siz();
-    OJPH_EXPORT
-    param_cod access_cod();
-    OJPH_EXPORT
-    param_qcd access_qcd();
-    OJPH_EXPORT
-    bool is_planar() const;
-
-  private:
-    local::codestream* state;
-  };
-
-}
-
-#endif // !OJPH_CODESTREAM_H
diff --git a/src/core/common/ojph_message.h b/src/core/common/ojph_message.h
deleted file mode 100644
index 45e3d7d7..00000000
--- a/src/core/common/ojph_message.h
+++ /dev/null
@@ -1,135 +0,0 @@
-//***************************************************************************/
-// This software is released under the 2-Clause BSD license, included
-// below.
-//
-// Copyright (c) 2019, Aous Naman
-// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
-// Copyright (c) 2019, The University of New South Wales, Australia
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//***************************************************************************/
-// This file is part of the OpenJPH software implementation.
-// File: ojph_message.h
-// Author: Aous Naman
-// Date: 29 August 2019
-//***************************************************************************/
-
-#ifndef OJPH_MESSAGE_H
-#define OJPH_MESSAGE_H 
-
-#include <cstring>
-#include "ojph_arch.h"
-
-namespace ojph {
-
-  ////////////////////////////////////////////////////////////////////////////
-  enum OJPH_MSG_LEVEL : int
-  {
-    NO_MSG = 0,
-    INFO = 1,
-    WARN = 2,
-    ERROR = 3
-  };
-
-  ////////////////////////////////////////////////////////////////////////////
-  class message_base {
-  public:
-    OJPH_EXPORT
-      virtual void operator() (int warn_code, const char* file_name,
-        int line_num, const char *fmt, ...) = 0;
-  };
-
-  ////////////////////////////////////////////////////////////////////////////
-  class message_info : public message_base
-  {
-    public:
-      OJPH_EXPORT
-      virtual void operator() (int info_code, const char* file_name,
-        int line_num, const char* fmt, ...);
-  };
-
-  ////////////////////////////////////////////////////////////////////////////
-  OJPH_EXPORT
-    void set_info_stream(FILE* s);
-  OJPH_EXPORT
-    void configure_info(message_info* info);
-  OJPH_EXPORT
-    message_info& get_info();
-
-  ////////////////////////////////////////////////////////////////////////////
-  class message_warning : public message_base
-  {
-    public:
-      OJPH_EXPORT
-      virtual void operator() (int warn_code, const char* file_name,
-        int line_num, const char* fmt, ...);
-  };
-
-  ////////////////////////////////////////////////////////////////////////////
-  OJPH_EXPORT
-    void set_warning_stream(FILE* s);
-  OJPH_EXPORT
-    void configure_warning(message_warning* warn);
-  OJPH_EXPORT
-    message_warning& get_warning();
-
-  ////////////////////////////////////////////////////////////////////////////
-  class message_error : public message_base
-  {
-    public:
-      OJPH_EXPORT
-      virtual void operator() (int warn_code, const char* file_name,
-        int line_num, const char *fmt, ...);
-  };
-
-  ////////////////////////////////////////////////////////////////////////////
-  OJPH_EXPORT
-  void set_error_stream(FILE *s);
-  OJPH_EXPORT
-  void configure_error(message_error* error);
-  OJPH_EXPORT
-  message_error& get_error();
-}
-
-//////////////////////////////////////////////////////////////////////////////
-#if (defined OJPH_OS_WINDOWS)
-  #define __OJPHFILE__ \
-    (strrchr(__FILE__, '\\') ? strrchr(__FILE__, '\\') + 1 : __FILE__)
-#else
-  #define __OJPHFILE__ \
-    (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
-#endif
-
-//////////////////////////////////////////////////////////////////////////////
-#define OJPH_INFO(t, ...) \
-  { ojph::get_info()(t, __OJPHFILE__, __LINE__, __VA_ARGS__); }
-//////////////////////////////////////////////////////////////////////////////
-#define OJPH_WARN(t, ...) \
-  { ojph::get_warning()(t, __OJPHFILE__, __LINE__, __VA_ARGS__); }
-//////////////////////////////////////////////////////////////////////////////
-#define OJPH_ERROR(t, ...) \
-  { ojph::get_error()(t, __OJPHFILE__, __LINE__,__VA_ARGS__); }
-
-
-#endif // !OJPH_MESSAGE_H
diff --git a/src/core/common/ojph_params.h b/src/core/common/ojph_params.h
deleted file mode 100644
index d17e8d2c..00000000
--- a/src/core/common/ojph_params.h
+++ /dev/null
@@ -1,202 +0,0 @@
-//***************************************************************************/
-// This software is released under the 2-Clause BSD license, included
-// below.
-//
-// Copyright (c) 2019, Aous Naman 
-// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
-// Copyright (c) 2019, The University of New South Wales, Australia
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-// 
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// 
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-// 
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//***************************************************************************/
-// This file is part of the OpenJPH software implementation.
-// File: ojph_params.h
-// Author: Aous Naman
-// Date: 28 August 2019
-//***************************************************************************/
-
-
-#ifndef OJPH_PARAMS_H
-#define OJPH_PARAMS_H
-
-#include "ojph_base.h"
-
-namespace ojph {
-
-  ////////////////////////////////////////////////////////////////////////////
-  //prototyping from local
-  namespace local {
-    struct param_siz;
-    struct param_cod;
-    struct param_qcd;
-    struct param_qcc;
-    struct param_cap;
-    class codestream;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////
-  class param_siz
-  {
-  public:
-    OJPH_EXPORT
-    param_siz(local::param_siz *p) : state(p) {}
-
-    //setters
-    OJPH_EXPORT
-    void set_image_extent(point extent);
-    OJPH_EXPORT
-    void set_tile_size(size s);
-    OJPH_EXPORT
-    void set_image_offset(point offset);
-    OJPH_EXPORT
-    void set_tile_offset(point offset);
-    OJPH_EXPORT
-    void set_num_components(ui32 num_comps);
-    OJPH_EXPORT
-    void set_component(ui32 comp_num, const point& downsampling,
-                       ui32 bit_depth, bool is_signed);
-
-    //getters
-    OJPH_EXPORT
-    point get_image_extent() const;
-    OJPH_EXPORT
-    point get_image_offset() const;
-    OJPH_EXPORT
-    size get_tile_size() const;
-    OJPH_EXPORT
-    point get_tile_offset() const;
-    OJPH_EXPORT
-    ui32 get_num_components() const;
-    OJPH_EXPORT
-    ui32 get_bit_depth(ui32 comp_num) const;
-    OJPH_EXPORT
-    bool is_signed(ui32 comp_num) const;
-    OJPH_EXPORT
-    point get_downsampling(ui32 comp_num) const;
-
-    //deeper getters
-    OJPH_EXPORT
-    ui32 get_recon_width(ui32 comp_num) const;
-    OJPH_EXPORT
-    ui32 get_recon_height(ui32 comp_num) const;
-
-  private:
-    local::param_siz* state;
-  };
-
-  ////////////////////////////////////////////////////////////////////////////
-  class param_cod
-  {
-  public:
-    OJPH_EXPORT
-    param_cod(local::param_cod* p) : state(p) {}
-
-    OJPH_EXPORT
-    void set_num_decomposition(ui32 num_decompositions);
-    OJPH_EXPORT
-    void set_block_dims(ui32 width, ui32 height);
-    OJPH_EXPORT
-    void set_precinct_size(int num_levels, size* precinct_size);
-    OJPH_EXPORT
-    void set_progression_order(const char *name);
-    OJPH_EXPORT
-    void set_color_transform(bool color_transform);
-    OJPH_EXPORT
-    void set_reversible(bool reversible);
-
-    OJPH_EXPORT
-    ui32 get_num_decompositions() const;
-    OJPH_EXPORT
-    size get_block_dims() const;
-    OJPH_EXPORT
-    size get_log_block_dims() const;
-    OJPH_EXPORT
-    bool is_reversible() const;
-    OJPH_EXPORT
-    size get_precinct_size(ui32 level_num) const;
-    OJPH_EXPORT
-    size get_log_precinct_size(ui32 level_num) const;
-    OJPH_EXPORT
-    int get_progression_order() const;
-    OJPH_EXPORT
-    const char* get_progression_order_as_string() const;
-    OJPH_EXPORT
-    int get_num_layers() const;
-    OJPH_EXPORT
-    bool is_using_color_transform() const;
-    OJPH_EXPORT
-    bool packets_may_use_sop() const;
-    OJPH_EXPORT
-    bool packets_use_eph() const;
-    OJPH_EXPORT
-    bool get_block_vertical_causality() const;
-
-  private:
-    local::param_cod* state;
-  };
-
-  ////////////////////////////////////////////////////////////////////////////
-  class param_qcd
-  {
-  public:
-    OJPH_EXPORT
-    param_qcd(local::param_qcd* p) : state(p) {}
-
-    OJPH_EXPORT
-    void set_irrev_quant(float delta);
-
-  private:
-    local::param_qcd* state;
-  };
-
-  ////////////////////////////////////////////////////////////////////////////
-  class comment_exchange
-  {
-    friend class local::codestream;
-  public:
-    comment_exchange() : data(NULL), len(0), Rcom(0) {}
-    OJPH_EXPORT
-    void set_string(char* str);
-    OJPH_EXPORT
-    void set_data(char* data, ui16 len);
-
-  private:
-    char* data;
-    ui16 len;
-    ui16 Rcom;
-  };
-
-  ////////////////////////////////////////////////////////////////////////////
-  //class param_qcc
-  //{
-  //public:
-  //  OJPH_EXPORT
-  //  param_qcc(local::param_qcc* p) : state(p) {}
-
-  //private:
-  //  local::param_qcc* state;
-  //};
-
-}
-
-#endif // !OJPH_PARAMS_H
diff --git a/src/core/common/ojph_arch.h b/src/core/openjph/ojph_arch.h
similarity index 67%
rename from src/core/common/ojph_arch.h
rename to src/core/openjph/ojph_arch.h
index 62b630bb..097c46f1 100644
--- a/src/core/common/ojph_arch.h
+++ b/src/core/openjph/ojph_arch.h
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -63,7 +63,39 @@
 #include <intrin.h>
 #endif
 
+///////////////////////////////////////////////////////////////////////////////
+// preprocessor directives for architecture
+///////////////////////////////////////////////////////////////////////////////
+#if defined(__arm__) || defined(__TARGET_ARCH_ARM)  \
+  || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+  #define OJPH_ARCH_ARM
+#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
+  #define OJPH_ARCH_I386
+#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) \
+  || defined(_M_X64)
+  #define OJPH_ARCH_X86_64
+#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
+  #define OJPH_ARCH_IA64
+#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__)  \
+  || defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC)  \
+  || defined(_M_MPPC) || defined(_M_PPC)
+  #if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
+    #define OJPH_ARCH_PPC64
+  #else
+    #define OJPH_ARCH_PPC
+  #endif
+#else
+  #define OJPH_ARCH_UNKNOWN
+#endif
+
 namespace ojph {
+  ////////////////////////////////////////////////////////////////////////////
+  //                  disable SIMD for unknown architecture
+  ////////////////////////////////////////////////////////////////////////////
+#if !defined(OJPH_ARCH_X86_64) && !defined(OJPH_ARCH_I386) &&  \
+    !defined(OJPH_ARCH_ARM) && !defined(OJPH_DISABLE_SIMD)
+#define OJPH_DISABLE_SIMD
+#endif // !OJPH_ARCH_UNKNOWN
 
   ////////////////////////////////////////////////////////////////////////////
   //                         OS detection definitions
@@ -72,8 +104,14 @@ namespace ojph {
 #define OJPH_OS_WINDOWS
 #elif (defined __APPLE__)
 #define OJPH_OS_APPLE
+#elif (defined __ANDROID__)
+#define OJPH_OS_ANDROID
 #elif (defined __linux)
 #define OJPH_OS_LINUX
+#elif (defined __FreeBSD__)
+#define OJPH_OS_FREEBSD
+#elif (defined __OpenBSD__)
+#define OJPH_OS_OPENBSD
 #endif
 
   /////////////////////////////////////////////////////////////////////////////
@@ -106,10 +144,19 @@ namespace ojph {
     X86_CPU_EXT_LEVEL_AVX512 = 11,
   };
 
+  enum : int {
+    ARM_CPU_EXT_LEVEL_GENERIC = 0,
+    ARM_CPU_EXT_LEVEL_NEON = 1,
+    ARM_CPU_EXT_LEVEL_ASIMD = 1,
+    ARM_CPU_EXT_LEVEL_SVE = 2,
+    ARM_CPU_EXT_LEVEL_SVE2 = 3,
+  };
+
   /////////////////////////////////////////////////////////////////////////////
   static inline ui32 population_count(ui32 val)
   {
-  #ifdef OJPH_COMPILER_MSVC
+  #if defined(OJPH_COMPILER_MSVC)  \
+    && (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
     return (ui32)__popcnt(val);
   #elif (defined OJPH_COMPILER_GNUC)
     return (ui32)__builtin_popcount(val);
@@ -145,6 +192,47 @@ namespace ojph {
   #endif
   }
 
+  /////////////////////////////////////////////////////////////////////////////
+#ifdef OJPH_COMPILER_MSVC
+  #if (defined OJPH_ARCH_X86_64 || defined OJPH_ARCH_ARM)
+    #pragma intrinsic(_BitScanReverse64)
+  #elif (defined OJPH_ARCH_I386)
+    #pragma intrinsic(_BitScanReverse)
+  #else
+    #error Error unsupport MSVC version
+  #endif
+#endif
+  static inline ui32 count_leading_zeros(ui64 val)
+  {
+  #ifdef OJPH_COMPILER_MSVC
+    unsigned long result = 0;
+    #if (defined OJPH_ARCH_X86_64) || (defined OJPH_ARCH_ARM)
+      _BitScanReverse64(&result, val);
+    #elif (defined OJPH_ARCH_I386)
+      ui32 msb = (ui32)(val >> 32), lsb = (ui32)val;
+      if (msb == 0)
+        _BitScanReverse(&result, lsb);
+      else {
+        _BitScanReverse(&result, msb);
+        result += 32;
+      }
+    #else
+      #error Error unsupport MSVC version
+    #endif
+    return 63 ^ (ui32)result;
+  #elif (defined OJPH_COMPILER_GNUC)
+    return (ui32)__builtin_clzll(val);
+  #else
+    val |= (val >> 1);
+    val |= (val >> 2);
+    val |= (val >> 4);
+    val |= (val >> 8);
+    val |= (val >> 16);
+    val |= (val >> 32);
+    return 64 - population_count64(val);
+  #endif
+  }
+
   /////////////////////////////////////////////////////////////////////////////
 #ifdef OJPH_COMPILER_MSVC
   #pragma intrinsic(_BitScanForward)
@@ -194,13 +282,15 @@ namespace ojph {
   ////////////////////////////////////////////////////////////////////////////
   // constants
   ////////////////////////////////////////////////////////////////////////////
-#ifdef OJPH_ENABLE_INTEL_AVX512
-  const ui32 byte_alignment = 64; //64 bytes == 512 bits
-#else
-  const ui32 byte_alignment = 32; //32 bytes == 256 bits
-#endif
-  const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment);
-  const ui32 object_alignment = 8;
+  #ifndef OJPH_EMSCRIPTEN
+    const ui32 byte_alignment = 64; // 64 bytes == 512 bits
+    const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment);
+    const ui32 object_alignment = 8;
+  #else
+    const ui32 byte_alignment = 16; // 16 bytes == 128 bits
+    const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment);
+    const ui32 object_alignment = 8;
+    #endif
 
   ////////////////////////////////////////////////////////////////////////////
   // templates for alignment
@@ -208,17 +298,17 @@ namespace ojph {
 
   ////////////////////////////////////////////////////////////////////////////
   // finds the size such that it is a multiple of byte_alignment
-  template <typename T, int N>
+  template <typename T, ui32 N>
   size_t calc_aligned_size(size_t size) {
     size = size * sizeof(T) + N - 1;
     size &= ~((1ULL << (31 - count_leading_zeros(N))) - 1);
-    size >>= (31 - count_leading_zeros(sizeof(T)));
+    size >>= (63 - count_leading_zeros((ui64)sizeof(T)));
     return size;
   }
 
   ////////////////////////////////////////////////////////////////////////////
   // moves the pointer to first address that is a multiple of byte_alignment
-  template <typename T, int N>
+  template <typename T, ui32 N>
   inline T *align_ptr(T *ptr) {
     intptr_t p = reinterpret_cast<intptr_t>(ptr);
     p += N - 1;
diff --git a/src/core/common/ojph_arg.h b/src/core/openjph/ojph_arg.h
similarity index 96%
rename from src/core/common/ojph_arg.h
rename to src/core/openjph/ojph_arg.h
index 6cac09d1..5743f950 100644
--- a/src/core/common/ojph_arg.h
+++ b/src/core/openjph/ojph_arg.h
@@ -201,6 +201,17 @@ namespace ojph {
       }
     }
 
+    ///////////////////////////////////////////////////////////////////////////
+    bool reinterpret(const char *str) {
+      argument t = find_argument(str);
+      if (t.is_valid()) {
+        release_argument(t);
+        return true;
+      }
+      else
+        return false;
+    }    
+
     ///////////////////////////////////////////////////////////////////////////
     void reinterpret_to_bool(const char *str, int& val) {
       argument t = find_argument(str);
diff --git a/src/core/common/ojph_base.h b/src/core/openjph/ojph_base.h
similarity index 100%
rename from src/core/common/ojph_base.h
rename to src/core/openjph/ojph_base.h
diff --git a/src/core/openjph/ojph_codestream.h b/src/core/openjph/ojph_codestream.h
new file mode 100644
index 00000000..772415d4
--- /dev/null
+++ b/src/core/openjph/ojph_codestream.h
@@ -0,0 +1,387 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2019, Aous Naman
+// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2019, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_codestream.h
+// Author: Aous Naman
+// Date: 28 August 2019
+//***************************************************************************/
+
+
+#ifndef OJPH_CODESTREAM_H
+#define OJPH_CODESTREAM_H
+
+#include <cstdlib>
+
+#include "ojph_arch.h"
+#include "ojph_defs.h"
+
+namespace ojph {
+
+  ////////////////////////////////////////////////////////////////////////////
+  //local prototyping
+  namespace local {
+    class codestream;
+  };
+
+  ////////////////////////////////////////////////////////////////////////////
+  //defined elsewhere
+  class param_siz;
+  class param_cod;
+  class param_qcd;
+  class param_nlt;
+  class comment_exchange;
+  class mem_fixed_allocator;
+  struct point;
+  class line_buf;
+  class outfile_base;
+  class infile_base;
+
+  ////////////////////////////////////////////////////////////////////////////
+  /**
+   *  @brief The object represents a codestream.
+   *
+   *  Most end-users use this object to create a j2c codestream.  The object
+   *  currently can be used in one of two modes, reading or writing.
+   *
+   *  We try to follow the pImpl (pointer to Implementation) approach;
+   *  therefore, objects in the ojph namespace hold pointers to internal
+   *  implementations.  The actual implementation is usually in the
+   *  ojph::local namespace. The thin wrapper code of the
+   *  ojph::codestream object is in ojph_codestream.cpp, while the actual
+   *  implementation can be found in ojph_codestream_local.h and
+   *  ojph_codestream_local.cpp.
+   *
+   *  Most of these member functions provide nothing more than calls
+   *  into the internal implementation.  See ojph_codestream_local.h for
+   *  more documentation -- yet to be added.
+   *
+   */
+  class OJPH_EXPORT codestream
+  {
+  public:
+    /**
+     *  @brief default constructor
+     *
+     *  This function instantiates the actual implementation object
+     *  local::codestream, using new.
+     *
+     */
+    codestream();
+
+    /**
+     *  @brief default destructor
+     *
+     *  This function destroys the internal local::codestream object.
+     *
+     */
+    ~codestream();
+
+    /**
+     *  @brief Restarts the codestream.
+     *
+     *  This function restarts the codestream; after this call, the
+     *  codestream object behaves like it has never been used before,
+     *  except that all memory allocations are preserved.  Thus, after
+     *  restart(), there is no need to allocate memory, unless the new
+     *  codestream needs more storage to store codeblocks, or has a
+     *  different structure.
+     *
+     *  restart() is useful if we are decoding multiple codestreams that
+     *  have largely the same structure and byte length.
+     *
+     */
+    void restart();
+
+    /**
+     *  @brief Sets the sequence of pushing or pulling rows from the machinery.
+     *
+     *  For this function, planar means that the machinery processes one
+     *  colour component in full before processing the next component.  This
+     *  is more efficient because the cache is used for one component instead
+     *  of many components, but it is not practical when a color transform is
+     *  employed. This is because we need to apply the transform to the first
+     *  three components.  Therefore, planar, while recommended, can only be
+     *  used when there is no color transform.
+     *
+     *  @param planar true for when components are pushed in full one at
+     *         a time.
+     */
+    void set_planar(bool planar);
+
+    /**
+     *  @brief Sets the codestream profile.
+     *
+     *  This is currently rather incomplete, but it accepts two profiles
+     *  IMF and BROADCAST. More work is needed to improve this.
+     *  Note that Rsiz field in the SIZ marker segment is not set properly.
+     *
+     *  @param s a string of the profile name, value can be from
+     *           OJPH_PN_STRING_XXXX, where only IMF and BROADCAST
+     *           are currently supported.
+     */
+    void set_profile(const char* s);
+
+    /**
+     *  @brief Sets the locations where a tile is partitioned into tile parts.
+     *
+     *  This function signals that we are interested in partitioning each tile
+     *  into tile parts at resolution or component level, or both.  This is
+     *  useful when used with the TLM marker segment, because the TLM marker
+     *  segment provides information about the locations of these partitions in
+     *  the file.  This way we can identify where resolution information ends
+     *  within the codestream.  It is also useful when large images are
+     *  compressed, because an unpartitioned tile cannot be more than 4GB, but
+     *  when partitioned, each tile part can be 4GB -- it is possible to
+     *  partition at precinct boundaries to better utilize tile parts, and
+     *  achieve a tile in the vicinity of 1TB, but this option is currently
+     *  unsupported.
+     *
+     *  @param at_resolutions partitions the tile into tile parts at
+     *         resolutions.
+     *  @param at_components partitions every tile into tile parts at
+     *         components.
+     */
+    void set_tilepart_divisions(bool at_resolutions, bool at_components);
+
+    /**
+     *  @brief Query if the tile will be partitioned at resolution boundary.
+     *
+     *  @return true if resolution-boundary tile partitioning is employed.
+     *  @return false if resolution-boundary tile partitioning is not
+     *          requested.
+     */
+    bool is_tilepart_division_at_resolutions();
+
+    /**
+     *  @brief Query if the tile will be partitioned at component boundary.
+     *
+     *  @return true if component-boundary tile partitioning is employed.
+     *  @return false if component-boundary tile partitioning is not
+     *          requested.
+     */
+    bool is_tilepart_division_at_components();
+
+    /**
+     *  @brief Request the addition of the optional TLM marker segment.
+     *  This request should occur before writing codestream headers
+     *  (ojph::codestream::write_headers()).
+     *
+     *  @param needed true when the marker is needed.
+     */
+    void request_tlm_marker(bool needed);
+
+    /**
+     *  @brief Query if the optional TLM marker segment is to be added.
+     *
+     *  @return true if the optional TLM marker segment
+     *          is to be added.
+     *  @return false if the addition of the optional TLM marker segment
+     *          was not requested.
+     */
+
+    bool is_tlm_requested();
+
+    /**
+     *  @brief Writes codestream headers when the codestream is used for
+     *  writing.  This function should be called after setting all the
+     *  codestream parameters, but before pushing image lines using
+     *  ojph::codestream::exchange().
+     *
+     *  @param file A class inherited from outfile_base, which is used to store
+     *              compressed image bitstream.  This enables storing the
+     *              compressed bitstream to memory or an actual file.
+     *  @param comments A pointer to an array of comment_exchange objects.
+     *                  Each object stores one comment to be inserted in the
+     *                  bitstreams.  The number of elements in the array
+     *                  should be equal to num_comments.
+     *  @param num_comments The number of elements in the `comments` array.
+     *
+     */
+    void write_headers(outfile_base *file,
+                       const comment_exchange* comments = NULL,
+                       ui32 num_comments = 0);
+
+    /**
+     *  @brief This call is used to send image data rows to the library.
+     *         We expect to send one row from a single component with
+     *         each call. The first call is always with line == NULL;
+     *         the call would return a line_buf, and the component
+     *         number or index in `next_component.`  The caller would
+     *         then need to fill the buffer of the line_buf with one
+     *         row from the component indexed by `next_component`, and
+     *         call exchange again to pass the component and get a
+     *         new line_buf.
+     *
+     *  @param line A line_buf object; first call should supply NULL.
+     *              Subsequent calls should pass the line_buf object
+     *              obtained in the previous call.
+     *  @param next_component returns a component index; the end user must
+     *                        fill the returned line_buf from the component
+     *                        indexed by this index.
+     *  @return line_buf* A line_buf which must be filled with the component
+     *                    indexed by `next_component`, before calling
+     *                    exchange again to pass this line.
+     */
+
+    line_buf* exchange(line_buf* line, ui32& next_component);
+
+    /**
+     * @brief This is the last call to a writing (encoding) codestream.
+     *        This will write encoded bitstream data to the file.  This
+     *        call does not close the file, because, in the future, we
+     *        might wish to write more data to the file.  If you do not
+     *        want to write more data, then call codestream::close().
+     */
+    void flush();
+
+    /**
+     * @brief This enables codestream resilience; that is, the library tries
+     *        its best to decode the codestream, even if there are errors.
+     *        This call is for a decoding (or reading) codestream, and
+     *        should be called before all other calls, before
+     *        codestream::read_headers().
+     */
+    void enable_resilience();             // before read_headers
+
+    /**
+     * @brief This call reads the headers of a codestream.  It is for a
+     *        reading (or decoding) codestream, and should be called
+     *        after codestream::enable_resilience(), but before
+     *        codestream::restrict_input_resolution().
+     *
+     * @param file The file to read from.  The file should be inherited from
+     *             ojph::infile_base; this enables reading from an actual file
+     *             or from memory-based file.
+     */
+    void read_headers(infile_base *file); // before resolution restrictions
+
+    /**
+     * @brief This function restricts resolution decoding for a codestream.
+     *        It is for a reading (decoding) codestream.  We can limit the
+     *        restrictions to decoding and reconstruction resolution,
+     *        or decoding only.  Call this function after
+     *        codestream::read_headers() but before codestream::create()
+     *
+     * @param skipped_res_for_data specifies for how many fine resolutions
+     *                             decoding is skipped, i.e., reading and
+     *                             decoding is not performed for this number
+     *                             of fine resolutions.
+     * @param skipped_res_for_recon specifies for how many fine resolutions
+     *                              reconstruction is skipped; the resulting
+     *                              image is smaller than the original.  This
+     *                              number should be smaller or equal to
+     *                              `skipped_res_for_data,` as it does not
+     *                              make sense otherwise.
+     */
+    void restrict_input_resolution(ui32 skipped_res_for_data,
+                                   ui32 skipped_res_for_recon); //before create
+
+    /**
+     * @brief This call is for a decoding (or reading) codestream.  Call this
+     *        function after calling restrict_input_resolution(), if
+     *        restrictions are needed.
+     */
+    void create();
+
+    /**
+     * @brief This call is to pull one row from the codestream, being
+     *        decoded.  The returned line_buf object holds one row from
+     *        the image; the returned comp_num tells the reader the
+     *        component to which this row belongs.
+     *
+     * @param comp_num returns the component to which the returned
+     *                 line_buf object belongs.
+     * @return line_buf* this object holds one row of the component indexed
+     *                   by comp_num.
+     */
+    line_buf* pull(ui32 &comp_num);
+
+    /**
+     * @brief Call this function to close the underlying file; works for both
+     *        encoding and decoding codestreams.
+     *
+     */
+    void close();
+
+    /**
+     * @brief Returns the underlying SIZ marker segment object
+     *
+     * @return param_siz This object holds SIZ marker segment information,
+     *                   which deals with codestream dimensions, number
+     *                   of components, bit depth, ... etc.
+     */
+    param_siz access_siz();
+
+    /**
+     * @brief Returns the underlying COD marker segment object
+     *
+     * @return param_cod This object holds COD marker segment information,
+     *                   which deals with coding parameters, such as
+     *                   codeblock sizes, progression order, reversible,
+     *                   ... etc.
+     */
+    param_cod access_cod();
+
+    /**
+     * @brief Returns the underlying QCD marker segment object
+     *
+     * @return param_qcd This object holds QCD marker segment information,
+     *                   which deals with quantization parameters --
+     *                   quantization step size for each subband.
+     */
+    param_qcd access_qcd();
+
+    /**
+     * @brief Returns the underlying NLT marker segment object
+     *
+     * @return param_nlt This object holds NLT marker segment information,
+     *                   which deals with non-linearity point transformation
+     *                   for each component.
+     */
+    param_nlt access_nlt();
+
+    /**
+     * @brief Query if the codestream extraction is planar or not.
+     * See the documentation for ojph::codestream::set_planar()
+     *
+     * @return true if it is planar
+     * @return false if it is not planar (interleaved)
+     */
+    bool is_planar() const;
+
+  private:
+    local::codestream* state;
+  };
+
+}
+
+#endif // !OJPH_CODESTREAM_H
diff --git a/src/core/common/ojph_defs.h b/src/core/openjph/ojph_defs.h
similarity index 98%
rename from src/core/common/ojph_defs.h
rename to src/core/openjph/ojph_defs.h
index 67221641..d9a5d2cb 100644
--- a/src/core/common/ojph_defs.h
+++ b/src/core/openjph/ojph_defs.h
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
diff --git a/src/core/common/ojph_file.h b/src/core/openjph/ojph_file.h
similarity index 55%
rename from src/core/common/ojph_file.h
rename to src/core/openjph/ojph_file.h
index 7faa6b0f..72d99310 100644
--- a/src/core/common/ojph_file.h
+++ b/src/core/openjph/ojph_file.h
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -71,37 +71,40 @@ namespace ojph {
 
 
   ////////////////////////////////////////////////////////////////////////////
-  class outfile_base
+  class OJPH_EXPORT outfile_base
   {
   public:
-
+  public:
+    enum seek : int {
+      OJPH_SEEK_SET = SEEK_SET,
+      OJPH_SEEK_CUR = SEEK_CUR,
+      OJPH_SEEK_END = SEEK_END
+    };
     virtual ~outfile_base() {}
 
     virtual size_t write(const void *ptr, size_t size) = 0;
     virtual si64 tell() { return 0; }
+    virtual int seek(si64 offset, enum outfile_base::seek origin)
+    {
+      ojph_unused(offset); ojph_unused(origin);
+      return -1; /* always fail, to remind you to write an implementation */
+    }
     virtual void flush() {}
     virtual void close() {}
   };
 
   ////////////////////////////////////////////////////////////////////////////
-  class j2c_outfile : public outfile_base
+  class OJPH_EXPORT j2c_outfile : public outfile_base
   {
   public:
-    OJPH_EXPORT
     j2c_outfile() { fh = 0; }
-    OJPH_EXPORT
-    ~j2c_outfile() { if (fh) fclose(fh); }
+    ~j2c_outfile() override { if (fh) fclose(fh); }
 
-    OJPH_EXPORT
     void open(const char *filename);
-    OJPH_EXPORT
-    virtual size_t write(const void *ptr, size_t size);
-    OJPH_EXPORT
-    virtual si64 tell();
-    OJPH_EXPORT
-    virtual void flush();
-    OJPH_EXPORT
-    virtual void close();
+    size_t write(const void *ptr, size_t size) override;
+    si64 tell() override;
+    void flush() override;
+    void close() override;
 
   private:
     FILE *fh;
@@ -120,82 +123,147 @@ namespace ojph {
    *
    *  memory data can be accessed using get_data()
    */
-  class mem_outfile : public outfile_base
+  class OJPH_EXPORT mem_outfile : public outfile_base
   {
   public:
     /**  A constructor */
-    OJPH_EXPORT
     mem_outfile();
     /**  A destructor */
-    OJPH_EXPORT
-    ~mem_outfile();
-
-    /**  Call this function to open a memory file.
-	 *
+    ~mem_outfile() override;
+
+    mem_outfile(mem_outfile const&) = delete;
+    mem_outfile& operator=(mem_outfile const&) = delete;
+
+    /**
+     * Move construction leaves the moved-from value in default constructed state
+     * and transfers ownership of the internal state to the moved-to instance.
+     **/
+    mem_outfile(mem_outfile &&) noexcept;
+    /**
+     * move assignment with the same ownership transfer semantics as
+     * move construction.
+     **/
+    mem_outfile& operator=(mem_outfile&&) noexcept;
+
+    /**
+     *  @brief Call this function to open a memory file.
+     *
      *  This function creates a memory buffer to be used for storing
      *  the generated j2k codestream.
      *
      *  @param initial_size is the initial memory buffer size.
      *         The default value is 2^16.
+     *  @param clear_mem if set to true, all allocated memory is reset to 0
      */
-    OJPH_EXPORT
-    void open(size_t initial_size = 65536);
+    void open(size_t initial_size = 65536, bool clear_mem = false);
 
-    /**  Call this function to write data to the memory file.
-	 *
+    /**
+     *  @brief Call this function to write data to the memory file.
+     *
      *  This function adds new data to the memory file.  The memory buffer
      *  of the file grows as needed.
      *
-     *  @param ptr is the address of the new data.
+     *  @param ptr is a pointer to new data.
      *  @param size the number of bytes in the new data.
      */
-    OJPH_EXPORT
-    virtual size_t write(const void *ptr, size_t size);
+    size_t write(const void *ptr, size_t size) override;
 
-    /** Call this function to know the file size (i.e., number of bytes used
-     *  to store the file).
+    /**
+     *  @brief Call this function to know the file size (i.e., number of
+     *         bytes used to store the file).
      *
      *  @return the file size.
      */
-    OJPH_EXPORT
-    virtual si64 tell() { return cur_ptr - buf; }
+    si64 tell() override { return cur_ptr - buf; }
+
+    /**
+     *  @brief Call this function to change write pointer location; the
+     *         function can expand file storage.
+     *
+     *  @return 0 on success, non-zero otherwise (not used).
+     */
+    int seek(si64 offset, enum outfile_base::seek origin) override;
 
     /** Call this function to close the file and deallocate memory
-	 *
+     *
      *  The object can be used again after calling close
      */
-    OJPH_EXPORT
-    virtual void close();
+    void close() override;
 
-    /** Call this function to access memory file data.
-	 *
+    /**
+     *  @brief Call this function to access memory file data.
+     *
      *  It is not recommended to store the returned value because buffer
      *  storage address can change between write calls.
      *
      *  @return a constant pointer to the data.
      */
-    OJPH_EXPORT
     const ui8* get_data() { return buf; }
 
-    /** Call this function to access memory file data (for const objects)
-	 *
+    /**
+     *  @brief Call this function to access memory file data (for const
+     *         objects)
+     *
      *  This is similar to the above function, except that it can be used
      *  with constant objects.
      *
      *  @return a constant pointer to the data.
      */
-    OJPH_EXPORT
     const ui8* get_data() const { return buf; }
 
+    /**
+     *  @brief Call this function to write the memory file data to a file
+     *
+     */
+    void write_to_file(const char *file_name) const;
+
+    /**
+     *  @brief Call this function to get the used size of the memory file.
+     *
+     *  @return the used size of the memory file in bytes.
+     */
+    size_t get_used_size() const { return used_size; }
+
+    /**
+     *  @brief Call this function to get the total buffer size of the memory
+     *         file including unused space (this is the allocated memory).
+     *
+     *  @return the full size of the memory file in bytes.
+     */
+     size_t get_buf_size() const { return buf_size; }
+
+  private:
+
+    /**
+     * @brief A utility function to swap the contents of two instances
+     */
+    void swap(mem_outfile& other) noexcept;
+  
+    /**
+     *  @brief This function expands storage by x1.5 needed space.
+     *
+     *  It sets cur_ptr correctly, and clears the extended area of the
+     *  buffer.  It optionally clears the whole buffer.
+     *
+     * @param new_size   New size of the buffer
+     * @param clear_all  Set to true to clear whole buffer, not just expansion
+     */
+    void expand_storage(size_t new_size, bool clear_all);
+
   private:
     bool is_open;
+    bool clear_mem;
     size_t buf_size;
+    size_t used_size;
     ui8 *buf;
     ui8 *cur_ptr;
+
+  private:
+    static const size_t ALIGNED_ALLOC_MASK = 4096 - 1;
   };
 
   ////////////////////////////////////////////////////////////////////////////
-  class infile_base
+  class OJPH_EXPORT infile_base
   {
   public:
     enum seek : int {
@@ -216,64 +284,63 @@ namespace ojph {
   };
 
   ////////////////////////////////////////////////////////////////////////////
-  class j2c_infile : public infile_base
+  class OJPH_EXPORT j2c_infile : public infile_base
   {
   public:
-    OJPH_EXPORT
     j2c_infile() { fh = 0; }
-    OJPH_EXPORT
-    ~j2c_infile() { if (fh) fclose(fh); }
+    ~j2c_infile() override { if (fh) fclose(fh); }
 
-    OJPH_EXPORT
     void open(const char *filename);
 
     //read reads size bytes, returns the number of bytes read
-    OJPH_EXPORT
-    virtual size_t read(void *ptr, size_t size);
+    size_t read(void *ptr, size_t size) override;
     //seek returns 0 on success
-    OJPH_EXPORT
-    virtual int seek(si64 offset, enum infile_base::seek origin);
-    OJPH_EXPORT
-    virtual si64 tell();
-    OJPH_EXPORT
-    virtual bool eof() { return feof(fh) != 0; }
-    OJPH_EXPORT
-    virtual void close();
+    int seek(si64 offset, enum infile_base::seek origin) override;
+    si64 tell() override;
+    bool eof() override { return feof(fh) != 0; }
+    void close() override;
 
   private:
     FILE *fh;
-
   };
 
   ////////////////////////////////////////////////////////////////////////////
-  class mem_infile : public infile_base
+  class OJPH_EXPORT mem_infile : public infile_base
   {
   public:
-    OJPH_EXPORT
     mem_infile() { close(); }
-    OJPH_EXPORT
-    ~mem_infile() { }
+    ~mem_infile() override { }
+
+    mem_infile(mem_infile const&) = delete;
+    mem_infile& operator=(mem_infile const&) = delete;
+
+    /**
+     * Move construction leaves the moved-from value in default constructed state
+     * and transfers ownership of the internal state to the moved-to instance.
+     **/
+    mem_infile(mem_infile &&) noexcept;
+    /**
+     * move assignment with the same ownership transfer semantics as
+     * move construction.
+     **/
+    mem_infile& operator=(mem_infile&&) noexcept;
 
-    OJPH_EXPORT
     void open(const ui8* data, size_t size);
 
     //read reads size bytes, returns the number of bytes read
-    OJPH_EXPORT
-    virtual size_t read(void *ptr, size_t size);
+    size_t read(void *ptr, size_t size) override;
     //seek returns 0 on success
-    OJPH_EXPORT
-    virtual int seek(si64 offset, enum infile_base::seek origin);
-    OJPH_EXPORT
-    virtual si64 tell() { return cur_ptr - data; }
-    OJPH_EXPORT
-    virtual bool eof() { return cur_ptr >= data + size; }
-    OJPH_EXPORT
-    virtual void close() { data = cur_ptr = NULL; size = 0; }
+    int seek(si64 offset, enum infile_base::seek origin) override;
+    si64 tell() override { return cur_ptr - data; }
+    bool eof() override { return cur_ptr >= data + size; }
+    void close() override { data = cur_ptr = NULL; size = 0; }
 
   private:
+    // swap the contents of two instances
+    void swap(mem_infile&) noexcept;
+
     const ui8 *data, *cur_ptr;
     size_t size;
-
   };
 
 
diff --git a/src/core/common/ojph_mem.h b/src/core/openjph/ojph_mem.h
similarity index 61%
rename from src/core/common/ojph_mem.h
rename to src/core/openjph/ojph_mem.h
index 712727c0..d9f22b54 100644
--- a/src/core/common/ojph_mem.h
+++ b/src/core/openjph/ojph_mem.h
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -45,17 +45,23 @@
 #include <type_traits>
 
 #include "ojph_arch.h"
+#include "ojph_message.h"
 
 namespace ojph {
 
+  extern "C" {
+    void* ojph_aligned_malloc(size_t alignment, size_t size);
+    void ojph_aligned_free(void* pointer);
+  }
+
   /////////////////////////////////////////////////////////////////////////////
   class mem_fixed_allocator
   {
   public:
     mem_fixed_allocator()
     {
-      avail_obj = avail_data = store = NULL;
-      avail_size_obj = avail_size_data = size_obj = size_data = 0;
+      store = NULL; allocated_data = 0;
+      restart();
     }
     ~mem_fixed_allocator()
     {
@@ -76,13 +82,30 @@ namespace ojph {
 
     void alloc()
     {
-      assert(store == NULL);
-      avail_obj = store = malloc(size_data + size_obj);
+      assert(preallocation);
+      if (size_data + size_obj > allocated_data)
+      {
+        // We should be here once only, because, in subsequent calls, we
+        // should have size_data + size_obj <= allocated_data
+        free(store);
+        allocated_data = size_data + size_obj;
+        allocated_data = allocated_data + (allocated_data + 19) / 20; // 5%
+        store = malloc(allocated_data);
+        if (store == NULL)
+          OJPH_ERROR(0x00090001, "malloc failed");
+      }
+      avail_obj = store;
       avail_data = (ui8*)store + size_obj;
-      if (store == NULL)
-        throw "malloc failed";
       avail_size_obj = size_obj;
       avail_size_data = size_data;
+      preallocation = false;
+    }
+
+    void restart()
+    {
+      avail_obj = avail_data = NULL;
+      avail_size_obj = avail_size_data = size_obj = size_data = 0;
+      preallocation = true;
     }
 
     template<typename T>
@@ -103,7 +126,7 @@ namespace ojph {
     template<typename T, int N>
     void pre_alloc_local(size_t num_ele, ui32 pre_size, size_t& sz)
     {
-      assert(store == NULL);
+      assert(preallocation);
       num_ele = calc_aligned_size<T, N>(num_ele);
       size_t total = (num_ele + pre_size) * sizeof(T);
       total += 2*N - 1;
@@ -115,7 +138,7 @@ namespace ojph {
     T* post_alloc_local(size_t num_ele, ui32 pre_size,
                         size_t& avail_sz, void*& avail_p)
     {
-      assert(store != NULL);
+      assert(!preallocation);
       num_ele = calc_aligned_size<T, N>(num_ele);
       size_t total = (num_ele + pre_size) * sizeof(T);
       total += 2*N - 1;
@@ -129,34 +152,51 @@ namespace ojph {
 
     void *store, *avail_data, *avail_obj;
     size_t size_data, size_obj, avail_size_obj, avail_size_data;
+    size_t allocated_data;
+    bool preallocation;
   };
 
   /////////////////////////////////////////////////////////////////////////////
-  struct line_buf
+  class line_buf
   {
-    template<typename T>
-    void pre_alloc(mem_fixed_allocator *p, size_t num_ele, ui32 pre_size)
-    {
-      memset(this, 0, sizeof(line_buf));
-      p->pre_alloc_data<T>(num_ele, pre_size);
-      size = num_ele;
-      this->pre_size = pre_size;
-    }
-    
-    template<typename T>
-    void finalize_alloc(mem_fixed_allocator *p);
+  public:
+    enum : ui32 {
+      LFT_UNDEFINED  = 0x00, // Type is undefined/uninitialized
+                             // These flags reflect data size in bytes
+      LFT_BYTE       = 0x01, // Set when data is 1 byte  (not used)
+      LFT_16BIT      = 0x02, // Set when data is 2 bytes (not used)
+      LFT_32BIT      = 0x04, // Set when data is 4 bytes
+      LFT_64BIT      = 0x08, // Set when data is 8 bytes
+      LFT_INTEGER    = 0x10, // Set when data is an integer, in other words
+                             // 32bit integer, not 32bit float
+      LFT_SIZE_MASK  = 0x0F, // To extract data size
+    };
+
+  public:
+    line_buf() : size(0), pre_size(0), flags(LFT_UNDEFINED), i32(0) {}
 
     template<typename T>
     void wrap(T *buffer, size_t num_ele, ui32 pre_size);
 
     size_t size;
     ui32 pre_size;
+    ui32 flags;
     union {
-      si32* i32;
-      float* f32;
+      si32* i32;  // 32bit integer type, used for lossless compression
+      si64* i64;  // 64bit integer type, used for lossless compression
+      float* f32; // float type, used for lossy compression
+      void* p;    // no type is associated with the pointer
     };
   };
 
+  /////////////////////////////////////////////////////////////////////////////
+  struct lifting_buf
+  {
+    lifting_buf() { line = NULL;  active = false; }
+    line_buf *line;
+    bool active;
+  };
+
   /////////////////////////////////////////////////////////////////////////////
   struct coded_lists
   {
@@ -183,38 +223,62 @@ namespace ojph {
   public:
     mem_elastic_allocator(ui32 chunk_size)
     : chunk_size(chunk_size)
-    { cur_store = store = NULL; total_allocated = 0; }
+    { cur_store = store = avail = NULL; total_allocated = 0; }
 
     ~mem_elastic_allocator()
     {
-      while (store) {
+      while (store) { // stores in use
         stores_list* t = store->next_store;
         free(store);
         store = t;
       }
+      while (avail) { // available stores
+        stores_list* t = avail->next_store;
+        free(avail);
+        avail = t;
+      }
     }
 
     void get_buffer(ui32 needed_bytes, coded_lists*& p);
+    void restart();
 
   private:
     struct stores_list
     {
+      // Payload (coded_lists + bitstream) must start at a multiple of 16 bytes.
+      // Otherwise coded_lists::buf can be 4 mod 8, which causes misalignment
+      // on 32-bit architectures. So round sizeof(stores_list) to next
+      // multiple of 16.
+      static constexpr ui32 stores_list_size16()
+      {
+        return (ui32) ((sizeof (stores_list) + 15u) & ~15u);
+      }
       stores_list(ui32 available_bytes)
       {
         this->next_store = NULL;
-        this->available = available_bytes;
-        this->data = (ui8*)this + sizeof(stores_list);
+        this->orig_size = this->available = available_bytes;
+        this->orig_data = this->data = (ui8*)this + stores_list_size16();
       }
-      static ui32 eval_store_bytes(ui32 available_bytes) 
+      void restart()
+      {
+        this->next_store = NULL;
+        this->available = this->orig_size;
+        this->data = this->orig_data;
+      }
+      static ui32 eval_store_bytes(ui32 available_bytes)
       { // calculates how many bytes need to be allocated
-        return available_bytes + (ui32)sizeof(stores_list);
+        return available_bytes + stores_list_size16();
       }
       stores_list *next_store;
-      ui32 available;
-      ui8* data;
+      ui8 *orig_data, *data;
+      ui32 orig_size, available;
     };
 
-    stores_list *store, *cur_store;
+    stores_list* allocate(stores_list** list, ui32 extended_bytes);
+
+    stores_list *store;
+    stores_list *cur_store;
+    stores_list *avail;
     size_t total_allocated;
     const ui32 chunk_size;
   };
diff --git a/src/core/openjph/ojph_message.h b/src/core/openjph/ojph_message.h
new file mode 100644
index 00000000..afb402db
--- /dev/null
+++ b/src/core/openjph/ojph_message.h
@@ -0,0 +1,292 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2019, Aous Naman
+// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2019, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_message.h
+// Author: Aous Naman
+// Date: 29 August 2019
+//***************************************************************************/
+
+#ifndef OJPH_MESSAGE_H
+#define OJPH_MESSAGE_H
+
+#include <cstring>
+#include "ojph_arch.h"
+
+namespace ojph {
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief This enum is used to specify the level of severity of a message
+   *        while processing markers
+   *
+   */
+  enum OJPH_MSG_LEVEL : int
+  {
+    OJPH_MSG_ALL_MSG = 0,  // uninitialized or print all message
+    OJPH_MSG_INFO = 1,     // info message
+    OJPH_MSG_WARN = 2,     // warning message
+    OJPH_MSG_ERROR = 3,    // error message (the highest severity)
+    OJPH_MSG_NO_MSG = 4,   // no message (higher severity for message printing
+                           // only)
+  };
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   *  @remark
+   *   There are 3 levels of messaging; they are in order of level of
+   *   severity: INFO, WARNING, and ERROR.  ERROR is the most severe and
+   *   code execution must be terminated.
+   *
+   *  @remark
+   *   The library provides two ways to customize the reporting associated with
+   *   each messaging level:
+   *   1. Calling set_XXXX_stream; this sets the library's output file stream
+   *      to a user defined stream, such as std_err or a log file; it can
+   *      also be set to NULL to prevent reporting.
+   *   2. Calling configure_XXXX to pass a pointer to an object from a class
+   *      derived from the corresponding message_XXXX class. The derived
+   *      class must override the virtual operator() to perform the desired
+   *      behaviour.  Remember for message_error, the user must throw an
+   *      exception at the end of the implementation of operator().
+   *
+   *   The customization is global, and cannot be separately tailored for
+   *   each decoder's instantiation.
+   */
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief This is the base class from which all messaging levels are derived
+   *
+   *  Importantly, it declares the base virtual operator() that must be defined
+   *  in all derived classes.
+   */
+  class OJPH_EXPORT message_base {
+  public:
+    /**
+     * @brief Prints a message and for errors throws an exception.
+     *        All derived classes must override this virtual function.
+     *
+     * @param warn_code Message code (integer) for identifications.
+     * @param file_name The file name where the message originates.
+     * @param line_num  The line number where the message originates.
+     * @param fmt       The format of the message; this is printf format.
+     * @param ...       A variable number of parameters to print.  This is
+     *                  the parameters you would pass to printf.
+     */
+      virtual void operator() (int warn_code, const char* file_name,
+        int line_num, const char *fmt, ...) = 0;
+  };
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief Derived from message_base to handle info messages
+   */
+  class OJPH_EXPORT message_info : public message_base
+  {
+    public:
+      /**
+       * @brief See the base message_base::operator() for details about
+       *        parameters
+       */
+      virtual void operator() (int info_code, const char* file_name,
+        int line_num, const char* fmt, ...);
+  };
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief Replaces the info output file from the default stdout to user
+   *        defined output file.
+   *
+   * @param s A pointer to the desired output file; it can be stdout, stderr,
+   *          a log file, or NULL if no info messages are desired.
+   */
+  OJPH_EXPORT
+    void set_info_stream(FILE* s);
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief This overrides the default behaviour of handling info messages.
+   *
+   * @param info An object derived from message_info to implement the desired
+   *             behaviour.
+   */
+  OJPH_EXPORT
+    void configure_info(message_info* info);
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief Get the info message object, whose operator() member function is
+   *        called for info messages -- See the macros below.
+   *
+   * @return message_info* returns the active message_info object, or an object
+   *         of the message_info-derived class if one was set.  This object
+   *         handles info messages.  This is mainly to be used with the macros
+   *         below.
+   */
+  OJPH_EXPORT
+    message_info* get_info();
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief Derived from message_base to handle warning messages
+   */
+  class OJPH_EXPORT message_warning : public message_base
+  {
+    public:
+      /**
+       * @brief See the base message_base::operator() for details about
+       *        parameters
+       */
+      virtual void operator() (int warn_code, const char* file_name,
+        int line_num, const char* fmt, ...);
+  };
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief Replaces the warning output file from the default stdout to user
+   *        defined output file.
+   *
+   * @param s A pointer to the desired output file; it can be stdout, stderr,
+   *          a log file, or NULL if no warning messages are desired.
+   */
+  OJPH_EXPORT
+    void set_warning_stream(FILE* s);
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief This overrides the default behaviour of handling warning messages.
+   *
+   * @param warn An object derived from message_warning to implement the
+   *             desired behaviour.
+   */
+  OJPH_EXPORT
+    void configure_warning(message_warning* warn);
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief Get the warning message object, whose operator() member function is
+   *        called for warning messages -- See the macros below.
+   *
+   * @return message_warning* returns the active message_warning object, or an
+   *         object of the message_warning-derived class if one was set.  This
+   *         object handles warning messages.  This is mainly to be used with
+   *         the macros below.
+   */
+  OJPH_EXPORT
+    message_warning* get_warning();
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief Derived from message_base to handle error messages
+   */
+  class OJPH_EXPORT message_error : public message_base
+  {
+    public:
+      /**
+       * @brief See the base message_base::operator() for details about
+       *        parameters
+       */
+      virtual void operator() (int warn_code, const char* file_name,
+        int line_num, const char *fmt, ...);
+  };
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief Replaces the error output file from the default stderr to user
+   *        defined output file.
+   *
+   * @param s A pointer to the desired output file; it can be stdout, stderr,
+   *          a log file, or NULL if no error messages are desired.
+   */
+  OJPH_EXPORT
+    void set_error_stream(FILE *s);
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief This overrides the default behaviour of handling error messages.
+   *
+   * @param error An object derived from message_error to implement the
+   *              desired behaviour.  Remember to throw an exception
+   *              at the end.
+   */
+  OJPH_EXPORT
+    void configure_error(message_error* error);
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief Get the error message object, whose operator() member function is
+   *        called for error messages -- See the macros below.
+   *
+   * @return message_error* returns the active message_error object, or an
+   *         object of the message_error-derived class if one was set.  This
+   *         object handles error messages.  This is mainly to be used with
+   *         the macros below.
+   */
+  OJPH_EXPORT
+    message_error* get_error();
+
+  //////////////////////////////////////////////////////////////////////////////
+  /**
+   * @brief Sets the minimum severity of the message to be reported.
+   *
+   * @param level is the level of the message severity; values are defined in
+   *              OJPH_MSG_LEVEL.
+   */
+  OJPH_EXPORT
+    void set_message_level(OJPH_MSG_LEVEL level);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+/**
+ * @brief MACROS to remove the directory name from the file name
+ */
+#if (defined OJPH_OS_WINDOWS)
+  #define __OJPHFILE__ \
+    (strrchr(__FILE__, '\\') ? strrchr(__FILE__, '\\') + 1 : __FILE__)
+#else
+  #define __OJPHFILE__ \
+    (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+/**
+ * @brief MACROs to insert file and line number for info, warning, and error
+ */
+#define OJPH_INFO(t, ...) \
+  { ojph::get_info()[0](t, __OJPHFILE__, __LINE__, __VA_ARGS__); }
+#define OJPH_WARN(t, ...) \
+  { ojph::get_warning()[0](t, __OJPHFILE__, __LINE__, __VA_ARGS__); }
+#define OJPH_ERROR(t, ...) \
+  { ojph::get_error()[0](t, __OJPHFILE__, __LINE__,__VA_ARGS__); }
+
+
+#endif // !OJPH_MESSAGE_H
diff --git a/src/core/openjph/ojph_params.h b/src/core/openjph/ojph_params.h
new file mode 100644
index 00000000..c55048db
--- /dev/null
+++ b/src/core/openjph/ojph_params.h
@@ -0,0 +1,306 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2019, The University of New South Wales, Australia
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_params.h
+// Author: Aous Naman
+// Date: 28 August 2019
+//***************************************************************************/
+
+
+#ifndef OJPH_PARAMS_H
+#define OJPH_PARAMS_H
+
+#include "ojph_arch.h"
+#include "ojph_base.h"
+
+namespace ojph {
+
+  /***************************************************************************/
+  // defined here
+  class param_siz;
+  class param_cod;
+  class param_coc;
+  class param_qcd;
+  class param_cap;
+  class param_nlt;
+  class codestream;
+
+  /***************************************************************************/
+  // prototyping from local
+  namespace local {
+    struct param_siz;
+    struct param_cod;
+    struct param_coc;
+    struct param_qcd;
+    struct param_cap;
+    struct param_nlt;
+    class codestream;
+  }
+
+  /***************************************************************************/
+  class OJPH_EXPORT param_siz
+  {
+  public:
+    param_siz(local::param_siz *p) : state(p) {}
+
+    //setters
+    void set_image_extent(point extent);
+    void set_tile_size(size s);
+    void set_image_offset(point offset);
+    void set_tile_offset(point offset);
+    void set_num_components(ui32 num_comps);
+    void set_component(ui32 comp_num, const point& downsampling,
+                       ui32 bit_depth, bool is_signed);
+
+    //getters
+    point get_image_extent() const;
+    point get_image_offset() const;
+    size get_tile_size() const;
+    point get_tile_offset() const;
+    ui32 get_num_components() const;
+    ui32 get_bit_depth(ui32 comp_num) const;
+    bool is_signed(ui32 comp_num) const;
+    point get_downsampling(ui32 comp_num) const;
+
+    //deeper getters
+    ui32 get_recon_width(ui32 comp_num) const;
+    ui32 get_recon_height(ui32 comp_num) const;
+
+  private:
+    local::param_siz* state;
+  };
+
+  /***************************************************************************/
+  class OJPH_EXPORT param_cod
+  {
+  public:
+    param_cod(local::param_cod* p) : state(p) {}
+
+    void set_num_decomposition(ui32 num_decompositions);
+    void set_block_dims(ui32 width, ui32 height);
+    void set_precinct_size(int num_levels, size* precinct_size);
+    void set_progression_order(const char *name);
+    void set_color_transform(bool color_transform);
+    void set_reversible(bool reversible);
+    param_coc get_coc(ui32 component_idx);
+
+    ui32 get_num_decompositions() const;
+    size get_block_dims() const;
+    size get_log_block_dims() const;
+    bool is_reversible() const;
+    size get_precinct_size(ui32 level_num) const;
+    size get_log_precinct_size(ui32 level_num) const;
+    int get_progression_order() const;
+    const char* get_progression_order_as_string() const;
+    int get_num_layers() const;
+    bool is_using_color_transform() const;
+    bool packets_may_use_sop() const;
+    bool packets_use_eph() const;
+    bool get_block_vertical_causality() const;
+
+  private:
+    local::param_cod* state;
+  };
+
+  /***************************************************************************/
+  class OJPH_EXPORT param_coc
+  {
+  public:
+    param_coc(local::param_cod* p) : state(p) {}
+
+    void set_num_decomposition(ui32 num_decompositions);
+    void set_block_dims(ui32 width, ui32 height);
+    void set_precinct_size(int num_levels, size* precinct_size);
+    void set_reversible(bool reversible);
+
+    ui32 get_num_decompositions() const;
+    size get_block_dims() const;
+    size get_log_block_dims() const;
+    bool is_reversible() const;
+    size get_precinct_size(ui32 level_num) const;
+    size get_log_precinct_size(ui32 level_num) const;
+    bool get_block_vertical_causality() const;
+
+  private:
+    local::param_cod* state;
+  };
+
+  /***************************************************************************/
+  /**
+    * @brief Quantization parameters object
+    * 
+    */
+  class OJPH_EXPORT param_qcd
+  {
+  public:
+    param_qcd(local::param_qcd* p) : state(p) {}
+
+    /**
+     * @brief Set the irreversible quantization base delta.  
+     *  
+     * This represents the default base delta and influences QCD marker 
+     * segment
+     * 
+     * @param delta 
+     */
+    void set_irrev_quant(float delta);
+
+    /**
+     * @brief Set the irreversible quantization base delta for a specific 
+     *        component
+     * 
+     * This represents the default base delta for component comp_idx, and 
+     * influences QCC marker segment for the component, inserting one
+     * if needed, which is usually the case.
+     * 
+     * @param comp_idx 
+     * @param delta 
+     */
+    void set_irrev_quant(ui32 comp_idx, float delta);
+
+  private:
+    local::param_qcd* state;
+  };
+
+  /*************************************************************************/
+  /**
+    * @brief non-linearity point transformation object
+    *        (implements NLT marker segment)
+    * 
+    *  There are a few things to know here.  
+      * The NLT marker segment contains the nonlinearity type and the 
+      * bit depth and signedness of the component to which it applies.
+      * There is the default component ALL_COMPS which applies to all 
+      * components unless it is overridden by another NLT segment marker.
+      * The library checks that the settings make sense, and also makes
+      * sure that bit depth and signedness are correct, creating any missing
+      * NLT marker segments in the process.
+      * If all components have the same bit depth and signedness, and need
+      * nonlinearity type 3 (Binary Complement to Sign Magnitude Conversion), 
+      * then the best option is to set ALL_COMPS to type 3.
+      * Otherwise, the best option is to set type 3 only to components that 
+      * need it, leaving out the default ALL_COMPS nonlinearity not set.
+      * Another option is for the end-user to set ALL_COMPS to type 3,
+      * and then add exceptions for the components that do not need type 3,
+      * by setting them to type 0.
+      * 
+      * The library, during validity check, which is run when the codestream
+      * is created for writing, will do the following:
+      * -- If ALL_COMPS is set to type 0, it will be ignored, and the 
+      * codestream will NOT have the corresponding NLT marker segment.
+      * -- If ALL_COMPS is set to type 3, then the following will happen:
+      *   - If all the components (except those with type 0 set for them) have 
+      *   the same bit depth and signedness, then the ALL_COMPS NLT marker 
+      *   segment will be respected and inserted into the codestream.
+      *   Of course, components with NLT 0 will also have the corresponding
+      *   NLT marker segment inserted.
+      *   - If components, for which no NLT type 0 is specified, have differing
+      *   bit depth or signedness, then the ALL_COMPS will be ignored, and 
+      *   NLT markers are inserted for each component that needs type 3.
+      * Components that have their component field larger than the number of
+      * components in the codestream are removed.
+      * 
+      * It is also worth noting that type 3 nonlinearity has no effect on
+      * positive image samples.  It is also not recommended for integer-valued 
+      * types. It is only recommended for floating-point image samples, for 
+      * which some of the samples are negative, where type 3 nonlinearity 
+      * should be beneficial.  This is because the encoding engine expects 
+      * two's-complement representation for negative values while floating-point
+      * numbers have a sign bit followed by an exponent, which has a biased 
+      * integer representation.  The core idea is to make floating-point
+      * representation more compatible with integer representation.
+
+    * 
+    */
+  class OJPH_EXPORT param_nlt
+  {
+  public:
+    enum special_comp_num : ui16 { ALL_COMPS = 65535 };
+    enum nonlinearity : ui8 { 
+      OJPH_NLT_NO_NLT = 0,                // supported
+      OJPH_NLT_GAMMA_STYLE_NLT = 1,       // not supported
+      OJPH_NLT_LUT_STYLE_NLT = 2,         // not supported
+      OJPH_NLT_BINARY_COMPLEMENT_NLT = 3, // supported
+      OJPH_NLT_UNDEFINED = 255          // This is used internally and is 
+                                          // not part of the standard 
+    };
+  public:
+    param_nlt(local::param_nlt* p) : state(p) {}
+
+    /**
+      * @brief enables or disables type 3 nonlinearity for a component 
+      *        or the default setting
+      * 
+      * When creating a codestream for writing, call this function before
+      * you call codestream::write_headers.
+      * 
+      * 
+      * @param comp_num: component number, or 65535 for the default setting
+      * @param nl_type: desired non-linearity from enum nonlinearity
+      */
+    void set_nonlinear_transform(ui32 comp_num, ui8 nl_type);
+
+    /**
+      * @brief get the nonlinearity type associated with comp_num, which 
+      *        should be one from enum nonlinearity
+      *
+      * @param comp_num: component number, or 65535 for the default setting
+      * @param bit_depth: returns the bit depth of the component/default
+      * @param is_signed: returns true if the component/default is signed
+      * @param nl_type: nonlinearity type
+      * @return true if the nonlinearity for comp_num is set
+      */
+    bool get_nonlinear_transform(ui32 comp_num, ui8& bit_depth, 
+                                 bool& is_signed, ui8& nl_type) const;
+
+  private:
+    local::param_nlt* state;
+  };
+
+  /***************************************************************************/
+  class OJPH_EXPORT comment_exchange
+  {
+    friend class local::codestream;
+  public:
+    comment_exchange() : data(NULL), len(0), Rcom(0) {}
+    void set_string(const char* str);
+    void set_data(const char* data, ui16 len);
+
+  private:
+    const char* data;
+    ui16 len;
+    ui16 Rcom;
+  };
+
+}
+
+#endif // !OJPH_PARAMS_H
diff --git a/src/core/common/ojph_version.h b/src/core/openjph/ojph_version.h
similarity index 93%
rename from src/core/common/ojph_version.h
rename to src/core/openjph/ojph_version.h
index 09812863..69aa12b9 100644
--- a/src/core/common/ojph_version.h
+++ b/src/core/openjph/ojph_version.h
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -34,5 +34,5 @@
 //***************************************************************************/
 
 #define OPENJPH_VERSION_MAJOR 0
-#define OPENJPH_VERSION_MINOR 10
-#define OPENJPH_VERSION_PATCH beta0
+#define OPENJPH_VERSION_MINOR 27
+#define OPENJPH_VERSION_PATCH 0
diff --git a/src/core/others/ojph_arch.cpp b/src/core/others/ojph_arch.cpp
index 3c3c2abd..b44aa1cd 100644
--- a/src/core/others/ojph_arch.cpp
+++ b/src/core/others/ojph_arch.cpp
@@ -41,7 +41,9 @@
 
 namespace ojph {
 
-#ifndef OJPH_DISABLE_INTEL_SIMD
+#ifndef OJPH_DISABLE_SIMD
+
+  #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
 
   ////////////////////////////////////////////////////////////////////////////
   // This snippet is borrowed from Intel; see for example
@@ -157,6 +159,80 @@ namespace ojph {
     }
     return true;
   }
+  #elif defined(OJPH_ARCH_ARM)
+
+    #if !defined(OJPH_OS_LINUX) && !defined(OJPH_OS_FREEBSD) && !defined(OJPH_OS_OPENBSD) // Windows/Apple/Android
+
+    bool init_cpu_ext_level(int& level) {
+      level = ARM_CPU_EXT_LEVEL_ASIMD;
+      return true;
+    }
+
+    #else  // Linux/FreeBSD/OpenBSD
+
+      #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) // 64-bit ARM
+
+        #include <sys/auxv.h>
+        #ifdef OJPH_OS_LINUX
+          #include <asm/hwcap.h>
+        #endif
+
+        bool init_cpu_ext_level(int& level) {
+          #ifdef OJPH_OS_LINUX
+            unsigned long hwcaps = getauxval(AT_HWCAP);
+            unsigned long hwcaps2 = getauxval(AT_HWCAP2);
+          #else
+            unsigned long hwcaps = 0;
+            unsigned long hwcaps2 = 0;
+            elf_aux_info(AT_HWCAP, &hwcaps, sizeof(hwcaps));
+            elf_aux_info(AT_HWCAP2, &hwcaps2, sizeof(hwcaps2));
+          #endif
+
+          level = ARM_CPU_EXT_LEVEL_GENERIC;
+          if (hwcaps & HWCAP_ASIMD) {
+            level = ARM_CPU_EXT_LEVEL_ASIMD;
+            if (hwcaps & HWCAP_SVE) {
+              level = ARM_CPU_EXT_LEVEL_SVE;
+              if (hwcaps2 & HWCAP2_SVE2)
+                level = ARM_CPU_EXT_LEVEL_SVE2;
+            }
+          }
+          return true;          
+        }
+
+      #else // 32-bit ARM
+
+        #include <sys/auxv.h>
+        #ifdef OJPH_OS_LINUX
+          #include <asm/hwcap.h>
+        #endif
+
+        bool init_cpu_ext_level(int& level) {
+          #ifdef OJPH_OS_LINUX
+            unsigned long hwcaps = getauxval(AT_HWCAP);
+          #else
+            unsigned long hwcaps = 0;
+            elf_aux_info(AT_HWCAP, &hwcaps, sizeof(hwcaps));
+          #endif
+          level = ARM_CPU_EXT_LEVEL_GENERIC;
+          if (hwcaps & HWCAP_NEON)
+            level = ARM_CPU_EXT_LEVEL_NEON;
+          return true;
+        }
+
+      #endif // end of 64-bit ARM
+
+    #endif
+
+  #else // architectures other than Intel/AMD and ARM
+
+  ////////////////////////////////////////////////////////////////////////////
+  bool init_cpu_ext_level(int& level) {
+    level = 0;
+    return true;
+  }
+
+  #endif // end of architecture selection (x86 / ARM / other)
 
 #elif defined(OJPH_ENABLE_WASM_SIMD) && defined(OJPH_EMSCRIPTEN)
 
diff --git a/src/core/others/ojph_file.cpp b/src/core/others/ojph_file.cpp
index c454558b..e3456a4e 100644
--- a/src/core/others/ojph_file.cpp
+++ b/src/core/others/ojph_file.cpp
@@ -1,22 +1,22 @@
-//***************************************************************************/
+//***************************************************************************/
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -42,7 +42,9 @@
 
 #include <cassert>
 #include <cstddef>
+#include <utility>
 
+#include "ojph_mem.h"
 #include "ojph_file.h"
 #include "ojph_message.h"
 
@@ -94,53 +96,108 @@ namespace ojph {
     fh = NULL;
   }
 
-  //*************************************************************************/
+  ////////////////////////////////////////////////////////////////////////////
+  //
+  //
   // mem_outfile
-  //*************************************************************************/
+  //
+  //
+  ////////////////////////////////////////////////////////////////////////////
 
-  /**  */
+  ////////////////////////////////////////////////////////////////////////////
   mem_outfile::mem_outfile()
   {
-    is_open = false;
-    buf_size = 0;
-    buf = cur_ptr = NULL;
+    is_open = clear_mem = false;
+    buf_size = used_size = 0;
+    buf = cur_ptr = nullptr;
   }
 
-  /**  */
+  ////////////////////////////////////////////////////////////////////////////
+  void mem_outfile::swap(mem_outfile& other) noexcept {
+    std::swap(this->is_open,other.is_open);
+    std::swap(this->clear_mem,other.clear_mem);
+    std::swap(this->buf_size,other.buf_size);
+    std::swap(this->used_size,other.used_size);
+    std::swap(this->buf,other.buf);
+    std::swap(this->cur_ptr,other.cur_ptr);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
   mem_outfile::~mem_outfile()
   {
-    close();
+    if (buf)
+      ojph_aligned_free(buf);
+    is_open = clear_mem = false;
+    buf_size = used_size = 0;
+    buf = cur_ptr = NULL;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  mem_outfile::mem_outfile(mem_outfile&& rhs) noexcept: mem_outfile()
+  {
+    this->swap(rhs);
   }
 
-  /**  */
-  void mem_outfile::open(size_t initial_size /* = 65536 */)
+  ////////////////////////////////////////////////////////////////////////////
+  mem_outfile& mem_outfile::operator=(mem_outfile&& rhs) noexcept
+  {
+    if (this != &rhs) {
+      mem_outfile tmp(std::move(rhs));
+      this->swap(tmp);
+    }
+    return *this;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  void mem_outfile::open(size_t initial_size, bool clear_mem)
   {
     assert(this->is_open == false);
-    assert(this->buf_size == 0);
-    assert(this->buf == NULL);
-    assert(this->cur_ptr == NULL);
+    assert(this->cur_ptr == this->buf);
 
-    // do initial buffer allocation
+    // do initial buffer allocation or buffer expansion
     this->is_open = true;
-    this->buf_size = initial_size;
-    if (initial_size)
-      this->buf = (ui8*)malloc(this->buf_size);
+    this->clear_mem = clear_mem;
+    expand_storage(initial_size, this->clear_mem);
+    this->used_size = 0;
     this->cur_ptr = this->buf;
   }
 
-  /**  */
+  ////////////////////////////////////////////////////////////////////////////
   void mem_outfile::close() {
-    if (buf)
-      free(buf);
     is_open = false;
-    buf_size = 0;
-    buf = cur_ptr = NULL;
+    cur_ptr = buf;
   }
 
-  /** The function starts with a buffer size of 65536.  Then, whenever the
-   *  need arises, this buffer is expanded by a factor approx 1.5x
+  ////////////////////////////////////////////////////////////////////////////
+  /** The seek function expands the buffer whenever offset goes beyond
+   *  the buffer end
    */
-  size_t mem_outfile::write(const void *ptr, size_t size)
+  int mem_outfile::seek(si64 offset, enum outfile_base::seek origin)
+  {
+    if (origin == OJPH_SEEK_SET)
+      ; // do nothing
+    else if (origin == OJPH_SEEK_CUR)
+      offset += tell();
+    else if (origin == OJPH_SEEK_END)
+      offset += (si64)used_size;
+    else {
+      assert(0);
+      return -1;
+    }
+
+    if (offset < 0)  // offset before the start of file
+      return -1;
+
+    expand_storage((size_t)offset, false); // See if expansion is needed
+
+    cur_ptr = buf + offset;
+    return 0;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  /** When more room is needed, storage grows to ~1.5x the needed size
+   */
+  size_t mem_outfile::write(const void *ptr, size_t new_size)
   {
     assert(this->is_open);
     assert(this->buf_size);
@@ -148,24 +205,60 @@ namespace ojph {
     assert(this->cur_ptr);
 
     // expand buffer if needed to make sure it has room for this write
-    si64 used_size = tell(); //current used size
-    size_t new_used_size = (size_t)used_size + size; //needed size
-    if (new_used_size > this->buf_size) //only expand when there is need
+    size_t needed_size = (size_t)tell() + new_size; //needed size
+    expand_storage(needed_size, false);
+
+    // copy bytes into buffer and adjust cur_ptr
+    memcpy(this->cur_ptr, ptr, new_size);
+    cur_ptr += new_size;
+    used_size = ojph_max(used_size, (size_t)tell());
+
+    return new_size;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  void mem_outfile::write_to_file(const char *file_name) const
+  {
+    assert(is_open == false);
+    FILE *f = fopen(file_name, "wb");
+    if (f == NULL)
+      OJPH_ERROR(0x00060003, "failed to open %s for writing", file_name);
+    if (f != NULL)
+      if (fwrite(this->buf, 1, used_size, f) != used_size)
+        OJPH_ERROR(0x00060004, "failed writing to %s", file_name);
+    fclose(f);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  void mem_outfile::expand_storage(size_t needed_size, bool clear_all)
+  {
+    if (needed_size > buf_size)
     {
-      size_t new_buf_size = this->buf_size;
-      while (new_used_size > new_buf_size)
-        new_buf_size += new_buf_size >> 1; //expand by ~1.5x
+      needed_size += (needed_size + 1) >> 1; // x1.5
+      // expand buffer to multiples of (ALIGNED_ALLOC_MASK + 1)
+      needed_size = (needed_size + ALIGNED_ALLOC_MASK) & (~ALIGNED_ALLOC_MASK);
 
-      this->buf = (ui8*)realloc(this->buf, new_buf_size);
-      this->buf_size = new_buf_size;
-      this->cur_ptr = this->buf + used_size;
-    }
+      ui8* new_buf;
+      new_buf = (ui8*)ojph_aligned_malloc(ALIGNED_ALLOC_MASK + 1, needed_size);
+      if (new_buf == NULL)
+        OJPH_ERROR(0x00060005, "failed to allocate memory (%zu bytes)",
+          needed_size);
 
-    // copy bytes into buffer and adjust cur_ptr
-    memcpy(this->cur_ptr, ptr, size);
-    cur_ptr += size;
+      if (this->buf != NULL)
+      {
+        if (!clear_all)
+          memcpy(new_buf, this->buf, used_size);
+        ojph_aligned_free(this->buf);
+      }
+      this->cur_ptr = new_buf + tell();
+      this->buf = new_buf;
 
-    return size;
+      if (clear_mem && !clear_all) // will be cleared later
+        memset(this->buf + buf_size, 0, needed_size - this->buf_size);
+      this->buf_size = needed_size;
+    }
+    if (clear_all)
+      memset(this->buf, 0, this->buf_size);
   }
 
 
@@ -224,6 +317,22 @@ namespace ojph {
   //
   ////////////////////////////////////////////////////////////////////////////
 
+  ////////////////////////////////////////////////////////////////////////////
+  mem_infile::mem_infile(mem_infile&& rhs) noexcept: mem_infile()
+  {
+    this->swap(rhs);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  mem_infile& mem_infile::operator=(mem_infile&& rhs) noexcept
+  {
+    if (this != &rhs) {
+      mem_infile tmp(std::move(rhs));
+      this->swap(tmp);
+    }
+    return *this;
+  }
+
   ////////////////////////////////////////////////////////////////////////////
   void mem_infile::open(const ui8* data, size_t size)
   {
@@ -261,7 +370,7 @@ namespace ojph {
     }
     else if (origin == OJPH_SEEK_CUR)
     {
-      std::ptrdiff_t bytes_off = cur_ptr - data; bytes_off += offset;
+      si64 bytes_off = (si64)(cur_ptr - data) + offset;
       if (bytes_off >= 0 && (size_t)bytes_off <= size)
       {
         cur_ptr = data + bytes_off;
@@ -282,5 +391,12 @@ namespace ojph {
     return result;
   }
 
+  ////////////////////////////////////////////////////////////////////////////
+  void mem_infile::swap(mem_infile& other) noexcept
+  {
+    std::swap(this->data,other.data);
+    std::swap(this->cur_ptr,other.cur_ptr);
+    std::swap(this->size,other.size);
+  }
 
 }
diff --git a/src/core/others/ojph_mem.cpp b/src/core/others/ojph_mem.cpp
index b70d51ec..ca7a5616 100644
--- a/src/core/others/ojph_mem.cpp
+++ b/src/core/others/ojph_mem.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -51,36 +51,32 @@ namespace ojph {
 
   ////////////////////////////////////////////////////////////////////////////
   template<>
-  void line_buf::finalize_alloc<si32>(mem_fixed_allocator *p)
-  {
-    assert(p != 0 && size != 0);
-    i32 = p->post_alloc_data<si32>(size, pre_size);
-  }
-
-  ////////////////////////////////////////////////////////////////////////////
-  template<>
-  void line_buf::finalize_alloc<float>(mem_fixed_allocator *p)
+  void line_buf::wrap(si32 *buffer, size_t num_ele, ui32 pre_size)
   {
-    assert(p != 0 && size != 0);
-    f32 = p->post_alloc_data<float>(size, pre_size);
+    this->i32 = buffer;
+    this->size = num_ele;
+    this->pre_size = pre_size;
+    this->flags = LFT_32BIT | LFT_INTEGER;
   }
 
   ////////////////////////////////////////////////////////////////////////////
   template<>
-  void line_buf::wrap(si32 *buffer, size_t num_ele, ui32 pre_size)
+  void line_buf::wrap(float *buffer, size_t num_ele, ui32 pre_size)
   {
-    i32 = buffer;
+    this->f32 = buffer;
     this->size = num_ele;
     this->pre_size = pre_size;
+    this->flags = LFT_32BIT;
   }
 
   ////////////////////////////////////////////////////////////////////////////
   template<>
-  void line_buf::wrap(float *buffer, size_t num_ele, ui32 pre_size)
+  void line_buf::wrap(si64 *buffer, size_t num_ele, ui32 pre_size)
   {
-    f32 = buffer;
+    this->i64 = buffer;
     this->size = num_ele;
     this->pre_size = pre_size;
+    this->flags = LFT_64BIT | LFT_INTEGER;
   }
 
   ////////////////////////////////////////////////////////////////////////////
@@ -92,27 +88,39 @@ namespace ojph {
   ////////////////////////////////////////////////////////////////////////////
 
   ////////////////////////////////////////////////////////////////////////////
-  void mem_elastic_allocator::get_buffer(ui32 needed_bytes, coded_lists* &p)
+  mem_elastic_allocator::stores_list*
+  mem_elastic_allocator::allocate(mem_elastic_allocator::stores_list** list,
+                                  ui32 extended_bytes)
   {
-    ui32 extended_bytes = needed_bytes + (ui32)sizeof(coded_lists);
-
-    if (store == NULL)
+    ui32 bytes = ojph_max(extended_bytes, chunk_size);
+    if (avail != NULL && avail->orig_size >= bytes)
     {
-      ui32 bytes = ojph_max(extended_bytes, chunk_size);
-      ui32 store_bytes = stores_list::eval_store_bytes(bytes);
-      store = (stores_list*)malloc(store_bytes);
-      cur_store = store = new (store) stores_list(bytes);
-      total_allocated += store_bytes;
+      *list = avail;
+      avail = avail->next_store;
+      (*list)->restart();
+      return *list;
     }
-
-    if (cur_store->available < extended_bytes)
+    else
     {
-      ui32 bytes = ojph_max(extended_bytes, chunk_size);
       ui32 store_bytes = stores_list::eval_store_bytes(bytes);
-      cur_store->next_store = (stores_list*)malloc(store_bytes);
-      cur_store = new (cur_store->next_store) stores_list(bytes);
+      *list = (stores_list*) malloc(store_bytes);
       total_allocated += store_bytes;
+      return new (*list) stores_list(bytes);
     }
+  }
+
+  ////////////////////////////////////////////////////////////////////////////
+  void mem_elastic_allocator::get_buffer(ui32 needed_bytes, coded_lists* &p)
+  {
+    // Round up so each coded_lists (and coded_lists::buf) stays 16-byte aligned
+    // within the store; avoids alignment fault on 32-bit architectures
+    ui32 raw = needed_bytes + (ui32)sizeof (coded_lists);
+    ui32 extended_bytes = (raw + 15u) & ~15u;
+
+    if (store == NULL)
+      cur_store = store = allocate(&store, extended_bytes);
+    else if (cur_store->available < extended_bytes)
+      cur_store = allocate(&cur_store->next_store, extended_bytes);
 
     p = new (cur_store->data) coded_lists(needed_bytes);
 
@@ -121,4 +129,15 @@ namespace ojph {
     cur_store->data += extended_bytes;
   }
 
+  ////////////////////////////////////////////////////////////////////////////
+  void mem_elastic_allocator::restart()
+  {
+    // append the current list of stores to the end of the avail list
+    stores_list** p = &avail;
+    while (*p != NULL)
+      p = &((*p)->next_store);
+    *p = store;
+    cur_store = store = NULL;
+  }
+
 }
diff --git a/src/core/others/ojph_mem_c.c b/src/core/others/ojph_mem_c.c
new file mode 100644
index 00000000..8c66d242
--- /dev/null
+++ b/src/core/others/ojph_mem_c.c
@@ -0,0 +1,129 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2025, Aous Naman
+// Copyright (c) 2025, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2025, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_mem_c.c
+// Author: Aous Naman
+// Date: 17 October 2025
+//***************************************************************************/
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// OS detection definitions for C only
+////////////////////////////////////////////////////////////////////////////////
+#if (defined WIN32) || (defined _WIN32) || (defined _WIN64)
+#define OJPH_OS_WINDOWS
+#elif (defined __APPLE__)
+#define OJPH_OS_APPLE
+#elif (defined __ANDROID__)
+#define OJPH_OS_ANDROID
+#elif (defined __linux)
+#define OJPH_OS_LINUX
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Defines for dll in C only
+////////////////////////////////////////////////////////////////////////////////
+#if defined(OJPH_OS_WINDOWS) && defined(OJPH_BUILD_SHARED_LIBRARY)
+#define OJPH_EXPORT __declspec(dllexport)
+#else
+#define OJPH_EXPORT
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+#ifdef OJPH_OS_WINDOWS
+  OJPH_EXPORT void* ojph_aligned_malloc(size_t alignment, size_t size)
+  {
+    assert(alignment != 0 && (alignment & (alignment - 1)) == 0);
+    return _aligned_malloc(size, alignment);
+  }
+
+  OJPH_EXPORT void ojph_aligned_free(void* pointer)
+  {
+    _aligned_free(pointer);
+  }
+#elif (defined OJPH_ALIGNED_ALLOC_EXISTS)
+  void* ojph_aligned_malloc(size_t alignment, size_t size)
+  {
+    assert(alignment != 0 && (alignment & (alignment - 1)) == 0);
+    return aligned_alloc(alignment, size);
+  }
+
+  void ojph_aligned_free(void* pointer)
+  {
+    free(pointer);
+  }
+#elif (defined OJPH_POSIX_MEMALIGN_EXISTS)
+  void* ojph_aligned_malloc(size_t alignment, size_t size)
+  {
+    assert(alignment != 0 && (alignment & (alignment - 1)) == 0);
+    void *p = NULL;
+    int e = posix_memalign(&p, alignment, size);
+    return (e ? NULL : p);
+  }
+
+  void ojph_aligned_free(void* pointer)
+  {
+    free(pointer);
+  }
+#else
+  void* ojph_aligned_malloc(size_t alignment, size_t size)
+  {
+    assert(alignment != 0 && (alignment & (alignment - 1)) == 0);
+
+    // emulate aligned_alloc
+    void* orig_ptr = malloc(size + alignment + sizeof(void*));
+    if (orig_ptr == NULL)
+      return NULL; // Allocation failed
+
+    uintptr_t start_of_mem = (uintptr_t)orig_ptr + sizeof(void*);
+    uintptr_t aligned_addr = (start_of_mem + alignment - 1) & ~(alignment - 1);
+
+    void** ptr_to_orig_ptr = (void**)aligned_addr;
+    ptr_to_orig_ptr[-1] = orig_ptr;
+
+    return (void*)aligned_addr;
+  }
+
+  void ojph_aligned_free(void* pointer)
+  {
+    if (pointer) {
+      // Retrieve the original pointer stored just before aligned pointer
+      void** ptr_to_orig_ptr = (void**)pointer;
+      void* orig_ptr = ptr_to_orig_ptr[-1];
+
+      free(orig_ptr);
+    }
+  }
+#endif
diff --git a/src/core/others/ojph_message.cpp b/src/core/others/ojph_message.cpp
index d703b6ad..653f680a 100644
--- a/src/core/others/ojph_message.cpp
+++ b/src/core/others/ojph_message.cpp
@@ -35,6 +35,7 @@
 // Date: 29 August 2019
 //***************************************************************************/
 
+#include <cassert>
 #include <cstdio>
 #include <cstdarg>
 #include <stdexcept>
@@ -48,16 +49,17 @@ namespace ojph {
 
   ////////////////////////////////////////////////////////////////////////////
   message_info info;
-  message_info& local_info = info;
+  message_info* local_info = &info;
+  OJPH_MSG_LEVEL message_level = OJPH_MSG_ALL_MSG;
 
   ////////////////////////////////////////////////////////////////////////////
   void configure_info(message_info* info)
   {
-    local_info = *info;
+    local_info = info;
   }
 
   ////////////////////////////////////////////////////////////////////////////
-  message_info& get_info()
+  message_info* get_info()
   {
     return local_info;
   }
@@ -72,6 +74,9 @@ namespace ojph {
   void message_info::operator()(int info_code, const char* file_name,
     int line_num, const char* fmt, ...)
   {
+    if (info_stream == NULL || message_level > OJPH_MSG_INFO)
+      return;
+
     fprintf(info_stream, "ojph info 0x%08X at %s:%d: ",
       info_code, file_name, line_num);
     va_list args;
@@ -86,16 +91,16 @@ namespace ojph {
 
   ////////////////////////////////////////////////////////////////////////////
   message_warning warn;
-  message_warning& local_warn = warn;
+  message_warning* local_warn = &warn;
 
   ////////////////////////////////////////////////////////////////////////////
   void configure_warning(message_warning* warn)
   {
-    local_warn = *warn;
+    local_warn = warn;
   }
 
   ////////////////////////////////////////////////////////////////////////////
-  message_warning& get_warning()
+  message_warning* get_warning()
   {
     return local_warn;
   }
@@ -110,6 +115,9 @@ namespace ojph {
   void message_warning::operator()(int warn_code, const char* file_name,
     int line_num, const char *fmt, ...)
   {
+    if (warning_stream == NULL || message_level > OJPH_MSG_WARN)
+      return;
+
     fprintf(warning_stream, "ojph warning 0x%08X at %s:%d: ",
       warn_code, file_name, line_num);
     va_list args;
@@ -124,16 +132,16 @@ namespace ojph {
 
   ////////////////////////////////////////////////////////////////////////////
   message_error error;
-  message_error& local_error = error;
+  message_error* local_error = &error;
 
   ////////////////////////////////////////////////////////////////////////////
   void configure_error(message_error* error)
   {
-    local_error = *error;
+    local_error = error;
   }
 
   ////////////////////////////////////////////////////////////////////////////
-  message_error& get_error()
+  message_error* get_error()
   {
     return local_error;
   }
@@ -148,15 +156,26 @@ namespace ojph {
   void message_error::operator()(int error_code, const char* file_name,
     int line_num, const char *fmt, ...)
   {
-    fprintf(error_stream, "ojph error 0x%08X at %s:%d: ",
-      error_code, file_name, line_num);
-    va_list args;
-    va_start(args, fmt);
-    vfprintf(error_stream, fmt, args);
-    fprintf(error_stream, "\n");
-    va_end(args);
+    if (error_stream != NULL && message_level <= OJPH_MSG_ERROR)
+    {
+      fprintf(error_stream, "ojph error 0x%08X at %s:%d: ",
+        error_code, file_name, line_num);
+      va_list args;
+      va_start(args, fmt);
+      vfprintf(error_stream, fmt, args);
+      fprintf(error_stream, "\n");
+      va_end(args);
+    }
 
     throw std::runtime_error("ojph error");
   }
 
+  ////////////////////////////////////////////////////////////////////////////
+  void set_message_level(OJPH_MSG_LEVEL level)
+  {
+    assert(level >= OJPH_MSG_ALL_MSG &&
+           level <= OJPH_MSG_NO_MSG);
+    message_level = level;
+  }
+
 }
diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp
index 6b5c0c8c..ef147ade 100644
--- a/src/core/transform/ojph_colour.cpp
+++ b/src/core/transform/ojph_colour.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -36,129 +36,166 @@
 //***************************************************************************/
 
 #include <cmath>
+#include <climits>
+#include <mutex>
 
 #include "ojph_defs.h"
 #include "ojph_arch.h"
+#include "ojph_mem.h"
 #include "ojph_colour.h"
 #include "ojph_colour_local.h"
 
 namespace ojph {
+
+  // defined elsewhere
+  class line_buf;
+
   namespace local {
 
     //////////////////////////////////////////////////////////////////////////
-    void (*cnvrt_si32_to_si32_shftd)
-      (const si32 *sp, si32 *dp, int shift, ui32 width) = NULL;
-
-    ////////////////////////////////////////////////////////////////////////////
-    void (*cnvrt_si32_to_float_shftd)
-      (const si32 *sp, float *dp, float mul, ui32 width) = NULL;
-
-    ////////////////////////////////////////////////////////////////////////////
-    void (*cnvrt_si32_to_float)
-      (const si32 *sp, float *dp, float mul, ui32 width) = NULL;
-      
-    ////////////////////////////////////////////////////////////////////////////
-    void (*cnvrt_float_to_si32_shftd)
-      (const float *sp, si32 *dp, float mul, ui32 width) = NULL;
-
-    ////////////////////////////////////////////////////////////////////////////
-    void (*cnvrt_float_to_si32)
-      (const float *sp, si32 *dp, float mul, ui32 width) = NULL;
-
-    ////////////////////////////////////////////////////////////////////////////
+    void (*rev_convert)
+      (const line_buf *src_line, const ui32 src_line_offset,
+       line_buf *dst_line, const ui32 dst_line_offset,
+       si64 shift, ui32 width) = NULL;
+
+    //////////////////////////////////////////////////////////////////////////
+    void (*rev_convert_nlt_type3)
+      (const line_buf *src_line, const ui32 src_line_offset,
+       line_buf *dst_line, const ui32 dst_line_offset,
+       si64 shift, ui32 width) = NULL;
+
+
+    //////////////////////////////////////////////////////////////////////////
+    void (*irv_convert_to_integer) (
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width) = NULL;
+
+    //////////////////////////////////////////////////////////////////////////
+    void (*irv_convert_to_float) (
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) = NULL;
+
+    //////////////////////////////////////////////////////////////////////////
+    void (*irv_convert_to_integer_nlt_type3) (
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width) = NULL;
+
+    //////////////////////////////////////////////////////////////////////////
+    void (*irv_convert_to_float_nlt_type3) (
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) = NULL;
+
+    //////////////////////////////////////////////////////////////////////////
     void (*rct_forward)
-      (const si32 *r, const si32 *g, const si32 *b,
-       si32 *y, si32 *cb, si32 *cr, ui32 repeat) = NULL;
+      (const line_buf* r, const line_buf* g, const line_buf* b,
+       line_buf* y, line_buf* cb, line_buf* cr, ui32 repeat) = NULL;
 
-    ////////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////////
     void (*rct_backward)
-      (const si32 *y, const si32 *cb, const si32 *cr,
-       si32 *r, si32 *g, si32 *b, ui32 repeat) = NULL;
+      (const line_buf* y, const line_buf* cb, const line_buf* cr, // inputs
+       line_buf* r, line_buf* g, line_buf* b, ui32 repeat) = NULL; // outputs
 
-    ////////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////////
     void (*ict_forward)
       (const float *r, const float *g, const float *b,
        float *y, float *cb, float *cr, ui32 repeat) = NULL;
 
-    ////////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////////
     void (*ict_backward)
       (const float *y, const float *cb, const float *cr,
        float *r, float *g, float *b, ui32 repeat) = NULL;
 
-    ////////////////////////////////////////////////////////////////////////////
-    static bool colour_transform_functions_initialized = false;
-
     //////////////////////////////////////////////////////////////////////////
     void init_colour_transform_functions()
     {
-      if (colour_transform_functions_initialized)
-        return;
-
+      static std::once_flag colour_transform_functions_init_flag;
+      std::call_once(colour_transform_functions_init_flag, []() {
 #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
 
-      cnvrt_si32_to_si32_shftd = gen_cnvrt_si32_to_si32_shftd;
-      cnvrt_si32_to_float_shftd = gen_cnvrt_si32_to_float_shftd;
-      cnvrt_si32_to_float = gen_cnvrt_si32_to_float;
-      cnvrt_float_to_si32_shftd = gen_cnvrt_float_to_si32_shftd;
-      cnvrt_float_to_si32 = gen_cnvrt_float_to_si32;
-      rct_forward = gen_rct_forward;
-      rct_backward = gen_rct_backward;
-      ict_forward = gen_ict_forward;
-      ict_backward = gen_ict_backward;
+        rev_convert = gen_rev_convert;
+        rev_convert_nlt_type3 = gen_rev_convert_nlt_type3;
+        irv_convert_to_integer = gen_irv_convert_to_integer;
+        irv_convert_to_float = gen_irv_convert_to_float;
+        irv_convert_to_integer_nlt_type3 = gen_irv_convert_to_integer_nlt_type3;
+        irv_convert_to_float_nlt_type3 = gen_irv_convert_to_float_nlt_type3;
+        rct_forward = gen_rct_forward;
+        rct_backward = gen_rct_backward;
+        ict_forward = gen_ict_forward;
+        ict_backward = gen_ict_backward;
+
+  #ifndef OJPH_DISABLE_SIMD
+
+    #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
+
+      #ifndef OJPH_DISABLE_SSE
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE)
+        {
+          ict_forward = sse_ict_forward;
+          ict_backward = sse_ict_backward;
+        }
+      #endif // !OJPH_DISABLE_SSE
+
+      #ifndef OJPH_DISABLE_SSE2
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2)
+        {
+          rev_convert = sse2_rev_convert;
+          rev_convert_nlt_type3 = sse2_rev_convert_nlt_type3;
+          irv_convert_to_integer = sse2_irv_convert_to_integer;
+          irv_convert_to_float = sse2_irv_convert_to_float;
+          irv_convert_to_integer_nlt_type3 =
+            sse2_irv_convert_to_integer_nlt_type3;
+          irv_convert_to_float_nlt_type3 =
+            sse2_irv_convert_to_float_nlt_type3;
+          rct_forward = sse2_rct_forward;
+          rct_backward = sse2_rct_backward;
+        }
+      #endif // !OJPH_DISABLE_SSE2
+
+      #ifndef OJPH_DISABLE_AVX
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX)
+        {
+          ict_forward = avx_ict_forward;
+          ict_backward = avx_ict_backward;
+        }
+      #endif // !OJPH_DISABLE_AVX
+
+      #ifndef OJPH_DISABLE_AVX2
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2)
+        {
+          rev_convert = avx2_rev_convert;
+          rev_convert_nlt_type3 = avx2_rev_convert_nlt_type3;
+          irv_convert_to_integer = avx2_irv_convert_to_integer;
+          irv_convert_to_float = avx2_irv_convert_to_float;
+          irv_convert_to_integer_nlt_type3 =
+            avx2_irv_convert_to_integer_nlt_type3;
+          irv_convert_to_float_nlt_type3 =
+            avx2_irv_convert_to_float_nlt_type3;
+          rct_forward = avx2_rct_forward;
+          rct_backward = avx2_rct_backward;
+        }
+      #endif // !OJPH_DISABLE_AVX2
+
+    #elif defined(OJPH_ARCH_ARM)
+
+    #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
+
+  #endif // !OJPH_DISABLE_SIMD
 
-#ifndef OJPH_DISABLE_INTEL_SIMD
-      int level = get_cpu_ext_level();
-
-      if (level >= X86_CPU_EXT_LEVEL_SSE)
-      {
-        cnvrt_si32_to_float_shftd = sse_cnvrt_si32_to_float_shftd;
-        cnvrt_si32_to_float = sse_cnvrt_si32_to_float;
-        cnvrt_float_to_si32_shftd = sse_cnvrt_float_to_si32_shftd;
-        cnvrt_float_to_si32 = sse_cnvrt_float_to_si32;
-        ict_forward = sse_ict_forward;
-        ict_backward = sse_ict_backward;
-      }
-
-      if (level >= X86_CPU_EXT_LEVEL_SSE2)
-      {
-        cnvrt_float_to_si32_shftd = sse2_cnvrt_float_to_si32_shftd;
-        cnvrt_float_to_si32 = sse2_cnvrt_float_to_si32;
-        cnvrt_si32_to_si32_shftd = sse2_cnvrt_si32_to_si32_shftd;
-        rct_forward = sse2_rct_forward;
-        rct_backward = sse2_rct_backward;
-      }
+#else // OJPH_ENABLE_WASM_SIMD
 
-      if (level >= X86_CPU_EXT_LEVEL_AVX)
-      {
-        cnvrt_si32_to_float_shftd = avx_cnvrt_si32_to_float_shftd;
-        cnvrt_si32_to_float = avx_cnvrt_si32_to_float;
-        cnvrt_float_to_si32_shftd = avx_cnvrt_float_to_si32_shftd;
-        cnvrt_float_to_si32 = avx_cnvrt_float_to_si32;
-        ict_forward = avx_ict_forward;
-        ict_backward = avx_ict_backward;
-      }
+        rev_convert = wasm_rev_convert;
+        rev_convert_nlt_type3 = wasm_rev_convert_nlt_type3;
+        irv_convert_to_integer = wasm_irv_convert_to_integer;
+        irv_convert_to_float = wasm_irv_convert_to_float;
+        irv_convert_to_integer_nlt_type3 = wasm_irv_convert_to_integer_nlt_type3;
+        irv_convert_to_float_nlt_type3 = wasm_irv_convert_to_float_nlt_type3;
+        rct_forward = wasm_rct_forward;
+        rct_backward = wasm_rct_backward;
+        ict_forward = wasm_ict_forward;
+        ict_backward = wasm_ict_backward;
 
-      if (level >= X86_CPU_EXT_LEVEL_AVX2)
-      {
-        cnvrt_si32_to_si32_shftd = avx2_cnvrt_si32_to_si32_shftd;
-        rct_forward = avx2_rct_forward;
-        rct_backward = avx2_rct_backward;
-      }
-#endif // !OJPH_DISABLE_INTEL_SIMD
-
-#else // OJPH_ENABLE_WASM_SIMD
-      cnvrt_si32_to_si32_shftd = wasm_cnvrt_si32_to_si32_shftd;
-      cnvrt_si32_to_float_shftd = wasm_cnvrt_si32_to_float_shftd;
-      cnvrt_si32_to_float = wasm_cnvrt_si32_to_float;
-      cnvrt_float_to_si32_shftd = wasm_cnvrt_float_to_si32_shftd;
-      cnvrt_float_to_si32 = wasm_cnvrt_float_to_si32;
-      rct_forward = wasm_rct_forward;
-      rct_backward = wasm_rct_backward;
-      ict_forward = wasm_ict_forward;
-      ict_backward = wasm_ict_backward;
 #endif // !OJPH_ENABLE_WASM_SIMD
-
-      colour_transform_functions_initialized = true;
+      });
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -179,66 +216,309 @@ namespace ojph {
 #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift,
-                                      ui32 width)
+    void gen_rev_convert(  // dst[i] = src[i] + shift for width samples
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
+      si64 shift, ui32 width)
     {
-      for (ui32 i = width; i > 0; --i)
-        *dp++ = *sp++ + shift;
+      if (src_line->flags & line_buf::LFT_32BIT)  // 32-bit source
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)  // 32 -> 32 bits
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          si32 s = (si32)shift;  // narrowed once; the add stays 32-bit
+          for (ui32 i = width; i > 0; --i)
+            *dp++ = *sp++ + s;
+        }
+        else  // dst not flagged 32-bit: written through its i64 pointer
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          for (ui32 i = width; i > 0; --i)
+            *dp++ = *sp++ + shift;  // sample promoted to si64 by the add
+        }
+      }
+      else  // source not 32-bit; only 64-bit src -> 32-bit dst is handled
+      {
+        assert(src_line->flags & line_buf::LFT_64BIT);
+        assert(dst_line->flags & line_buf::LFT_32BIT);
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        for (ui32 i = width; i > 0; --i)
+          *dp++ = (si32)(*sp++ + shift);  // 64-bit sum, then narrowed
+      }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
-                                       ui32 width)
+    void gen_rev_convert_nlt_type3(  // v >= 0 kept; v < 0 -> -v - shift
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
+      si64 shift, ui32 width)
     {
-      for (ui32 i = width; i > 0; --i)
-        *dp++ = (float)*sp++ * mul - 0.5f;
+      if (src_line->flags & line_buf::LFT_32BIT)  // 32-bit source
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)  // 32 -> 32 bits
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          si32 s = (si32)shift;  // narrowed once; arithmetic stays 32-bit
+          for (ui32 i = width; i > 0; --i) {
+            const si32 v = *sp++;
+            *dp++ = v >= 0 ? v : (- v - s);
+          }
+        }
+        else  // dst not flagged 32-bit: written through its i64 pointer
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          for (ui32 i = width; i > 0; --i) {
+            const si64 v = *sp++;  // widened before negation/subtraction
+            *dp++ = v >= 0 ? v : (- v - shift);
+          }
+        }
+      }
+      else  // source not 32-bit; only 64-bit src -> 32-bit dst is handled
+      {
+        assert(src_line->flags & line_buf::LFT_64BIT);
+        assert(dst_line->flags & line_buf::LFT_32BIT);
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        for (ui32 i = width; i > 0; --i) {
+          const si64 v = *sp++;
+          *dp++ = (si32)(v >= 0 ? v : (- v - shift));  // 64-bit, then narrowed
+        }
+      }
     }
 
+
     //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
-                                 ui32 width)
+    template<bool NLT_TYPE3>  // true: also remap negatives (signed path only)
+    static inline
+    void local_gen_irv_convert_to_integer(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
     {
-      for (ui32 i = width; i > 0; --i)
-        *dp++ = (float)*sp++ * mul;
+      assert((src_line->flags & line_buf::LFT_32BIT) &&  // 32-bit non-integer
+             (src_line->flags & line_buf::LFT_INTEGER) == 0 &&  // source and
+             (dst_line->flags & line_buf::LFT_32BIT) &&  // 32-bit integer
+             (dst_line->flags & line_buf::LFT_INTEGER));  // destination
+
+      assert(bit_depth <= 32);
+      const float* sp = src_line->f32;  // read from the line start (no offset)
+      si32* dp = dst_line->i32 + dst_line_offset;
+      // Scaling by 2^bit_depth and rounding can exceed the range of a
+      // bit_depth-bit (at most 32-bit) signed integer, so the result is
+      // clamped explicitly.
+      // We check whether the floating-point sample lies outside the
+      // half-open interval [-0.5f, 0.5f); if so, the integer is replaced
+      // by the minimum/maximum value representable with bit_depth bits.
+      si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);  // -2^(bit_depth-1)
+      float mul = (float)(1ull << bit_depth);  // 2^bit_depth
+      float fl_up_lim = -(float)neg_limit; // val < upper
+      float fl_low_lim = (float)neg_limit; // val >= lower
+      si32 s32_up_lim = INT_MAX >> (32 - bit_depth);  // 2^(bit_depth-1) - 1
+      si32 s32_low_lim = INT_MIN >> (32 - bit_depth);  // -2^(bit_depth-1)
+
+      if (is_signed)
+      {
+        const si32 bias = (si32)((1ULL << (bit_depth - 1)) + 1);
+        for (int i = (int)width; i > 0; --i) {
+          float t = *sp++ * mul;
+          si32 v = ojph_round(t);
+          v = t >= fl_low_lim ? v : s32_low_lim;  // clamp from below
+          v = t <  fl_up_lim  ? v : s32_up_lim;  // clamp from above
+          if (NLT_TYPE3)
+            v = (v >= 0) ? v : (- v - bias);  // negatives become -v - bias
+          *dp++ = v;
+        }
+      }
+      else
+      {
+        const si32 half = (si32)(1ULL << (bit_depth - 1));
+        for (int i = (int)width; i > 0; --i) {
+          float t = *sp++ * mul;
+          si32 v = ojph_round(t);
+          v = t >= fl_low_lim ? v : s32_low_lim;  // clamp from below
+          v = t <  fl_up_lim  ? v : s32_up_lim;  // clamp from above
+          *dp++ = v + half;  // unsigned output: add 2^(bit_depth-1)
+        }
+      }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                       ui32 width)
+    void gen_irv_convert_to_integer(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
     {
-      for (ui32 i = width; i > 0; --i)
-        *dp++ = ojph_round((*sp++ + 0.5f) * mul);
+      local_gen_irv_convert_to_integer<false>(src_line, dst_line,
+        dst_line_offset, bit_depth, is_signed, width);  // NLT_TYPE3 = false
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                 ui32 width)
+    void gen_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
     {
-      for (ui32 i = width; i > 0; --i)
-        *dp++ = ojph_round(*sp++ * mul);
+      local_gen_irv_convert_to_integer<true>(src_line, dst_line,
+        dst_line_offset, bit_depth, is_signed, width);  // NLT_TYPE3 = true
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_rct_forward(const si32 *r, const si32 *g, const si32 *b,
-                         si32 *y, si32 *cb, si32 *cr, ui32 repeat)
+    template<bool NLT_TYPE3>  // true: also remap negatives (signed path only)
+    static inline
+    void local_gen_irv_convert_to_float(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
     {
-      for (ui32 i = repeat; i > 0; --i)
+      assert((src_line->flags & line_buf::LFT_32BIT) &&  // 32-bit integer
+             (src_line->flags & line_buf::LFT_INTEGER) &&  // source and
+             (dst_line->flags & line_buf::LFT_32BIT) &&  // 32-bit non-integer
+             (dst_line->flags & line_buf::LFT_INTEGER) == 0);  // destination
+
+      assert(bit_depth <= 32);
+      float mul = (float)(1.0 / (double)(1ULL << bit_depth)); // 2^-bit_depth
+
+      const si32* sp = src_line->i32 + src_line_offset;
+      float* dp = dst_line->f32;  // written from the line start (no offset)
+      if (is_signed)
+      {
+        const si32 bias = (si32)((1ULL << (bit_depth - 1)) + 1);
+        for (int i = (int)width; i > 0; --i) {
+          si32 v = *sp++;
+          if (NLT_TYPE3)
+            v = (v >= 0) ? v : (- v - bias);  // negatives become -v - bias
+          *dp++ = (float)v * mul;
+        }
+      }
+      else
       {
-        *y++ = (*r + (*g << 1) + *b) >> 2;
-        *cb++ = (*b++ - *g);
-        *cr++ = (*r++ - *g++);
+        const si32 half = (si32)(1ULL << (bit_depth - 1));
+        for (int i = (int)width; i > 0; --i) {
+          si32 v = *sp++;
+          v -= half;  // unsigned input: subtract 2^(bit_depth-1) first
+          *dp++ = (float)v * mul;
+        }
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_rct_backward(const si32 *y, const si32 *cb, const si32 *cr,
-                          si32 *r, si32 *g, si32 *b, ui32 repeat)
+    void gen_irv_convert_to_float(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
     {
-      for (ui32 i = repeat; i > 0; --i)
+      local_gen_irv_convert_to_float<false>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);  // NLT_TYPE3 = false
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      local_gen_irv_convert_to_float<true>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);  // NLT_TYPE3 = true
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_rct_forward(  // y = (r + 2g + b) >> 2, cb = b - g, cr = r - g
+      const line_buf *r, const line_buf *g, const line_buf *b,
+      line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat)
+    {
+      assert((y->flags  & line_buf::LFT_INTEGER) &&  // all six lines must
+             (cb->flags & line_buf::LFT_INTEGER) &&  // hold integer samples
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) &&
+             (b->flags  & line_buf::LFT_INTEGER));
+
+      if  (y->flags & line_buf::LFT_32BIT)  // width of y selects the path
+      {
+        assert((y->flags  & line_buf::LFT_32BIT) &&  // 32-bit in, 32-bit out
+               (cb->flags & line_buf::LFT_32BIT) &&
+               (cr->flags & line_buf::LFT_32BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32;
+        si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32;
+        for (ui32 i = repeat; i > 0; --i)
+        {
+          si32 rr = *rp++, gg = *gp++, bb = *bp++;
+          *yp++ = (rr + (gg << 1) + bb) >> 2;
+          *cbp++ = (bb - gg);
+          *crp++ = (rr - gg);
+        }
+      }
+      else
+      {
+        assert((y->flags  & line_buf::LFT_64BIT) &&  // 32-bit in, 64-bit out
+               (cb->flags & line_buf::LFT_64BIT) &&
+               (cr->flags & line_buf::LFT_64BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
+        for (ui32 i = repeat; i > 0; --i)
+        {
+          si64 rr = *rp++, gg = *gp++, bb = *bp++;  // widen before the math
+          *yp++ = (rr + (gg << 1) + bb) >> 2;
+          *cbp++ = (bb - gg);
+          *crp++ = (rr - gg);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_rct_backward(  // g = y - ((cb + cr) >> 2), r = cr + g, b = cb + g
+      const line_buf *y, const line_buf *cb, const line_buf *cr,
+      line_buf *r, line_buf *g, line_buf *b, ui32 repeat)
+    {
+      assert((y->flags  & line_buf::LFT_INTEGER) &&  // all six lines must
+             (cb->flags & line_buf::LFT_INTEGER) &&  // hold integer samples
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) &&
+             (b->flags  & line_buf::LFT_INTEGER));
+
+      if (y->flags & line_buf::LFT_32BIT)  // width of y selects the path
+      {
+        assert((y->flags  & line_buf::LFT_32BIT) &&  // 32-bit in, 32-bit out
+               (cb->flags & line_buf::LFT_32BIT) &&
+               (cr->flags & line_buf::LFT_32BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
+        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        for (ui32 i = repeat; i > 0; --i)
+        {
+          si32 yy = *yp++, cbb = *cbp++, crr = *crp++;
+          si32 gg = yy - ((cbb + crr) >> 2);  // green first; r and b use it
+          *rp++ = crr + gg;
+          *gp++ = gg;
+          *bp++ = cbb + gg;
+        }
+      }
+      else
       {
-        *g = *y++ - ((*cb + *cr)>>2);
-        *b++ = *cb++ + *g;
-        *r++ = *cr++ + *g++;
+        assert((y->flags  & line_buf::LFT_64BIT) &&  // 64-bit in, 32-bit out
+               (cb->flags & line_buf::LFT_64BIT) &&
+               (cr->flags & line_buf::LFT_64BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
+        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        for (ui32 i = repeat; i > 0; --i)
+        {
+          si64 yy = *yp++, cbb = *cbp++, crr = *crp++;
+          si64 gg = yy - ((cbb + crr) >> 2);  // computed in 64 bits
+          *rp++ = (si32)(crr + gg);  // results narrowed to si32 on store
+          *gp++ = (si32)gg;
+          *bp++ = (si32)(cbb + gg);
+        }
       }
     }
 
diff --git a/src/core/transform/ojph_colour.h b/src/core/transform/ojph_colour.h
index 212848b5..b0b5da61 100644
--- a/src/core/transform/ojph_colour.h
+++ b/src/core/transform/ojph_colour.h
@@ -40,40 +40,57 @@
 #define OJPH_COLOR_H
 
 namespace ojph {
+
+  // defined elsewhere
+  class line_buf;
+
   namespace local {
 
   ////////////////////////////////////////////////////////////////////////////
   void init_colour_transform_functions();
 
   ////////////////////////////////////////////////////////////////////////////
-  extern void (*cnvrt_si32_to_si32_shftd)
-    (const si32 *sp, si32 *dp, int shift, ui32 width);
+  extern void (*rev_convert)
+    (const line_buf *src_line, const ui32 src_line_offset,
+     line_buf *dst_line, const ui32 dst_line_offset,
+     si64 shift, ui32 width);  // set by init_colour_transform_functions()
+
+  ////////////////////////////////////////////////////////////////////////////
+  extern void (*rev_convert_nlt_type3)
+    (const line_buf *src_line, const ui32 src_line_offset,
+     line_buf *dst_line, const ui32 dst_line_offset,
+     si64 shift, ui32 width);  // set by init_colour_transform_functions()
+
 
   ////////////////////////////////////////////////////////////////////////////
-  extern void (*cnvrt_si32_to_float_shftd)
-    (const si32 *sp, float *dp, float mul, ui32 width);
+  extern void (*irv_convert_to_integer) (
+    const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+    ui32 bit_depth, bool is_signed, ui32 width);
 
   ////////////////////////////////////////////////////////////////////////////
-  extern void (*cnvrt_si32_to_float)
-    (const si32 *sp, float *dp, float mul, ui32 width);
+  extern void (*irv_convert_to_float) (
+    const line_buf *src_line, ui32 src_line_offset,
+    line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
 
   ////////////////////////////////////////////////////////////////////////////
-  extern void (*cnvrt_float_to_si32_shftd)
-    (const float *sp, si32 *dp, float mul, ui32 width);
+  extern void (*irv_convert_to_integer_nlt_type3) (
+    const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+    ui32 bit_depth, bool is_signed, ui32 width);
 
   ////////////////////////////////////////////////////////////////////////////
-  extern void (*cnvrt_float_to_si32)
-    (const float *sp, si32 *dp, float mul, ui32 width);
+  extern void (*irv_convert_to_float_nlt_type3) (
+    const line_buf *src_line, ui32 src_line_offset,
+    line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
 
   ////////////////////////////////////////////////////////////////////////////
   extern void (*rct_forward)
-    (const si32 *r, const si32 *g, const si32 *b,
-     si32 *y, si32 *cb, si32 *cr, ui32 repeat);
+    (const line_buf *r, const line_buf *g, const line_buf *b,
+     line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat);
 
   ////////////////////////////////////////////////////////////////////////////
   extern void (*rct_backward)
-    (const si32 *y, const si32 *cb, const si32 *cr,
-     si32 *r, si32 *g, si32 *b, ui32 repeat);
+    (const line_buf *y, const line_buf *cb, const line_buf *cr,
+     line_buf *r, line_buf *g, line_buf *b, ui32 repeat);
 
   ////////////////////////////////////////////////////////////////////////////
   extern void (*ict_forward)
diff --git a/src/core/transform/ojph_colour_avx.cpp b/src/core/transform/ojph_colour_avx.cpp
index 27e78e5c..e03217a6 100644
--- a/src/core/transform/ojph_colour_avx.cpp
+++ b/src/core/transform/ojph_colour_avx.cpp
@@ -35,10 +35,12 @@
 // Date: 11 October 2019
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
+
 #include <cmath>
 
 #include "ojph_defs.h"
-#include "ojph_arch.h"
 #include "ojph_colour.h"
 #include "ojph_colour_local.h"
 
@@ -47,66 +49,6 @@
 namespace ojph {
   namespace local {
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
-                                       ui32 width)
-    {
-      __m256 shift = _mm256_set1_ps(0.5f);
-      __m256 m = _mm256_set1_ps(mul);
-      for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
-      {
-        __m256i t = _mm256_loadu_si256((__m256i*)sp);
-        __m256 s = _mm256_cvtepi32_ps(t);
-        s = _mm256_mul_ps(s, m);
-        s = _mm256_sub_ps(s, shift);
-        _mm256_store_ps(dp, s);
-      }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
-                                 ui32 width)
-    {
-      __m256 m = _mm256_set1_ps(mul);
-      for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
-      {
-        __m256i t = _mm256_loadu_si256((__m256i*)sp);
-        __m256 s = _mm256_cvtepi32_ps(t);
-        s = _mm256_mul_ps(s, m);
-        _mm256_store_ps(dp, s);
-      }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                       ui32 width)
-    {
-      __m256 shift = _mm256_set1_ps(0.5f);
-      __m256 m = _mm256_set1_ps(mul);
-      for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
-      {
-        __m256 t = _mm256_load_ps(sp);
-        __m256 s = _mm256_add_ps(t, shift);
-        s = _mm256_mul_ps(s, m);
-        s = _mm256_round_ps(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-        _mm256_storeu_si256((__m256i*)dp, _mm256_cvtps_epi32(s));
-      }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                 ui32 width)
-    {
-      __m256 m = _mm256_set1_ps(mul);
-      for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
-      {
-        __m256 t = _mm256_load_ps(sp);
-        __m256 s = _mm256_mul_ps(t, m);
-        s = _mm256_round_ps(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-        _mm256_storeu_si256((__m256i*)dp, _mm256_cvtps_epi32(s));
-      }
-    }
-
     //////////////////////////////////////////////////////////////////////////
     void avx_ict_forward(const float *r, const float *g, const float *b,
                          float *y, float *cb, float *cr, ui32 repeat)
@@ -157,3 +99,5 @@ namespace ojph {
 
   }
 }
+
+#endif
diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp
index 60e20d6f..3ec905df 100644
--- a/src/core/transform/ojph_colour_avx2.cpp
+++ b/src/core/transform/ojph_colour_avx2.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -35,10 +35,14 @@
 // Date: 11 October 2019
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
+
+#include <climits>
 #include <cmath>
 
 #include "ojph_defs.h"
-#include "ojph_arch.h"
+#include "ojph_mem.h"
 #include "ojph_colour.h"
 
 #include <immintrin.h>
@@ -46,63 +50,574 @@
 namespace ojph {
   namespace local {
 
+    /////////////////////////////////////////////////////////////////////////
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline
+    __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
+    {
+      // Emulates a 64-bit arithmetic right shift; note that m must be
+      // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
+      __m256i x = _mm256_srli_epi64(a, amt);   // logical shift (zero fill)
+      x = _mm256_xor_si256(x, m);              // flip the shifted sign bit
+      __m256i result = _mm256_sub_epi64(x, m); // borrow extends the sign
+      return result;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rev_convert(const line_buf *src_line,
+                          const ui32 src_line_offset,
+                          line_buf *dst_line,
+                          const ui32 dst_line_offset,
+                          si64 shift, ui32 width)
+    { // dst = src + shift, for 32->32, 32->64, and 64->32 bit samples
+      if (src_line->flags & line_buf::LFT_32BIT)
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          __m256i sh = _mm256_set1_epi32((si32)shift);
+          // 8 samples per iteration; width is rounded up to a multiple of 8
+          for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
+          {
+            __m256i s = _mm256_loadu_si256((__m256i*)sp);
+            s = _mm256_add_epi32(s, sh);
+            _mm256_storeu_si256((__m256i*)dp, s);
+          }
+        }
+        else
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          __m256i sh = _mm256_set1_epi64x(shift);
+          for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
+          {
+            __m256i s, t;
+            s = _mm256_loadu_si256((__m256i*)sp);
+            // sign-extend the lower four samples to 64 bits
+            t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 0));
+            t = _mm256_add_epi64(t, sh);
+            _mm256_storeu_si256((__m256i*)dp, t);
+            // sign-extend the upper four samples to 64 bits
+            t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 1));
+            t = _mm256_add_epi64(t, sh);
+            _mm256_storeu_si256((__m256i*)dp + 1, t);
+          }
+        }
+      }
+      else
+      {
+        assert(src_line->flags & line_buf::LFT_64BIT); // '|' was always true
+        assert(dst_line->flags & line_buf::LFT_32BIT); // '|' was always true
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX,
+                                             0, (si64)ULLONG_MAX);
+        __m256i sh = _mm256_set1_epi64x(shift);
+        for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
+        {
+          __m256i s, t;
+          s = _mm256_loadu_si256((__m256i*)sp);
+          s = _mm256_add_epi64(s, sh);
+          // low 32 bits of samples 0-3 go to the low half of each lane
+          t = _mm256_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0));
+          t = _mm256_and_si256(low_bits, t);
+
+          s = _mm256_loadu_si256((__m256i*)sp + 1);
+          s = _mm256_add_epi64(s, sh);
+          // low 32 bits of samples 4-7 go to the high half of each lane
+          s = _mm256_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0));
+          s = _mm256_andnot_si256(low_bits, s);
+          // merge, then reorder the 64-bit quarters into sample order
+          t = _mm256_or_si256(s, t);
+          t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0));
+          _mm256_storeu_si256((__m256i*)dp, t);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rev_convert_nlt_type3(const line_buf *src_line,
+                                    const ui32 src_line_offset,
+                                    line_buf *dst_line,
+                                    const ui32 dst_line_offset,
+                                    si64 shift, ui32 width)
+    { // dst = src when src >= 0, otherwise dst = -shift - src
+      if (src_line->flags & line_buf::LFT_32BIT)
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          __m256i sh = _mm256_set1_epi32((si32)(-shift));
+          __m256i zero = _mm256_setzero_si256();
+          for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
+          {
+            __m256i s = _mm256_loadu_si256((__m256i*)sp);
+            __m256i c = _mm256_cmpgt_epi32(zero, s);  // 0xFFFFFFFF for -ve val
+            __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value
+            v_m_sh = _mm256_and_si256(c, v_m_sh);     // keep only -shift-val
+            s = _mm256_andnot_si256(c, s);            // keep only +ve or 0
+            s = _mm256_or_si256(s, v_m_sh);           // combine
+            _mm256_storeu_si256((__m256i*)dp, s);
+          }
+        }
+        else
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          __m256i sh = _mm256_set1_epi64x(-shift);
+          __m256i zero = _mm256_setzero_si256();
+          for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
+          {
+            __m256i s, t, u0, u1, c, v_m_sh;
+            s = _mm256_loadu_si256((__m256i*)sp);
+            // u0 holds samples 0, 1, 4, 5 sign-extended to 64 bits
+            t = _mm256_cmpgt_epi32(zero, s);      // find -ve 32bit -1
+            u0 = _mm256_unpacklo_epi32(s, t);     // correct 64bit data
+            c = _mm256_unpacklo_epi32(t, t);      // 64bit -1 for -ve value
+
+            v_m_sh = _mm256_sub_epi64(sh, u0);    // - shift - value
+            v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value
+            u0 = _mm256_andnot_si256(c, u0);      // keep only +ve or 0
+            u0 = _mm256_or_si256(u0, v_m_sh);     // combine
+            // u1 holds samples 2, 3, 6, 7 sign-extended to 64 bits
+            u1 = _mm256_unpackhi_epi32(s, t);     // correct 64bit data
+            c = _mm256_unpackhi_epi32(t, t);      // 64bit -1 for -ve value
+
+            v_m_sh = _mm256_sub_epi64(sh, u1);    // - shift - value
+            v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value
+            u1 = _mm256_andnot_si256(c, u1);      // keep only +ve or 0
+            u1 = _mm256_or_si256(u1, v_m_sh);     // combine
+            // low lanes of u0 and u1 give samples 0-3 in order
+            t = _mm256_permute2x128_si256(u0, u1, (2 << 4) | 0);
+            _mm256_storeu_si256((__m256i*)dp, t);
+            // high lanes of u0 and u1 give samples 4-7 in order
+            t = _mm256_permute2x128_si256(u0, u1, (3 << 4) | 1);
+            _mm256_storeu_si256((__m256i*)dp + 1, t);
+          }
+        }
+      }
+      else
+      {
+        assert(src_line->flags & line_buf::LFT_64BIT); // '|' was always true
+        assert(dst_line->flags & line_buf::LFT_32BIT); // '|' was always true
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        __m256i sh = _mm256_set1_epi64x(-shift);
+        __m256i zero = _mm256_setzero_si256();
+        __m256i half_mask = _mm256_set_epi64x(0, (si64)ULLONG_MAX,
+                                              0, (si64)ULLONG_MAX);
+        for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8)
+        {
+          // s for source, t for target, p for positive, n for negative,
+          // m for mask, and tm for temp
+          __m256i s, t, p, n, m, tm;
+          s = _mm256_loadu_si256((__m256i*)sp);
+
+          m = _mm256_cmpgt_epi64(zero, s);    // 64b -1 for -ve value
+          tm = _mm256_sub_epi64(sh, s);       // - shift - value
+          n = _mm256_and_si256(m, tm);        // -ve
+          p = _mm256_andnot_si256(m, s);      // +ve
+          tm = _mm256_or_si256(n, p);
+          tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0));
+          t = _mm256_and_si256(half_mask, tm);
+
+          s = _mm256_loadu_si256((__m256i*)sp + 1);
+          m = _mm256_cmpgt_epi64(zero, s);    // 64b -1 for -ve value
+          tm = _mm256_sub_epi64(sh, s);       // - shift - value
+          n = _mm256_and_si256(m, tm);        // -ve
+          p = _mm256_andnot_si256(m, s);      // +ve
+          tm = _mm256_or_si256(n, p);
+          tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0));
+          tm = _mm256_andnot_si256(half_mask, tm);
+          // merge, then reorder the 64-bit quarters into sample order
+          t = _mm256_or_si256(t, tm);
+          t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0));
+          _mm256_storeu_si256((__m256i*)dp, t);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    __m256i ojph_mm256_max_ge_epi32(__m256i a, __m256i b, __m256 x, __m256 y)
+    {
+      // Select a where x >= y, else b. _CMP_NLT_UQ is unordered and quiet,
+      // so a NaN lane selects a; an ordered _CMP_GE_OQ would select b.
+      __m256 ct = _mm256_cmp_ps(x, y, _CMP_NLT_UQ); // 0xFFFFFFFF for x >= y
+      __m256i c = _mm256_castps_si256(ct);   // does not generate any code
+      __m256i d = _mm256_and_si256(c, a);    // keep only a, where x >= y
+      __m256i e = _mm256_andnot_si256(c, b); // keep only b, where x <  y
+      return _mm256_or_si256(d, e);          // combine
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    __m256i ojph_mm256_min_lt_epi32(__m256i a, __m256i b, __m256 x, __m256 y)
+    {
+      // Select a where x < y, else b. _CMP_NGE_UQ is unordered and quiet,
+      // so a NaN lane selects a; an ordered _CMP_LT_OQ would select b.
+      __m256 ct = _mm256_cmp_ps(x, y, _CMP_NGE_UQ); // 0xFFFFFFFF for x < y
+      __m256i c = _mm256_castps_si256(ct);   // does not generate any code
+      __m256i d = _mm256_and_si256(c, a);    // keep only a, where x <  y
+      __m256i e = _mm256_andnot_si256(c, b); // keep only b, where x >= y
+      return _mm256_or_si256(d, e);          // combine
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    template<bool NLT_TYPE3>
+    static inline
+    void local_avx2_irv_convert_to_integer(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // scales floats by 2^bit_depth, rounds to si32, and clamps the result
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER));
+
+      assert(bit_depth <= 32);
+      const float* sp = src_line->f32;
+      si32* dp = dst_line->i32 + dst_line_offset;
+      // Scaling by 2^bit_depth and converting to integer can exceed the
+      // range of a bit_depth-bit signed integer (and, for bit_depth == 32,
+      // of si32 itself); therefore, the result is clamped.
+      // The scaled float is compared against +/- 2^(bit_depth - 1); i.e.,
+      // the input is tested against the half-closed interval [-0.5f, 0.5f).
+      // Lanes outside it are replaced by the largest/smallest integer.
+      si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth); // -2^(bit_depth-1)
+      __m256 mul = _mm256_set1_ps((float)(1ull << bit_depth)); // 2^bit_depth
+      __m256 fl_up_lim = _mm256_set1_ps(-(float)neg_limit);  // val < upper
+      __m256 fl_low_lim = _mm256_set1_ps((float)neg_limit);  // val >= lower
+      __m256i s32_up_lim = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth));
+      __m256i s32_low_lim = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth));
+
+      if (is_signed)
+      {
+        __m256i zero = _mm256_setzero_si256();
+        __m256i bias = 
+          _mm256_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
+        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
+          __m256 t = _mm256_loadu_ps(sp);
+          t = _mm256_mul_ps(t, mul);
+          __m256i u = _mm256_cvtps_epi32(t);
+          u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
+          u = ojph_mm256_min_lt_epi32(u,  s32_up_lim, t,  fl_up_lim);
+          if (NLT_TYPE3)
+          {
+            __m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve val
+            __m256i neg = _mm256_sub_epi32(bias, u); // -bias -value
+            neg = _mm256_and_si256(c, neg);          // keep only - bias - val
+            u = _mm256_andnot_si256(c, u);           // keep only +ve or 0
+            u = _mm256_or_si256(neg, u);             // combine
+          }
+          _mm256_storeu_si256((__m256i*)dp, u);
+        }
+      }
+      else
+      {
+        __m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1)));
+        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
+          __m256 t = _mm256_loadu_ps(sp);
+          t = _mm256_mul_ps(t, mul);
+          __m256i u = _mm256_cvtps_epi32(t);
+          u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
+          u = ojph_mm256_min_lt_epi32(u,  s32_up_lim, t,  fl_up_lim);
+          u = _mm256_add_epi32(u, half);     // add 2^(bit_depth - 1)
+          _mm256_storeu_si256((__m256i*)dp, u);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_integer(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // float -> si32 conversion without the NLT type 3 mapping
+      local_avx2_irv_convert_to_integer<false>(src_line, dst_line, 
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // float -> si32 conversion followed by the NLT type 3 mapping
+      local_avx2_irv_convert_to_integer<true>(src_line, dst_line, 
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
     //////////////////////////////////////////////////////////////////////////
-    void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift,
-                                       ui32 width)
+    template<bool NLT_TYPE3>
+    static inline    
+    void local_avx2_irv_convert_to_float(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
     {
-      __m256i sh = _mm256_set1_epi32(shift);
-      for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER) == 0);
+
+      assert(bit_depth <= 32);
+      __m256 mul = _mm256_set1_ps((float)(1.0 / (double)(1ULL << bit_depth)));
+      // mul is 2^-bit_depth; samples are scaled by it after conversion
+      const si32* sp = src_line->i32 + src_line_offset;
+      float* dp = dst_line->f32;
+      if (is_signed)
       {
-        __m256i s = _mm256_loadu_si256((__m256i*)sp);
-        s = _mm256_add_epi32(s, sh);
-        _mm256_storeu_si256((__m256i*)dp, s);
+        __m256i zero = _mm256_setzero_si256();
+        __m256i bias = 
+          _mm256_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
+        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
+          __m256i t = _mm256_loadu_si256((__m256i*)sp);
+          if (NLT_TYPE3)
+          {          
+            __m256i c = _mm256_cmpgt_epi32(zero, t); // 0xFFFFFFFF for -ve val
+            __m256i neg = _mm256_sub_epi32(bias, t); // - bias - value
+            neg = _mm256_and_si256(c, neg);          // keep only - bias - val
+            c = _mm256_andnot_si256(c, t);           // keep only +ve or 0
+            t = _mm256_or_si256(neg, c);             // combine
+          }
+          __m256 v = _mm256_cvtepi32_ps(t);
+          v = _mm256_mul_ps(v, mul);         // scale by 2^-bit_depth
+          _mm256_storeu_ps(dp, v);
+        }
       }
+      else
+      {
+        __m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1)));
+        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
+          __m256i t = _mm256_loadu_si256((__m256i*)sp);
+          t = _mm256_sub_epi32(t, half);     // subtract 2^(bit_depth - 1)
+          __m256 v = _mm256_cvtepi32_ps(t);
+          v = _mm256_mul_ps(v, mul);         // scale by 2^-bit_depth
+          _mm256_storeu_ps(dp, v);
+        }
+      }
+    }
+
+        //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_float(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // si32 -> float conversion without the NLT type 3 mapping
+      local_avx2_irv_convert_to_float<false>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // NLT type 3 mapping followed by si32 -> float conversion
+      local_avx2_irv_convert_to_float<true>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
     }
 
+
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b,
-                          si32 *y, si32 *cb, si32 *cr, ui32 repeat)
+    void avx2_rct_forward(const line_buf *r,
+                          const line_buf *g,
+                          const line_buf *b,
+                          line_buf *y, line_buf *cb, line_buf *cr,
+                          ui32 repeat)
     {
-      for (int i = (repeat + 7) >> 3; i > 0; --i)
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) &&
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) &&
+             (b->flags  & line_buf::LFT_INTEGER));
+
+      if  (y->flags & line_buf::LFT_32BIT) // 32-bit inputs and outputs
       {
-        __m256i mr = _mm256_load_si256((__m256i*)r);
-        __m256i mg = _mm256_load_si256((__m256i*)g);
-        __m256i mb = _mm256_load_si256((__m256i*)b);
-        __m256i t = _mm256_add_epi32(mr, mb);
-        t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1));
-        _mm256_store_si256((__m256i*)y, _mm256_srai_epi32(t, 2));
-        t = _mm256_sub_epi32(mb, mg);
-        _mm256_store_si256((__m256i*)cb, t);
-        t = _mm256_sub_epi32(mr, mg);
-        _mm256_store_si256((__m256i*)cr, t);
-
-        r += 8; g += 8; b += 8;
-        y += 8; cb += 8; cr += 8;
+        assert((y->flags  & line_buf::LFT_32BIT) &&
+               (cb->flags & line_buf::LFT_32BIT) &&
+               (cr->flags & line_buf::LFT_32BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32;
+        si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32;
+        for (int i = (repeat + 7) >> 3; i > 0; --i) // 8 samples per pass
+        {
+          __m256i mr = _mm256_load_si256((__m256i*)rp);
+          __m256i mg = _mm256_load_si256((__m256i*)gp);
+          __m256i mb = _mm256_load_si256((__m256i*)bp);
+          __m256i t = _mm256_add_epi32(mr, mb);
+          t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1)); // r + 2g + b
+          _mm256_store_si256((__m256i*)yp, _mm256_srai_epi32(t, 2));
+          t = _mm256_sub_epi32(mb, mg);                      // cb = b - g
+          _mm256_store_si256((__m256i*)cbp, t);
+          t = _mm256_sub_epi32(mr, mg);                      // cr = r - g
+          _mm256_store_si256((__m256i*)crp, t);
+
+          rp += 8; gp += 8; bp += 8;
+          yp += 8; cbp += 8; crp += 8;
+        }
+      }
+      else
+      {
+        assert((y->flags  & line_buf::LFT_64BIT) &&
+               (cb->flags & line_buf::LFT_64BIT) &&
+               (cr->flags & line_buf::LFT_64BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); // for srai by 2
+        const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
+        for (int i = (repeat + 7) >> 3; i > 0; --i) // 8 samples per pass
+        {
+          __m256i mr32 = _mm256_load_si256((__m256i*)rp);
+          __m256i mg32 = _mm256_load_si256((__m256i*)gp);
+          __m256i mb32 = _mm256_load_si256((__m256i*)bp);
+          __m256i mr, mg, mb, t;
+          mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 0));
+          mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 0));
+          mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 0));
+
+          t = _mm256_add_epi64(mr, mb);
+          t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1)); // r + 2g + b
+          _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2));
+          t = _mm256_sub_epi64(mb, mg);                      // cb = b - g
+          _mm256_store_si256((__m256i*)cbp, t);
+          t = _mm256_sub_epi64(mr, mg);                      // cr = r - g
+          _mm256_store_si256((__m256i*)crp, t);
+
+          yp += 4; cbp += 4; crp += 4; // lower four samples are stored
+
+          mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 1));
+          mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 1));
+          mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 1));
+
+          t = _mm256_add_epi64(mr, mb);
+          t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1)); // r + 2g + b
+          _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2));
+          t = _mm256_sub_epi64(mb, mg);                      // cb = b - g
+          _mm256_store_si256((__m256i*)cbp, t);
+          t = _mm256_sub_epi64(mr, mg);                      // cr = r - g
+          _mm256_store_si256((__m256i*)crp, t);
+
+          rp += 8; gp += 8; bp += 8;
+          yp += 4; cbp += 4; crp += 4;
+        }
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr,
-                           si32 *r, si32 *g, si32 *b, ui32 repeat)
+    void avx2_rct_backward(const line_buf *y,
+                           const line_buf *cb,
+                           const line_buf *cr,
+                           line_buf *r, line_buf *g, line_buf *b,
+                           ui32 repeat)
     {
-      for (int i = (repeat + 7) >> 3; i > 0; --i)
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) &&
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) &&
+             (b->flags  & line_buf::LFT_INTEGER));
+
+      if (y->flags & line_buf::LFT_32BIT) // 32-bit inputs and outputs
+      {
+        assert((y->flags  & line_buf::LFT_32BIT) &&
+               (cb->flags & line_buf::LFT_32BIT) &&
+               (cr->flags & line_buf::LFT_32BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
+        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        for (int i = (repeat + 7) >> 3; i > 0; --i) // 8 samples per pass
+        {
+          __m256i my  = _mm256_load_si256((__m256i*)yp);
+          __m256i mcb = _mm256_load_si256((__m256i*)cbp);
+          __m256i mcr = _mm256_load_si256((__m256i*)crp);
+
+          __m256i t = _mm256_add_epi32(mcb, mcr);
+          t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2));
+          _mm256_store_si256((__m256i*)gp, t); // g = y - ((cb + cr) >> 2)
+          __m256i u = _mm256_add_epi32(mcb, t);
+          _mm256_store_si256((__m256i*)bp, u); // b = cb + g
+          u = _mm256_add_epi32(mcr, t);
+          _mm256_store_si256((__m256i*)rp, u); // r = cr + g
+
+          yp += 8; cbp += 8; crp += 8;
+          rp += 8; gp += 8; bp += 8;
+        }
+      }
+      else
       {
-        __m256i my  = _mm256_load_si256((__m256i*)y);
-        __m256i mcb = _mm256_load_si256((__m256i*)cb);
-        __m256i mcr = _mm256_load_si256((__m256i*)cr);
-
-        __m256i t = _mm256_add_epi32(mcb, mcr);
-        t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2));
-        _mm256_store_si256((__m256i*)g, t);
-        __m256i u = _mm256_add_epi32(mcb, t);
-        _mm256_store_si256((__m256i*)b, u);
-        u = _mm256_add_epi32(mcr, t);
-        _mm256_store_si256((__m256i*)r, u);
-
-        y += 8; cb += 8; cr += 8;
-        r += 8; g += 8; b += 8;
+        assert((y->flags  & line_buf::LFT_64BIT) &&
+               (cb->flags & line_buf::LFT_64BIT) &&
+               (cr->flags & line_buf::LFT_64BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); // for srai by 2
+        __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX,
+                                             0, (si64)ULLONG_MAX);
+        const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
+        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        for (int i = (repeat + 7) >> 3; i > 0; --i) // 8 samples per pass
+        {
+          __m256i my, mcb, mcr, tr, tg, tb;
+          my  = _mm256_load_si256((__m256i*)yp);
+          mcb = _mm256_load_si256((__m256i*)cbp);
+          mcr = _mm256_load_si256((__m256i*)crp);
+
+          tg = _mm256_add_epi64(mcb, mcr);
+          tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2));
+          tb = _mm256_add_epi64(mcb, tg);    // b = cb + g
+          tr = _mm256_add_epi64(mcr, tg);    // r = cr + g
+
+          __m256i mr, mg, mb;
+          mr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0));
+          mr = _mm256_and_si256(low_bits, mr);
+          mg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0));
+          mg = _mm256_and_si256(low_bits, mg);
+          mb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0));
+          mb = _mm256_and_si256(low_bits, mb);
+
+          yp += 4; cbp += 4; crp += 4; // move on to the next four samples
+
+          my  = _mm256_load_si256((__m256i*)yp);
+          mcb = _mm256_load_si256((__m256i*)cbp);
+          mcr = _mm256_load_si256((__m256i*)crp);
+
+          tg = _mm256_add_epi64(mcb, mcr);
+          tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2));
+          tb = _mm256_add_epi64(mcb, tg);    // b = cb + g
+          tr = _mm256_add_epi64(mcr, tg);    // r = cr + g
+
+          tr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0));
+          tr = _mm256_andnot_si256(low_bits, tr);
+          mr = _mm256_or_si256(mr, tr);
+          mr = _mm256_permute4x64_epi64(mr, _MM_SHUFFLE(3, 1, 2, 0));
+
+          tg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0));
+          tg = _mm256_andnot_si256(low_bits, tg);
+          mg = _mm256_or_si256(mg, tg);
+          mg = _mm256_permute4x64_epi64(mg, _MM_SHUFFLE(3, 1, 2, 0));
+
+          tb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0));
+          tb = _mm256_andnot_si256(low_bits, tb);
+          mb = _mm256_or_si256(mb, tb);
+          mb = _mm256_permute4x64_epi64(mb, _MM_SHUFFLE(3, 1, 2, 0));
+
+          _mm256_store_si256((__m256i*)rp, mr);
+          _mm256_store_si256((__m256i*)gp, mg);
+          _mm256_store_si256((__m256i*)bp, mb);
+
+          yp += 4; cbp += 4; crp += 4;
+          rp += 8; gp += 8; bp += 8;
+        }
       }
     }
 
   }
 }
+
+#endif
diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h
index 6ddf8900..a85bf8bd 100644
--- a/src/core/transform/ojph_colour_local.h
+++ b/src/core/transform/ojph_colour_local.h
@@ -65,32 +65,46 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift,
-                                      ui32 width);
+    void gen_rev_convert(
+      const line_buf *src_line, const ui32 src_line_offset, 
+      line_buf *dst_line, const ui32 dst_line_offset, 
+      si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
-                                       ui32 width);
+    void gen_rev_convert_nlt_type3(
+      const line_buf *src_line, const ui32 src_line_offset, 
+      line_buf *dst_line, const ui32 dst_line_offset, 
+      si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
-                                 ui32 width);
+    void gen_irv_convert_to_float(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                       ui32 width);
+    void gen_irv_convert_to_integer(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                 ui32 width);
+    void gen_irv_convert_to_float_nlt_type3(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_rct_forward(const si32 *r, const si32 *g, const si32 *b,
-                         si32 *y, si32 *cb, si32 *cr, ui32 repeat);
+    void gen_irv_convert_to_integer_nlt_type3(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_rct_backward(const si32 *y, const si32 *cb, const si32 *cr,
-                          si32 *r, si32 *g, si32 *b, ui32 repeat);
+    void gen_rct_forward(
+      const line_buf *r, const line_buf *g, const line_buf *b,
+      line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat);
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_rct_backward(
+      const line_buf *y, const line_buf *cb, const line_buf *cr,
+      line_buf *r, line_buf *g, line_buf *b, ui32 repeat);
 
     //////////////////////////////////////////////////////////////////////////
     void gen_ict_forward(const float *r, const float *g, const float *b,
@@ -108,22 +122,6 @@ namespace ojph {
     //
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
-                                       ui32 width);
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
-                                 ui32 width);
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                       ui32 width);
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                 ui32 width);
-
     //////////////////////////////////////////////////////////////////////////
     void sse_ict_forward(const float *r, const float *g, const float *b,
                          float *y, float *cb, float *cr, ui32 repeat);
@@ -141,12 +139,14 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                        ui32 width);
+    void sse2_irv_convert_to_integer(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                  ui32 width);
+    void sse2_irv_convert_to_integer_nlt_type3(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
     //
@@ -157,40 +157,44 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift,
-                                       ui32 width);
+    void sse2_rev_convert(
+      const line_buf *src_line, const ui32 src_line_offset, 
+      line_buf *dst_line, const ui32 dst_line_offset, 
+      si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b,
-                          si32 *y, si32 *cb, si32 *cr, ui32 repeat);
+    void sse2_rev_convert_nlt_type3(
+      const line_buf *src_line, const ui32 src_line_offset, 
+      line_buf *dst_line, const ui32 dst_line_offset, 
+      si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr,
-                           si32 *r, si32 *g, si32 *b, ui32 repeat);
+    void sse2_irv_convert_to_float(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    //
-    //
-    //                       AVX Functions (float)
-    //
-    //
-    //////////////////////////////////////////////////////////////////////////
+    void sse2_irv_convert_to_float_nlt_type3(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
-                                       ui32 width);
+    void sse2_rct_forward(
+      const line_buf *r, const line_buf *g, const line_buf *b,
+      line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat);
 
     //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
-                                 ui32 width);
+    void sse2_rct_backward(
+      const line_buf *y, const line_buf *cb, const line_buf *cr,
+      line_buf *r, line_buf *g, line_buf *b, ui32 repeat);
 
     //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                       ui32 width);
-
+    //
+    //
+    //                       AVX Functions (float)
+    //
+    //
     //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                 ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
     void avx_ict_forward(const float *r, const float *g, const float *b,
@@ -209,16 +213,46 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift,
-                                       ui32 width);
+    void avx2_rev_convert(
+      const line_buf *src_line, const ui32 src_line_offset, 
+      line_buf *dst_line, const ui32 dst_line_offset, 
+      si64 shift, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rev_convert_nlt_type3(
+      const line_buf *src_line, const ui32 src_line_offset, 
+      line_buf *dst_line, const ui32 dst_line_offset, 
+      si64 shift, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_integer(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b,
-                          si32 *y, si32 *cb, si32 *cr, ui32 repeat);
+    void avx2_irv_convert_to_float(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr,
-                           si32 *r, si32 *g, si32 *b, ui32 repeat);
+    void avx2_irv_convert_to_integer_nlt_type3(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_float_nlt_type3(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rct_forward(
+      const line_buf *r, const line_buf *g, const line_buf *b,
+      line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat);
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rct_backward(
+      const line_buf *y, const line_buf *cb, const line_buf *cr,
+      line_buf *r, line_buf *g, line_buf *b, ui32 repeat);
 
     //////////////////////////////////////////////////////////////////////////
     //
@@ -229,32 +263,46 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
-                                        ui32 width);
+    void wasm_irv_convert_to_integer(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_irv_convert_to_float(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
-                                  ui32 width);
+    void wasm_rev_convert(
+      const line_buf *src_line, const ui32 src_line_offset, 
+      line_buf *dst_line, const ui32 dst_line_offset, 
+      si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                        ui32 width);
+    void wasm_rev_convert_nlt_type3(
+      const line_buf *src_line, const ui32 src_line_offset, 
+      line_buf *dst_line, const ui32 dst_line_offset, 
+      si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                  ui32 width);
+    void wasm_irv_convert_to_integer_nlt_type3(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift,
-                                       ui32 width);
+    void wasm_irv_convert_to_float_nlt_type3(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b,
-                          si32 *y, si32 *cb, si32 *cr, ui32 repeat);
+    void wasm_rct_forward(
+      const line_buf *r, const line_buf *g, const line_buf *b,
+      line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat);
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rct_backward(const si32 *y, const si32 *cb, const si32 *cr,
-                           si32 *r, si32 *g, si32 *b, ui32 repeat);
+    void wasm_rct_backward(
+      const line_buf *y, const line_buf *cb, const line_buf *cr,
+      line_buf *r, line_buf *g, line_buf *b, ui32 repeat);
 
     //////////////////////////////////////////////////////////////////////////
     void wasm_ict_forward(const float *r, const float *g, const float *b,
diff --git a/src/core/transform/ojph_colour_sse.cpp b/src/core/transform/ojph_colour_sse.cpp
index 89cc86c2..344f4987 100644
--- a/src/core/transform/ojph_colour_sse.cpp
+++ b/src/core/transform/ojph_colour_sse.cpp
@@ -35,100 +35,20 @@
 // Date: 11 October 2019
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
+
 #include <cmath>
 
 #include "ojph_defs.h"
-#include "ojph_arch.h"
 #include "ojph_colour.h"
 #include "ojph_colour_local.h"
 
-#include <immintrin.h>
+#include <xmmintrin.h>
 
 namespace ojph {
   namespace local {
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
-                                       ui32 width)
-    {
-      __m128 shift = _mm_set1_ps(0.5f);
-      __m128 m = _mm_set1_ps(mul);
-      for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
-      {
-        __m128i t = _mm_castps_si128(_mm_loadu_ps((float*)sp));
-        __m128 s = _mm_cvtepi32_ps(t);
-        s = _mm_mul_ps(s, m);
-        s = _mm_sub_ps(s, shift);
-        _mm_store_ps(dp, s);
-      }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
-                                 ui32 width)
-    {
-      __m128 m = _mm_set1_ps(mul);
-      for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
-      {
-        __m128i t = _mm_castps_si128(_mm_loadu_ps((float*)sp));
-        __m128 s = _mm_cvtepi32_ps(t);
-        s = _mm_mul_ps(s, m);
-        _mm_store_ps(dp, s);
-      }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                       ui32 width)
-    {
-      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
-      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
-      __m128 shift = _mm_set1_ps(0.5f);
-      __m128 m = _mm_set1_ps(mul);
-      for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4)
-      {
-        __m128 t = _mm_load_ps(sp);
-        __m128 s = _mm_add_ps(t, shift);
-        s = _mm_mul_ps(s, m);
-        // the following is a poorly designed code, but it is the only
-        // code that I am aware of that compiles on VS 32 and 64 modes
-        t = s;
-        *dp++ = _mm_cvtss_si32(t); 
-        t = _mm_shuffle_ps(s, s, 1);
-        *dp++ = _mm_cvtss_si32(t); 
-        t = _mm_shuffle_ps(s, s, 2);
-        *dp++ = _mm_cvtss_si32(t); 
-        t = _mm_shuffle_ps(s, s, 3);
-        *dp++ = _mm_cvtss_si32(t);
-      }
-      _MM_SET_ROUNDING_MODE(rounding_mode);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                 ui32 width)
-    {
-      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
-      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
-      __m128 m = _mm_set1_ps(mul);
-      for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4)
-      {
-        __m128 t = _mm_load_ps(sp);
-        __m128 s = _mm_mul_ps(t, m);
-        // the following is a poorly designed code, but it is the only
-        // code that I am aware of that compiles on VS 32 and 64 modes
-        t = s;
-        *dp++ = _mm_cvtss_si32(t);
-        t = _mm_shuffle_ps(s, s, 1);
-        *dp++ = _mm_cvtss_si32(t);
-        t = _mm_shuffle_ps(s, s, 2);
-        *dp++ = _mm_cvtss_si32(t);
-        t = _mm_shuffle_ps(s, s, 3);
-        *dp++ = _mm_cvtss_si32(t);
-      }
-      _MM_SET_ROUNDING_MODE(rounding_mode);
-    }
-
     //////////////////////////////////////////////////////////////////////////
     void sse_ict_forward(const float *r, const float *g, const float *b,
                          float *y, float *cb, float *cr, ui32 repeat)
@@ -178,3 +98,5 @@ namespace ojph {
     }
   }
 }
+
+#endif
diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp
index 4bb56f29..2bfb08c0 100644
--- a/src/core/transform/ojph_colour_sse2.cpp
+++ b/src/core/transform/ojph_colour_sse2.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -35,13 +35,17 @@
 // Date: 11 October 2019
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
+
+#include <climits>
 #include <cmath>
 
 #include "ojph_defs.h"
-#include "ojph_arch.h"
+#include "ojph_mem.h"
 #include "ojph_colour.h"
 
-#include <immintrin.h>
+#include <emmintrin.h>
 
 namespace ojph {
   namespace local {
@@ -66,7 +70,7 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                 ui32 width)
+                                  ui32 width)
     {
       uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
       _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
@@ -80,64 +84,579 @@ namespace ojph {
       _MM_SET_ROUNDING_MODE(rounding_mode);
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline // per 32-bit lane: picks a where x >= y, otherwise b
+    __m128i ojph_mm_max_ge_epi32(__m128i a, __m128i b, __m128 x, __m128 y)
+    { // x, y are compared as floats; a, b are only masked and merged
+      __m128 ct = _mm_cmpge_ps(x, y);     // 0xFFFFFFFF for x >= y
+      __m128i c = _mm_castps_si128(ct);   // does not generate any code
+      __m128i d = _mm_and_si128(c, a);    // keep only a, where x >= y
+      __m128i e = _mm_andnot_si128(c, b); // keep only b, where x <  y
+      return _mm_or_si128(d, e);          // combine
+    }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift,
-                                       ui32 width)
+    static inline // per 32-bit lane: picks a where x < y, otherwise b
+    __m128i ojph_mm_min_lt_epi32(__m128i a, __m128i b, __m128 x, __m128 y)
     {
-      __m128i sh = _mm_set1_epi32(shift);
-      for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+      __m128 ct = _mm_cmplt_ps(x, y);     // 0xFFFFFFFF for x < y
+      __m128i c = _mm_castps_si128(ct);   // does not generate any code
+      __m128i d = _mm_and_si128(c, a);    // keep only a, where x <  y
+      __m128i e = _mm_andnot_si128(c, b); // keep only b, where x >= y
+      return _mm_or_si128(d, e);          // combine the two masked halves
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    template <bool NLT_TYPE3> // true: also remap negative signed results
+    static inline
+    void local_sse2_irv_convert_to_integer(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // dst = clamp(round(src * 2^bit_depth)), stored as 32-bit integers
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER));
+
+      assert(bit_depth <= 32);
+      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); // restored at exit
+      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+
+      const float* sp = src_line->f32;
+      si32* dp = dst_line->i32 + dst_line_offset;
+      // There is the possibility that converting to integer will
+      // exceed the dynamic range of 32bit integer; therefore, care must be
+      // exercised.
+      // We look if the floating point number is outside the half-closed
+      // interval [-0.5f, 0.5f). If so, we limit the resulting integer
+      // to the maximum/minimum that number supports.
+      si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
+      __m128 mul = _mm_set1_ps((float)(1ull << bit_depth));
+      __m128 fl_up_lim = _mm_set1_ps(-(float)neg_limit); // val < upper
+      __m128 fl_low_lim = _mm_set1_ps((float)neg_limit); // val >= lower
+      __m128i s32_up_lim = _mm_set1_epi32(INT_MAX >> (32 - bit_depth));
+      __m128i s32_low_lim = _mm_set1_epi32(INT_MIN >> (32 - bit_depth));
+
+      if (is_signed)
+      {
+        __m128i zero = _mm_setzero_si128();
+        __m128i bias = _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
+          __m128 t = _mm_loadu_ps(sp); // always a full group of 4 samples
+          t = _mm_mul_ps(t, mul);
+          __m128i u = _mm_cvtps_epi32(t); // rounds with the mode set above
+          u = ojph_mm_max_ge_epi32(u, s32_low_lim, t, fl_low_lim); // clamp
+          u = ojph_mm_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);   // clamp
+          if (NLT_TYPE3) // compile-time constant
+          {
+            __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
+            __m128i neg = _mm_sub_epi32(bias, u); //-bias -value
+            neg = _mm_and_si128(c, neg);          //keep only - bias - value
+            u = _mm_andnot_si128(c, u);           //keep only +ve or 0
+            u = _mm_or_si128(neg, u);             //combine
+          }
+          _mm_storeu_si128((__m128i*)dp, u);
+        }
+      }
+      else
+      {
+        __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
+          __m128 t = _mm_loadu_ps(sp); // always a full group of 4 samples
+          t = _mm_mul_ps(t, mul);
+          __m128i u = _mm_cvtps_epi32(t); // rounds with the mode set above
+          u = ojph_mm_max_ge_epi32(u, s32_low_lim, t, fl_low_lim); // clamp
+          u = ojph_mm_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);   // clamp
+          u = _mm_add_epi32(u, half); // add 2^(bit_depth - 1)
+          _mm_storeu_si128((__m128i*)dp, u);
+        }
+      }
+
+      _MM_SET_ROUNDING_MODE(rounding_mode);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_irv_convert_to_integer(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // forwards all arguments to the shared template, NLT_TYPE3 = false
+      local_sse2_irv_convert_to_integer<false>(src_line, dst_line, 
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // forwards all arguments to the shared template, NLT_TYPE3 = true
+      local_sse2_irv_convert_to_integer<true>(src_line, dst_line, 
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
+    { // 64-bit lanes: logical shift right by amt, then sign-extend using m
+      // note that m must be obtained using
+      // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
+      __m128i x = _mm_srli_epi64(a, amt);   // shifts in zeros at the top
+      x = _mm_xor_si128(x, m);              // m is where the sign bit lands
+      __m128i result = _mm_sub_epi64(x, m); // (x ^ m) - m extends the sign
+      return result;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline __m128i sse2_cvtlo_epi32_epi64(__m128i a, __m128i zero)
+    { // sign-extends the two low 32-bit lanes of a; callers pass zero = 0
+      __m128i t;
+      t = _mm_cmplt_epi32(a, zero);      // all ones in lanes where a < zero
+      t = _mm_unpacklo_epi32(a, t);      // pair each low lane with its mask
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline __m128i sse2_cvthi_epi32_epi64(__m128i a, __m128i zero)
+    { // sign-extends the two high 32-bit lanes of a; callers pass zero = 0
+      __m128i t;
+      t = _mm_cmplt_epi32(a, zero);      // all ones in lanes where a < zero
+      t = _mm_unpackhi_epi32(a, t);      // pair each high lane with its mask
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_convert(const line_buf *src_line,
+                          const ui32 src_line_offset,
+                          line_buf *dst_line,
+                          const ui32 dst_line_offset,
+                          si64 shift, ui32 width)
+    { // dst[i] = src[i] + shift for 32->32, 32->64 and 64->32 bit lines
+      if (src_line->flags & line_buf::LFT_32BIT)
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)
+        { // 32-bit source and destination; shift is truncated to si32
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          __m128i sh = _mm_set1_epi32((si32)shift);
+          for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+          { // groups of 4 samples; width is rounded up to a multiple of 4
+            __m128i s = _mm_loadu_si128((__m128i*)sp);
+            s = _mm_add_epi32(s, sh);
+            _mm_storeu_si128((__m128i*)dp, s);
+          }
+        }
+        else
+        { // 32-bit source; destination is written through its i64 pointer
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          __m128i zero = _mm_setzero_si128();
+          __m128i sh = _mm_set1_epi64x(shift);
+          for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+          {
+            __m128i s, t;
+            s = _mm_loadu_si128((__m128i*)sp);
+
+            t = sse2_cvtlo_epi32_epi64(s, zero); // samples 0,1 as 64-bit
+            t = _mm_add_epi64(t, sh);
+            _mm_storeu_si128((__m128i*)dp, t);
+
+            t = sse2_cvthi_epi32_epi64(s, zero); // samples 2,3 as 64-bit
+            t = _mm_add_epi64(t, sh);
+            _mm_storeu_si128((__m128i*)dp + 1, t);
+          }
+        }
+      }
+      else
+      { // 64-bit source, 32-bit destination; sums keep their low 32 bits
+        assert(src_line->flags & line_buf::LFT_64BIT); // '&' tests the flag;
+        assert(dst_line->flags & line_buf::LFT_32BIT); // '|' is always true
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
+        __m128i sh = _mm_set1_epi64x(shift);
+        for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+        {
+          __m128i s, t;
+          s = _mm_loadu_si128((__m128i*)sp);     // samples 0,1
+          s = _mm_add_epi64(s, sh);
+
+          t = _mm_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0)); // low words
+          t = _mm_and_si128(low_bits, t);        // keep output lanes 0,1
+
+          s = _mm_loadu_si128((__m128i*)sp + 1); // samples 2,3
+          s = _mm_add_epi64(s, sh);
+
+          s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0)); // low words
+          s = _mm_andnot_si128(low_bits, s);     // keep output lanes 2,3
+
+          t = _mm_or_si128(s, t);
+          _mm_storeu_si128((__m128i*)dp, t);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_convert_nlt_type3(const line_buf *src_line,
+                                    const ui32 src_line_offset,
+                                    line_buf *dst_line,
+                                    const ui32 dst_line_offset,
+                                    si64 shift, ui32 width)
+    { // dst[i] = src[i] < 0 ? -shift - src[i] : src[i], per sample
+      if (src_line->flags & line_buf::LFT_32BIT)
       {
-        __m128i s = _mm_loadu_si128((__m128i*)sp);
-        s = _mm_add_epi32(s, sh);
-        _mm_storeu_si128((__m128i*)dp, s);
+        if (dst_line->flags & line_buf::LFT_32BIT)
+        { // 32-bit source and destination; -shift is truncated to si32
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          __m128i sh = _mm_set1_epi32((si32)(-shift));
+          __m128i zero = _mm_setzero_si128();
+          for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
+          { // groups of 4 samples; width is rounded up to a multiple of 4
+            __m128i s = _mm_loadu_si128((__m128i*)sp);
+            __m128i c = _mm_cmplt_epi32(s, zero);  // 0xFFFFFFFF for -ve value
+            __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value
+            v_m_sh = _mm_and_si128(c, v_m_sh);     // keep only - shift - value
+            s = _mm_andnot_si128(c, s);            // keep only +ve or 0
+            s = _mm_or_si128(s, v_m_sh);           // combine
+            _mm_storeu_si128((__m128i*)dp, s);
+          }
+        }
+        else
+        { // 32-bit source; destination is written through its i64 pointer
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          __m128i sh = _mm_set1_epi64x(-shift);
+          __m128i zero = _mm_setzero_si128();
+          for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
+          {
+            __m128i s, t, u, c, v_m_sh;
+            s = _mm_loadu_si128((__m128i*)sp);
+
+            t = _mm_cmplt_epi32(s, zero);      // find -ve 32bit -1
+            u = _mm_unpacklo_epi32(s, t);      // correct 64bit data
+            c = _mm_unpacklo_epi32(t, t);      // 64bit -1 for -ve value
+
+            v_m_sh = _mm_sub_epi64(sh, u);     // - shift - value
+            v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value
+            u = _mm_andnot_si128(c, u);        // keep only +ve or 0
+            u = _mm_or_si128(u, v_m_sh);       // combine
+
+            _mm_storeu_si128((__m128i*)dp, u);
+            u = _mm_unpackhi_epi32(s, t);      // correct 64bit data
+            c = _mm_unpackhi_epi32(t, t);      // 64bit -1 for -ve value
+
+            v_m_sh = _mm_sub_epi64(sh, u);     // - shift - value
+            v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value
+            u = _mm_andnot_si128(c, u);        // keep only +ve or 0
+            u = _mm_or_si128(u, v_m_sh);       // combine
+
+            _mm_storeu_si128((__m128i*)dp + 1, u);
+          }
+        }
+      }
+      else
+      { // 64-bit source, 32-bit destination; results keep their low 32 bits
+        assert(src_line->flags & line_buf::LFT_64BIT); // '&' tests the flag;
+        assert(dst_line->flags & line_buf::LFT_32BIT); // '|' is always true
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        __m128i sh = _mm_set1_epi64x(-shift);
+        __m128i zero = _mm_setzero_si128();
+        __m128i half_mask = _mm_set_epi64x(0, (si64)ULLONG_MAX);
+        for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
+        {
+          // s for source, t for target, p for positive, n for negative,
+          // m for mask, and tm for temp
+          __m128i s, t, p, n, m, tm;
+          s = _mm_loadu_si128((__m128i*)sp);
+
+          tm = _mm_cmplt_epi32(s, zero);   // 32b -1 for -ve value
+          m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b
+          tm = _mm_sub_epi64(sh, s);       // - shift - value
+          n = _mm_and_si128(m, tm);        // -ve
+          p = _mm_andnot_si128(m, s);      // +ve
+          tm = _mm_or_si128(n, p);
+          tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0));
+          t = _mm_and_si128(half_mask, tm);
+
+          s = _mm_loadu_si128((__m128i*)sp + 1);
+          tm = _mm_cmplt_epi32(s, zero);   // 32b -1 for -ve value
+          m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b
+          tm = _mm_sub_epi64(sh, s);       // - shift - value
+          n = _mm_and_si128(m, tm);        // -ve
+          p = _mm_andnot_si128(m, s);      // +ve
+          tm = _mm_or_si128(n, p);
+          tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0));
+          tm = _mm_andnot_si128(half_mask, tm);
+
+          t = _mm_or_si128(t, tm);
+          _mm_storeu_si128((__m128i*)dp, t);
+        }
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b,
-                          si32 *y, si32 *cb, si32 *cr, ui32 repeat)
+    template<bool NLT_TYPE3> // true: remap negative signed inputs first
+    static inline
+    void local_sse2_irv_convert_to_float(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
     {
-      for (int i = (repeat + 3) >> 2; i > 0; --i)
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER) == 0);
+
+      assert(bit_depth <= 32); // mul below holds 2^(-bit_depth)
+      __m128 mul = _mm_set1_ps((float)(1.0 / (double)(1ULL << bit_depth)));
+
+      const si32* sp = src_line->i32 + src_line_offset;
+      float* dp = dst_line->f32; // the destination is written from its start
+      if (is_signed)
       {
-        __m128i mr = _mm_load_si128((__m128i*)r);
-        __m128i mg = _mm_load_si128((__m128i*)g);
-        __m128i mb = _mm_load_si128((__m128i*)b);
-        __m128i t = _mm_add_epi32(mr, mb);
-        t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1));
-        _mm_store_si128((__m128i*)y, _mm_srai_epi32(t, 2));
-        t = _mm_sub_epi32(mb, mg);
-        _mm_store_si128((__m128i*)cb, t);
-        t = _mm_sub_epi32(mr, mg);
-        _mm_store_si128((__m128i*)cr, t);
-
-        r += 4; g += 4; b += 4;
-        y += 4; cb += 4; cr += 4;
+        __m128i zero = _mm_setzero_si128();
+        __m128i bias = _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
+          __m128i t = _mm_loadu_si128((__m128i*)sp); // full group of 4
+          if (NLT_TYPE3) // compile-time constant
+          {
+            __m128i c = _mm_cmplt_epi32(t, zero); // 0xFFFFFFFF for -ve value
+            __m128i neg = _mm_sub_epi32(bias, t); // - bias - value
+            neg = _mm_and_si128(c, neg);          // keep only - bias - value
+            c = _mm_andnot_si128(c, t);           // keep only +ve or 0
+            t = _mm_or_si128(neg, c);             // combine
+          }
+          __m128 v = _mm_cvtepi32_ps(t);
+          v = _mm_mul_ps(v, mul); // scale by 2^(-bit_depth)
+          _mm_storeu_ps(dp, v);
+        }
       }
+      else
+      {
+        __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
+          __m128i t = _mm_loadu_si128((__m128i*)sp); // full group of 4
+          t = _mm_sub_epi32(t, half); // subtract 2^(bit_depth - 1)
+          __m128 v = _mm_cvtepi32_ps(t);
+          v = _mm_mul_ps(v, mul); // scale by 2^(-bit_depth)
+          _mm_storeu_ps(dp, v);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_irv_convert_to_float(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // forwards all arguments to the shared template, NLT_TYPE3 = false
+      local_sse2_irv_convert_to_float<false>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // forwards all arguments to the shared template, NLT_TYPE3 = true
+      local_sse2_irv_convert_to_float<true>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr,
-                           si32 *r, si32 *g, si32 *b, ui32 repeat)
+    void sse2_rct_forward(const line_buf *r,
+                          const line_buf *g,
+                          const line_buf *b,
+                          line_buf *y, line_buf *cb, line_buf *cr,
+                          ui32 repeat)
     {
-      for (int i = (repeat + 3) >> 2; i > 0; --i)
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) &&
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) &&
+             (b->flags  & line_buf::LFT_INTEGER));
+
+      if  (y->flags & line_buf::LFT_32BIT)
+      {
+        assert((y->flags  & line_buf::LFT_32BIT) &&
+               (cb->flags & line_buf::LFT_32BIT) &&
+               (cr->flags & line_buf::LFT_32BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32;
+        si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32;
+        for (int i = (repeat + 3) >> 2; i > 0; --i)
+        {
+          __m128i mr = _mm_load_si128((__m128i*)rp);
+          __m128i mg = _mm_load_si128((__m128i*)gp);
+          __m128i mb = _mm_load_si128((__m128i*)bp);
+          __m128i t = _mm_add_epi32(mr, mb);
+          t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1));
+          _mm_store_si128((__m128i*)yp, _mm_srai_epi32(t, 2));
+          t = _mm_sub_epi32(mb, mg);
+          _mm_store_si128((__m128i*)cbp, t);
+          t = _mm_sub_epi32(mr, mg);
+          _mm_store_si128((__m128i*)crp, t);
+
+          rp += 4; gp += 4; bp += 4;
+          yp += 4; cbp += 4; crp += 4;
+        }
+      }
+      else
       {
-        __m128i my  = _mm_load_si128((__m128i*)y);
-        __m128i mcb = _mm_load_si128((__m128i*)cb);
-        __m128i mcr = _mm_load_si128((__m128i*)cr);
-
-        __m128i t = _mm_add_epi32(mcb, mcr);
-        t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2));
-        _mm_store_si128((__m128i*)g, t);
-        __m128i u = _mm_add_epi32(mcb, t);
-        _mm_store_si128((__m128i*)b, u);
-        u = _mm_add_epi32(mcr, t);
-        _mm_store_si128((__m128i*)r, u);
-
-        y += 4; cb += 4; cr += 4;
-        r += 4; g += 4; b += 4;
+        assert((y->flags  & line_buf::LFT_64BIT) &&
+               (cb->flags & line_buf::LFT_64BIT) &&
+               (cr->flags & line_buf::LFT_64BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        __m128i zero = _mm_setzero_si128();
+        __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2));
+        const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
+        for (int i = (repeat + 3) >> 2; i > 0; --i)
+        {
+          __m128i mr32 = _mm_load_si128((__m128i*)rp);
+          __m128i mg32 = _mm_load_si128((__m128i*)gp);
+          __m128i mb32 = _mm_load_si128((__m128i*)bp);
+          __m128i mr, mg, mb, t;
+          mr = sse2_cvtlo_epi32_epi64(mr32, zero);
+          mg = sse2_cvtlo_epi32_epi64(mg32, zero);
+          mb = sse2_cvtlo_epi32_epi64(mb32, zero);
+
+          t = _mm_add_epi64(mr, mb);
+          t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));
+          _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2));
+          t = _mm_sub_epi64(mb, mg);
+          _mm_store_si128((__m128i*)cbp, t);
+          t = _mm_sub_epi64(mr, mg);
+          _mm_store_si128((__m128i*)crp, t);
+
+          yp += 2; cbp += 2; crp += 2;
+
+          mr = sse2_cvthi_epi32_epi64(mr32, zero);
+          mg = sse2_cvthi_epi32_epi64(mg32, zero);
+          mb = sse2_cvthi_epi32_epi64(mb32, zero);
+
+          t = _mm_add_epi64(mr, mb);
+          t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));
+          _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2));
+          t = _mm_sub_epi64(mb, mg);
+          _mm_store_si128((__m128i*)cbp, t);
+          t = _mm_sub_epi64(mr, mg);
+          _mm_store_si128((__m128i*)crp, t);
+
+          rp += 4; gp += 4; bp += 4;
+          yp += 2; cbp += 2; crp += 2;
+        }
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rct_backward(const line_buf *y,
+                           const line_buf *cb,
+                           const line_buf *cr,
+                           line_buf *r, line_buf *g, line_buf *b,
+                           ui32 repeat)
+    {
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) &&
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) &&
+             (b->flags  & line_buf::LFT_INTEGER));
+
+      if (y->flags & line_buf::LFT_32BIT)
+      {
+        assert((y->flags  & line_buf::LFT_32BIT) &&
+               (cb->flags & line_buf::LFT_32BIT) &&
+               (cr->flags & line_buf::LFT_32BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
+        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        for (int i = (repeat + 3) >> 2; i > 0; --i)
+        {
+          __m128i my  = _mm_load_si128((__m128i*)yp);
+          __m128i mcb = _mm_load_si128((__m128i*)cbp);
+          __m128i mcr = _mm_load_si128((__m128i*)crp);
+
+          __m128i t = _mm_add_epi32(mcb, mcr);
+          t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2));
+          _mm_store_si128((__m128i*)gp, t);
+          __m128i u = _mm_add_epi32(mcb, t);
+          _mm_store_si128((__m128i*)bp, u);
+          u = _mm_add_epi32(mcr, t);
+          _mm_store_si128((__m128i*)rp, u);
+
+          yp += 4; cbp += 4; crp += 4;
+          rp += 4; gp += 4; bp += 4;
+        }
+      }
+      else
+      {
+        assert((y->flags  & line_buf::LFT_64BIT) &&
+               (cb->flags & line_buf::LFT_64BIT) &&
+               (cr->flags & line_buf::LFT_64BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2));
+        __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
+        const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
+        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        for (int i = (repeat + 3) >> 2; i > 0; --i)
+        {
+          __m128i my, mcb, mcr, tr, tg, tb;
+          my  = _mm_load_si128((__m128i*)yp);
+          mcb = _mm_load_si128((__m128i*)cbp);
+          mcr = _mm_load_si128((__m128i*)crp);
+
+          tg = _mm_add_epi64(mcb, mcr);
+          tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2));
+          tb = _mm_add_epi64(mcb, tg);
+          tr = _mm_add_epi64(mcr, tg);
+
+          __m128i mr, mg, mb;
+          mr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0));
+          mr = _mm_and_si128(low_bits, mr);
+          mg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0));
+          mg = _mm_and_si128(low_bits, mg);
+          mb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0));
+          mb = _mm_and_si128(low_bits, mb);
+
+          yp += 2; cbp += 2; crp += 2;
+
+          my  = _mm_load_si128((__m128i*)yp);
+          mcb = _mm_load_si128((__m128i*)cbp);
+          mcr = _mm_load_si128((__m128i*)crp);
+
+          tg = _mm_add_epi64(mcb, mcr);
+          tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2));
+          tb = _mm_add_epi64(mcb, tg);
+          tr = _mm_add_epi64(mcr, tg);
+
+          tr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0));
+          tr = _mm_andnot_si128(low_bits, tr);
+          mr = _mm_or_si128(mr, tr);
+          tg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0));
+          tg = _mm_andnot_si128(low_bits, tg);
+          mg = _mm_or_si128(mg, tg);
+          tb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0));
+          tb = _mm_andnot_si128(low_bits, tb);
+          mb = _mm_or_si128(mb, tb);
+
+          _mm_store_si128((__m128i*)rp, mr);
+          _mm_store_si128((__m128i*)gp, mg);
+          _mm_store_si128((__m128i*)bp, mb);
+
+          yp += 2; cbp += 2; crp += 2;
+          rp += 4; gp += 4; bp += 4;
+        }
+      }
+    }
   }
 }
+
+#endif
diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp
index 632a6454..aa9a79eb 100644
--- a/src/core/transform/ojph_colour_wasm.cpp
+++ b/src/core/transform/ojph_colour_wasm.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2021, Aous Naman 
+// Copyright (c) 2021, Aous Naman
 // Copyright (c) 2021, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2021, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -35,16 +35,183 @@
 // Date: 9 February 2021
 //***************************************************************************/
 
+#include <climits>
 #include <cmath>
 #include <wasm_simd128.h>
 
 #include "ojph_defs.h"
+#include "ojph_mem.h"
 #include "ojph_colour.h"
 #include "ojph_colour_local.h"
 
 namespace ojph {
   namespace local {
-    
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    v128_t ojph_convert_float_to_i32(v128_t a, v128_t zero, v128_t half)
+    { // Vector ojph_round; callers pass zero/half as splats of 0.0f/0.5f:
+      // val + (val >= 0.0f ? 0.5f : -0.5f), where val is float
+      v128_t c = wasm_f32x4_ge(a, zero);   // mask: all ones where a >= 0
+      v128_t p = wasm_f32x4_add(a, half);  // for positive, add half
+      v128_t n = wasm_f32x4_sub(a, half);  // for negative, subtract half
+      v128_t d = wasm_v128_and(c, p);      // c & p: keep positive only
+      v128_t e = wasm_v128_andnot(n, c);   // n & ~c: keep negative only
+      v128_t v = wasm_v128_or(d, e);       // combine
+      return wasm_i32x4_trunc_sat_f32x4(v);// truncate (towards 0), saturating
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_rev_convert(const line_buf *src_line,
+                          const ui32 src_line_offset,
+                          line_buf *dst_line,
+                          const ui32 dst_line_offset,
+                          si64 shift, ui32 width)
+    { // dst = src + shift; processes ceil(width / 4) groups of 4 samples
+      if (src_line->flags & line_buf::LFT_32BIT)
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)
+        { // 32-bit source -> 32-bit destination
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          v128_t sh = wasm_i32x4_splat((si32)shift);
+          for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+          {
+            v128_t s = wasm_v128_load(sp);
+            s = wasm_i32x4_add(s, sh);
+            wasm_v128_store(dp, s);
+          }
+        }
+        else
+        { // 32-bit source -> 64-bit destination (sign-extended)
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          v128_t sh = wasm_i64x2_splat(shift);
+          for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+          {
+            v128_t s, t;
+            s = wasm_v128_load(sp);
+
+            t = wasm_i64x2_extend_low_i32x4(s);
+            t = wasm_i64x2_add(t, sh);
+            wasm_v128_store(dp, t);
+
+            t = wasm_i64x2_extend_high_i32x4(s);
+            t = wasm_i64x2_add(t, sh);
+            wasm_v128_store(dp + 2, t);
+          }
+        }
+      }
+      else
+      { // 64-bit source -> 32-bit destination (low 32 bits kept)
+        assert(src_line->flags & line_buf::LFT_64BIT);
+        assert(dst_line->flags & line_buf::LFT_32BIT);
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        v128_t sh = wasm_i64x2_splat(shift);
+        for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+        {
+          v128_t s0, s1;
+          s0 = wasm_v128_load(sp);
+          s0 = wasm_i64x2_add(s0, sh);
+          s1 = wasm_v128_load(sp + 2);
+          s1 = wasm_i64x2_add(s1, sh);
+          s0 = wasm_i32x4_shuffle(s0, s1, 0, 2, 4 + 0, 4 + 2);
+          wasm_v128_store(dp, s0);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_rev_convert_nlt_type3(const line_buf *src_line,
+                                    const ui32 src_line_offset,
+                                    line_buf *dst_line,
+                                    const ui32 dst_line_offset,
+                                    si64 shift, ui32 width)
+    { // dst = src for src >= 0, and (-shift - src) for src < 0
+      if (src_line->flags & line_buf::LFT_32BIT)
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)
+        { // 32-bit source -> 32-bit destination
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          v128_t sh = wasm_i32x4_splat((si32)(-shift));
+          v128_t zero = wasm_i32x4_splat(0);
+          for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
+          {
+            v128_t s = wasm_v128_load(sp);
+            v128_t c = wasm_i32x4_lt(s, zero);     // 0xFFFFFFFF for -ve value
+            v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value
+            v_m_sh = wasm_v128_and(c, v_m_sh);     // keep only - shift - value
+            s = wasm_v128_andnot(s, c);            // keep only +ve or 0
+            s = wasm_v128_or(s, v_m_sh);           // combine
+            wasm_v128_store(dp, s);
+          }
+        }
+        else
+        { // 32-bit source -> 64-bit destination (sign-extended)
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          v128_t sh = wasm_i64x2_splat(-shift);
+          v128_t zero = wasm_i32x4_splat(0);
+          for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
+          {
+            v128_t s, u, c, v_m_sh;
+            s = wasm_v128_load(sp);
+
+            u = wasm_i64x2_extend_low_i32x4(s);
+            c = wasm_i64x2_lt(u, zero);        // 64b -1 for -ve value
+            v_m_sh = wasm_i64x2_sub(sh, u);    // - shift - value
+            v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value
+            u = wasm_v128_andnot(u, c);        // keep only +ve or 0
+            u = wasm_v128_or(u, v_m_sh);       // combine
+
+            wasm_v128_store(dp, u);
+
+            u = wasm_i64x2_extend_high_i32x4(s);
+            c = wasm_i64x2_lt(u, zero);        // 64b -1 for -ve value
+            v_m_sh = wasm_i64x2_sub(sh, u);    // - shift - value
+            v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value
+            u = wasm_v128_andnot(u, c);        // keep only +ve or 0
+            u = wasm_v128_or(u, v_m_sh);       // combine
+
+            wasm_v128_store(dp + 2, u);
+          }
+        }
+      }
+      else
+      { // 64-bit source -> 32-bit destination (low 32 bits kept)
+        assert(src_line->flags & line_buf::LFT_64BIT);
+        assert(dst_line->flags & line_buf::LFT_32BIT);
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        v128_t sh = wasm_i64x2_splat(-shift);
+        v128_t zero = wasm_i32x4_splat(0);
+        for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
+        {
+          // s for source, t for target, p for positive, n for negative,
+          // m for mask, and tm for temp
+          v128_t s, t0, t1, p, n, m, tm;
+          s = wasm_v128_load(sp);
+          m = wasm_i64x2_lt(s, zero);   // 64b -1 for -ve value
+          tm = wasm_i64x2_sub(sh, s);   // - shift - value
+          n = wasm_v128_and(m, tm);     // -ve
+          p = wasm_v128_andnot(s, m);   // +ve
+          t0 = wasm_v128_or(n, p);
+
+          s = wasm_v128_load(sp + 2);
+          m = wasm_i64x2_lt(s, zero);   // 64b -1 for -ve value
+          tm = wasm_i64x2_sub(sh, s);   // - shift - value
+          n = wasm_v128_and(m, tm);     // -ve
+          p = wasm_v128_andnot(s, m);   // +ve
+          t1 = wasm_v128_or(n, p);
+
+          t0 = wasm_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2);
+          wasm_v128_store(dp, t0);
+        }
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
                                         ui32 width)
@@ -79,16 +246,16 @@ namespace ojph {
     void wasm_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
                                         ui32 width)
     {
-      // rounding mode is always set to _MM_ROUND_NEAREST
-      v128_t shift = wasm_f32x4_splat(0.5f);
+      const v128_t zero = wasm_f32x4_splat(0.0f); // for the sign test
+      const v128_t half = wasm_f32x4_splat(0.5f); // input shift and rounding
       v128_t m = wasm_f32x4_splat(mul);
       for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
       {
         v128_t t = wasm_v128_load(sp);
-        v128_t s = wasm_f32x4_add(t, shift);
+        v128_t s = wasm_f32x4_add(t, half); // shift input by + 0.5
         s = wasm_f32x4_mul(s, m);
-        s = wasm_f32x4_add(s, shift); // + 0.5 and followed by floor next
-        wasm_v128_store(dp, wasm_i32x4_trunc_sat_f32x4(s));
+        // no extra + 0.5 here; ojph_convert_float_to_i32 rounds to nearest
+        wasm_v128_store(dp, ojph_convert_float_to_i32(s, zero, half));
       }
     }
 
@@ -96,74 +263,366 @@ namespace ojph {
     void wasm_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
                                   ui32 width)
     {
-      // rounding mode is always set to _MM_ROUND_NEAREST
-      v128_t shift = wasm_f32x4_splat(0.5f);
+      const v128_t zero = wasm_f32x4_splat(0.0f); // for the sign test
+      const v128_t half = wasm_f32x4_splat(0.5f); // rounding offset
       v128_t m = wasm_f32x4_splat(mul);
       for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
       {
         v128_t t = wasm_v128_load(sp);
         v128_t s = wasm_f32x4_mul(t, m);
-        s = wasm_f32x4_add(s, shift); // + 0.5 and followed by floor next
-        wasm_v128_store(dp, wasm_i32x4_trunc_sat_f32x4(s));
+        // no extra + 0.5 here; ojph_convert_float_to_i32 rounds to nearest
+        wasm_v128_store(dp, ojph_convert_float_to_i32(s, zero, half));
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    v128_t ojph_wasm_i32x4_max_ge(v128_t a, v128_t b, v128_t x, v128_t y)
+    { // per-lane select: a where float x >= y, otherwise (incl. NaN) b
+      v128_t c = wasm_f32x4_ge(x, y);    // 0xFFFFFFFF for x >= y
+      v128_t d = wasm_v128_and(c, a);    // c & a: keep a, where x >= y
+      v128_t e = wasm_v128_andnot(b, c); // b & ~c: keep b, where !(x >= y)
+      return wasm_v128_or(d, e);         // combine
+    }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift,
-                                       ui32 width)
+    static inline
+    v128_t ojph_wasm_i32x4_min_lt(v128_t a, v128_t b, v128_t x, v128_t y)
     {
-      v128_t sh = wasm_i32x4_splat(shift);
-      for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+      v128_t c = wasm_f32x4_lt(x, y);    // 0xFFFFFFFF for float x < y
+      v128_t d = wasm_v128_and(c, a);    // c & a: keep a, where x < y
+      v128_t e = wasm_v128_andnot(b, c); // b & ~c: keep b, where !(x < y)
+      return wasm_v128_or(d, e);         // combine (NaN lanes select b)
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    template <bool NLT_TYPE3>
+    static inline
+    void local_wasm_irv_convert_to_integer(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER));
+
+      assert(bit_depth <= 32);
+      const float* sp = src_line->f32;
+      si32* dp = dst_line->i32 + dst_line_offset;
+      // There is the possibility that converting to integer will
+      // exceed the dynamic range of 32bit integer; therefore, care must be
+      // exercised.
+      // We look if the floating point number is outside the half-closed
+      // interval [-0.5f, 0.5f). If so, we limit the resulting integer
+      // to the maximum/minimum that number supports.
+      si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
+      v128_t mul = wasm_f32x4_splat((float)(1ull << bit_depth));
+      v128_t fl_up_lim = wasm_f32x4_splat(-(float)neg_limit); // val < upper
+      v128_t fl_low_lim = wasm_f32x4_splat((float)neg_limit); // val >= lower
+      v128_t s32_up_lim = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth));
+      v128_t s32_low_lim = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth));
+
+      if (is_signed)
       {
-        v128_t s = wasm_v128_load(sp);
-        s = wasm_i32x4_add(s, sh);
-        wasm_v128_store(dp, s);
+        const v128_t zero = wasm_f32x4_splat(0.0f);
+        const v128_t half = wasm_f32x4_splat(0.5f);
+        v128_t bias = wasm_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1));
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
+          v128_t t = wasm_v128_load(sp);
+          t = wasm_f32x4_mul(t, mul);
+          v128_t u = ojph_convert_float_to_i32(t, zero, half);
+          u = ojph_wasm_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
+          u = ojph_wasm_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
+          if (NLT_TYPE3)
+          {
+            v128_t c = wasm_i32x4_gt(zero, u);    // 0xFFFFFFFF for -ve value
+            v128_t neg = wasm_i32x4_sub(bias, u); // -bias -value
+            neg = wasm_v128_and(c, neg);          // keep only - bias - value
+            u = wasm_v128_andnot(u, c);           // keep only +ve or 0
+            u = wasm_v128_or(neg, u);             // combine
+          }
+          wasm_v128_store(dp, u);
+        }
       }
+      else
+      {
+        const v128_t zero = wasm_f32x4_splat(0.0f);
+        const v128_t half = wasm_f32x4_splat(0.5f);
+        v128_t ihalf = wasm_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
+          v128_t t = wasm_v128_load(sp);
+          t = wasm_f32x4_mul(t, mul);
+          v128_t u = ojph_convert_float_to_i32(t, zero, half);
+          u = ojph_wasm_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
+          u = ojph_wasm_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
+          u = wasm_i32x4_add(u, ihalf);
+          wasm_v128_store(dp, u);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_irv_convert_to_integer(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // plain variant: forwards to the shared kernel with NLT_TYPE3 = false
+      local_wasm_irv_convert_to_integer<false>(src_line, dst_line,
+        dst_line_offset, bit_depth, is_signed, width);
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b,
-                          si32 *y, si32 *cb, si32 *cr, ui32 repeat)
+    void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
     {
-      for (int i = (repeat + 3) >> 2; i > 0; --i)
+      local_wasm_irv_convert_to_integer<true>(src_line, dst_line, // NLT_TYPE3
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    template <bool NLT_TYPE3>
+    static inline
+    void local_wasm_irv_convert_to_float(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER) == 0);
+
+      assert(bit_depth <= 32);
+      v128_t mul = wasm_f32x4_splat((float)(1.0 / (double)(1ULL << bit_depth)));
+
+      const si32* sp = src_line->i32 + src_line_offset;
+      float* dp = dst_line->f32;
+      if (is_signed)
       {
-        v128_t mr = wasm_v128_load(r);
-        v128_t mg = wasm_v128_load(g);
-        v128_t mb = wasm_v128_load(b);
-        v128_t t = wasm_i32x4_add(mr, mb);
-        t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1));
-        wasm_v128_store(y, wasm_i32x4_shr(t, 2));
-        t = wasm_i32x4_sub(mb, mg);
-        wasm_v128_store(cb, t);
-        t = wasm_i32x4_sub(mr, mg);
-        wasm_v128_store(cr, t);
+        v128_t zero = wasm_i32x4_splat(0);
+        v128_t bias = wasm_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1));
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
+          v128_t t = wasm_v128_load(sp);
+          if (NLT_TYPE3)
+          {
+            v128_t c = wasm_i32x4_lt(t, zero);    // 0xFFFFFFFF for -ve value
+            v128_t neg = wasm_i32x4_sub(bias, t); // - bias - value
+            neg = wasm_v128_and(c, neg);          // keep only - bias - value
+            c = wasm_v128_andnot(t, c);           // keep only +ve or 0
+            t = wasm_v128_or(neg, c);             // combine
+          }
+          v128_t v = wasm_f32x4_convert_i32x4(t);
+          v = wasm_f32x4_mul(v, mul);
+          wasm_v128_store(dp, v);
+        }
+      }
+      else
+      {
+        v128_t half = wasm_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
+          v128_t t = wasm_v128_load(sp);
+          t = wasm_i32x4_sub(t, half);
+          v128_t v = wasm_f32x4_convert_i32x4(t);
+          v = wasm_f32x4_mul(v, mul);
+          wasm_v128_store(dp, v);
+        }
+      }
+    }
 
-        r += 4; g += 4; b += 4;
-        y += 4; cb += 4; cr += 4;
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_irv_convert_to_float(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      local_wasm_irv_convert_to_float<false>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      local_wasm_irv_convert_to_float<true>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_rct_forward(const line_buf *r,
+                          const line_buf *g,
+                          const line_buf *b,
+                          line_buf *y, line_buf *cb, line_buf *cr,
+                          ui32 repeat)
+    {
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) &&
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) &&
+             (b->flags  & line_buf::LFT_INTEGER));
+
+      if  (y->flags & line_buf::LFT_32BIT)
+      {
+        assert((y->flags  & line_buf::LFT_32BIT) &&
+               (cb->flags & line_buf::LFT_32BIT) &&
+               (cr->flags & line_buf::LFT_32BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32;
+        si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32;
+
+        for (int i = (repeat + 3) >> 2; i > 0; --i)
+        {
+          v128_t mr = wasm_v128_load(rp);
+          v128_t mg = wasm_v128_load(gp);
+          v128_t mb = wasm_v128_load(bp);
+          v128_t t = wasm_i32x4_add(mr, mb);
+          t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1));
+          wasm_v128_store(yp, wasm_i32x4_shr(t, 2));
+          t = wasm_i32x4_sub(mb, mg);
+          wasm_v128_store(cbp, t);
+          t = wasm_i32x4_sub(mr, mg);
+          wasm_v128_store(crp, t);
+
+            rp += 4; gp += 4; bp += 4;
+            yp += 4; cbp += 4; crp += 4;
+        }
+      }
+      else
+      {
+        assert((y->flags  & line_buf::LFT_64BIT) &&
+               (cb->flags & line_buf::LFT_64BIT) &&
+               (cr->flags & line_buf::LFT_64BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
+        for (int i = (repeat + 3) >> 2; i > 0; --i)
+        {
+          v128_t mr32 = wasm_v128_load(rp);
+          v128_t mg32 = wasm_v128_load(gp);
+          v128_t mb32 = wasm_v128_load(bp);
+          v128_t mr, mg, mb, t;
+          mr = wasm_i64x2_extend_low_i32x4(mr32);
+          mg = wasm_i64x2_extend_low_i32x4(mg32);
+          mb = wasm_i64x2_extend_low_i32x4(mb32);
+
+          t = wasm_i64x2_add(mr, mb);
+          t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1));
+          wasm_v128_store(yp, wasm_i64x2_shr(t, 2));
+          t = wasm_i64x2_sub(mb, mg);
+          wasm_v128_store(cbp, t);
+          t = wasm_i64x2_sub(mr, mg);
+          wasm_v128_store(crp, t);
+
+          yp += 2; cbp += 2; crp += 2;
+
+          mr = wasm_i64x2_extend_high_i32x4(mr32);
+          mg = wasm_i64x2_extend_high_i32x4(mg32);
+          mb = wasm_i64x2_extend_high_i32x4(mb32);
+
+          t = wasm_i64x2_add(mr, mb);
+          t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1));
+          wasm_v128_store(yp, wasm_i64x2_shr(t, 2));
+          t = wasm_i64x2_sub(mb, mg);
+          wasm_v128_store(cbp, t);
+          t = wasm_i64x2_sub(mr, mg);
+          wasm_v128_store(crp, t);
+
+          rp += 4; gp += 4; bp += 4;
+          yp += 2; cbp += 2; crp += 2;
+        }
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rct_backward(const si32 *y, const si32 *cb, const si32 *cr,
-                           si32 *r, si32 *g, si32 *b, ui32 repeat)
+    void wasm_rct_backward(const line_buf *y,
+                           const line_buf *cb,
+                           const line_buf *cr,
+                           line_buf *r, line_buf *g, line_buf *b,
+                           ui32 repeat)
     {
-      for (int i = (repeat + 3) >> 2; i > 0; --i)
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) &&
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) &&
+             (b->flags  & line_buf::LFT_INTEGER));
+
+      if (y->flags & line_buf::LFT_32BIT)
       {
-        v128_t my  = wasm_v128_load(y);
-        v128_t mcb = wasm_v128_load(cb);
-        v128_t mcr = wasm_v128_load(cr);
+        assert((y->flags  & line_buf::LFT_32BIT) &&
+               (cb->flags & line_buf::LFT_32BIT) &&
+               (cr->flags & line_buf::LFT_32BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
+        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        for (int i = (repeat + 3) >> 2; i > 0; --i)
+        {
+          v128_t my  = wasm_v128_load(yp);
+          v128_t mcb = wasm_v128_load(cbp);
+          v128_t mcr = wasm_v128_load(crp);
 
-        v128_t t = wasm_i32x4_add(mcb, mcr);
-        t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2));
-        wasm_v128_store(g, t);
-        v128_t u = wasm_i32x4_add(mcb, t);
-        wasm_v128_store(b, u);
-        u = wasm_i32x4_add(mcr, t);
-        wasm_v128_store(r, u);
+          v128_t t = wasm_i32x4_add(mcb, mcr);
+          t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2));
+          wasm_v128_store(gp, t);
+          v128_t u = wasm_i32x4_add(mcb, t);
+          wasm_v128_store(bp, u);
+          u = wasm_i32x4_add(mcr, t);
+          wasm_v128_store(rp, u);
 
-        y += 4; cb += 4; cr += 4;
-        r += 4; g += 4; b += 4;
+          yp += 4; cbp += 4; crp += 4;
+          rp += 4; gp += 4; bp += 4;
+        }
+      }
+      else
+      {
+        assert((y->flags  & line_buf::LFT_64BIT) &&
+               (cb->flags & line_buf::LFT_64BIT) &&
+               (cr->flags & line_buf::LFT_64BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
+        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        for (int i = (repeat + 3) >> 2; i > 0; --i)
+        {
+          v128_t my, mcb, mcr, tr0, tg0, tb0, tr1, tg1, tb1;
+          my  = wasm_v128_load(yp);
+          mcb = wasm_v128_load(cbp);
+          mcr = wasm_v128_load(crp);
+
+          tg0 = wasm_i64x2_add(mcb, mcr);
+          tg0 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg0, 2));
+          tb0 = wasm_i64x2_add(mcb, tg0);
+          tr0 = wasm_i64x2_add(mcr, tg0);
+
+          yp += 2; cbp += 2; crp += 2;
+
+          my  = wasm_v128_load(yp);
+          mcb = wasm_v128_load(cbp);
+          mcr = wasm_v128_load(crp);
+
+          tg1 = wasm_i64x2_add(mcb, mcr);
+          tg1 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg1, 2));
+          tb1 = wasm_i64x2_add(mcb, tg1);
+          tr1 = wasm_i64x2_add(mcr, tg1);
+
+          tr0 = wasm_i32x4_shuffle(tr0, tr1, 0, 2, 4 + 0, 4 + 2);
+          tg0 = wasm_i32x4_shuffle(tg0, tg1, 0, 2, 4 + 0, 4 + 2);
+          tb0 = wasm_i32x4_shuffle(tb0, tb1, 0, 2, 4 + 0, 4 + 2);
+
+          wasm_v128_store(rp, tr0);
+          wasm_v128_store(gp, tg0);
+          wasm_v128_store(bp, tb0);
+
+          yp += 2; cbp += 2; crp += 2;
+          rp += 4; gp += 4; bp += 4;
+        }
       }
     }
 
@@ -186,7 +645,7 @@ namespace ojph {
         wasm_v128_store(y, my);
         wasm_v128_store(cb, wasm_f32x4_mul(beta_cbf, wasm_f32x4_sub(mb, my)));
         wasm_v128_store(cr, wasm_f32x4_mul(beta_crf, wasm_f32x4_sub(mr, my)));
-        
+
         r += 4; g += 4; b += 4;
         y += 4; cb += 4; cr += 4;
       }
diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp
index b6919032..f67ea1b6 100644
--- a/src/core/transform/ojph_transform.cpp
+++ b/src/core/transform/ojph_transform.cpp
@@ -36,14 +36,19 @@
 //***************************************************************************/
 
 #include <cstdio>
+#include <mutex>
 
 #include "ojph_arch.h"
 #include "ojph_mem.h"
 #include "ojph_transform.h"
 #include "ojph_transform_local.h"
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
 
 namespace ojph {
-  struct line_buf;
+
+  // defined elsewhere
+  class line_buf;
 
   namespace local {
 
@@ -52,450 +57,780 @@ namespace ojph {
     /////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void (*rev_vert_wvlt_fwd_predict)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat) = NULL;
-
-    /////////////////////////////////////////////////////////////////////////
-    void (*rev_vert_wvlt_fwd_update)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat) = NULL;
+    void (*rev_vert_step)
+      (const lifting_step* s, const line_buf* sig, const line_buf* other,
+        const line_buf* aug, ui32 repeat, bool synthesis) = NULL;
 
     /////////////////////////////////////////////////////////////////////////
-    void (*rev_horz_wvlt_fwd_tx)
-      (line_buf* src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
-      = NULL;
+    void (*rev_horz_ana)
+      (const param_atk* atk, const line_buf* ldst, const line_buf* hdst,
+        const line_buf* src, ui32 width, bool even) = NULL;
 
     /////////////////////////////////////////////////////////////////////////
-    void (*rev_vert_wvlt_bwd_predict)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat) = NULL;
-
-    /////////////////////////////////////////////////////////////////////////
-    void (*rev_vert_wvlt_bwd_update)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat) = NULL;
-
-    /////////////////////////////////////////////////////////////////////////
-    void (*rev_horz_wvlt_bwd_tx)
-      (line_buf* dst, line_buf *lsrc, line_buf *hsrc, ui32 width, bool even)
-      = NULL;
-
+    void (*rev_horz_syn)
+      (const param_atk* atk, const line_buf* dst, const line_buf* lsrc,
+        const line_buf* hsrc, ui32 width, bool even) = NULL;
+    
     /////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     /////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void (*irrev_vert_wvlt_step)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       int step_num, ui32 repeat) = NULL;
+    void (*irv_vert_step)
+      (const lifting_step* s, const line_buf* sig, const line_buf* other,
+        const line_buf* aug, ui32 repeat, bool synthesis) = NULL;
 
     /////////////////////////////////////////////////////////////////////////
-    void (*irrev_vert_wvlt_K)
-      (const line_buf *src, line_buf *dst, bool L_analysis_or_H_synthesis,
-       ui32 repeat) = NULL;
+    void (*irv_vert_times_K)
+      (float K, const line_buf* aug, ui32 repeat) = NULL;
 
     /////////////////////////////////////////////////////////////////////////
-    void (*irrev_horz_wvlt_fwd_tx)
-      (line_buf* src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
-      = NULL;
+    void (*irv_horz_ana)
+      (const param_atk* atk, const line_buf* ldst, const line_buf* hdst,
+        const line_buf* src, ui32 width, bool even) = NULL;
 
     /////////////////////////////////////////////////////////////////////////
-    void (*irrev_horz_wvlt_bwd_tx)
-      (line_buf* src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
-      = NULL;
-
-    ////////////////////////////////////////////////////////////////////////////
-    static bool wavelet_transform_functions_initialized = false;
+    void (*irv_horz_syn)
+      (const param_atk* atk, const line_buf* dst, const line_buf* lsrc,
+        const line_buf* hsrc, ui32 width, bool even) = NULL;
 
     //////////////////////////////////////////////////////////////////////////
     void init_wavelet_transform_functions()
     {
-      if (wavelet_transform_functions_initialized)
-        return;
-
+      static std::once_flag wavelet_transform_functions_init_flag;
+      std::call_once(wavelet_transform_functions_init_flag, [](){
 #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
 
-      rev_vert_wvlt_fwd_predict = gen_rev_vert_wvlt_fwd_predict;
-      rev_vert_wvlt_fwd_update  = gen_rev_vert_wvlt_fwd_update;
-      rev_horz_wvlt_fwd_tx      = gen_rev_horz_wvlt_fwd_tx;
-      rev_vert_wvlt_bwd_predict = gen_rev_vert_wvlt_bwd_predict;
-      rev_vert_wvlt_bwd_update  = gen_rev_vert_wvlt_bwd_update;
-      rev_horz_wvlt_bwd_tx      = gen_rev_horz_wvlt_bwd_tx;
-      irrev_vert_wvlt_step      = gen_irrev_vert_wvlt_step;
-      irrev_vert_wvlt_K         = gen_irrev_vert_wvlt_K;
-      irrev_horz_wvlt_fwd_tx    = gen_irrev_horz_wvlt_fwd_tx;
-      irrev_horz_wvlt_bwd_tx    = gen_irrev_horz_wvlt_bwd_tx;
-
-#ifndef OJPH_DISABLE_INTEL_SIMD
-      int level = get_cpu_ext_level();
-
-      if (level >= X86_CPU_EXT_LEVEL_SSE)
-      {
-        irrev_vert_wvlt_step    = sse_irrev_vert_wvlt_step;
-        irrev_vert_wvlt_K       = sse_irrev_vert_wvlt_K;
-        irrev_horz_wvlt_fwd_tx  = sse_irrev_horz_wvlt_fwd_tx;
-        irrev_horz_wvlt_bwd_tx  = sse_irrev_horz_wvlt_bwd_tx;
-      }
+        rev_vert_step             = gen_rev_vert_step;
+        rev_horz_ana              = gen_rev_horz_ana;
+        rev_horz_syn              = gen_rev_horz_syn;
 
-      if (level >= X86_CPU_EXT_LEVEL_SSE2)
-      {
-        rev_vert_wvlt_fwd_predict = sse2_rev_vert_wvlt_fwd_predict;
-        rev_vert_wvlt_fwd_update  = sse2_rev_vert_wvlt_fwd_update;
-        rev_horz_wvlt_fwd_tx      = sse2_rev_horz_wvlt_fwd_tx;
-        rev_vert_wvlt_bwd_predict = sse2_rev_vert_wvlt_bwd_predict;
-        rev_vert_wvlt_bwd_update  = sse2_rev_vert_wvlt_bwd_update;
-        rev_horz_wvlt_bwd_tx      = sse2_rev_horz_wvlt_bwd_tx;
-      }
+        irv_vert_step             = gen_irv_vert_step;
+        irv_vert_times_K          = gen_irv_vert_times_K;
+        irv_horz_ana              = gen_irv_horz_ana;
+        irv_horz_syn              = gen_irv_horz_syn;
 
-      if (level >= X86_CPU_EXT_LEVEL_AVX)
-      {
-        irrev_vert_wvlt_step   = avx_irrev_vert_wvlt_step;
-        irrev_vert_wvlt_K      = avx_irrev_vert_wvlt_K;
-        irrev_horz_wvlt_fwd_tx = avx_irrev_horz_wvlt_fwd_tx;
-        irrev_horz_wvlt_bwd_tx = avx_irrev_horz_wvlt_bwd_tx;
-      }
+  #ifndef OJPH_DISABLE_SIMD
 
-      if (level >= X86_CPU_EXT_LEVEL_AVX2)
-      {
-        rev_vert_wvlt_fwd_predict = avx2_rev_vert_wvlt_fwd_predict;
-        rev_vert_wvlt_fwd_update  = avx2_rev_vert_wvlt_fwd_update;
-        rev_horz_wvlt_fwd_tx      = avx2_rev_horz_wvlt_fwd_tx;
-        rev_vert_wvlt_bwd_predict = avx2_rev_vert_wvlt_bwd_predict;
-        rev_vert_wvlt_bwd_update  = avx2_rev_vert_wvlt_bwd_update;
-        rev_horz_wvlt_bwd_tx      = avx2_rev_horz_wvlt_bwd_tx;
-      }
-#endif // !OJPH_DISABLE_INTEL_SIMD
+    #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
+
+      #ifndef OJPH_DISABLE_SSE
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE)
+        {
+          irv_vert_step             = sse_irv_vert_step;
+          irv_vert_times_K          = sse_irv_vert_times_K;
+          irv_horz_ana              = sse_irv_horz_ana;
+          irv_horz_syn              = sse_irv_horz_syn;
+        }
+      #endif // !OJPH_DISABLE_SSE
+
+      #ifndef OJPH_DISABLE_SSE2
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2)
+        {
+          rev_vert_step             = sse2_rev_vert_step;
+          rev_horz_ana              = sse2_rev_horz_ana;
+          rev_horz_syn              = sse2_rev_horz_syn;
+        }
+      #endif // !OJPH_DISABLE_SSE2
+
+      #ifndef OJPH_DISABLE_AVX
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX)
+        {
+          irv_vert_step             = avx_irv_vert_step;
+          irv_vert_times_K          = avx_irv_vert_times_K;
+          irv_horz_ana              = avx_irv_horz_ana;      
+          irv_horz_syn              = avx_irv_horz_syn;
+        }
+      #endif // !OJPH_DISABLE_AVX
+
+      #ifndef OJPH_DISABLE_AVX2
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2)
+        {
+          rev_vert_step             = avx2_rev_vert_step;
+          rev_horz_ana              = avx2_rev_horz_ana;
+          rev_horz_syn              = avx2_rev_horz_syn;
+        }
+      #endif // !OJPH_DISABLE_AVX2
+
+      #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512))
+        if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512)
+        {
+          // rev_vert_step             = avx512_rev_vert_step;
+          // rev_horz_ana              = avx512_rev_horz_ana;
+          // rev_horz_syn              = avx512_rev_horz_syn;
+
+          irv_vert_step             = avx512_irv_vert_step;
+          irv_vert_times_K          = avx512_irv_vert_times_K;
+          irv_horz_ana              = avx512_irv_horz_ana;
+          irv_horz_syn              = avx512_irv_horz_syn;
+        }
+      #endif // !OJPH_DISABLE_AVX512
+    
+    #elif defined(OJPH_ARCH_ARM)
+
+    #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
+
+  #endif // !OJPH_DISABLE_SIMD
 
 #else // OJPH_ENABLE_WASM_SIMD
-      rev_vert_wvlt_fwd_predict = wasm_rev_vert_wvlt_fwd_predict;
-      rev_vert_wvlt_fwd_update  = wasm_rev_vert_wvlt_fwd_update;
-      rev_horz_wvlt_fwd_tx      = wasm_rev_horz_wvlt_fwd_tx;
-      rev_vert_wvlt_bwd_predict = wasm_rev_vert_wvlt_bwd_predict;
-      rev_vert_wvlt_bwd_update  = wasm_rev_vert_wvlt_bwd_update;
-      rev_horz_wvlt_bwd_tx      = wasm_rev_horz_wvlt_bwd_tx;
-      irrev_vert_wvlt_step      = wasm_irrev_vert_wvlt_step;
-      irrev_vert_wvlt_K         = wasm_irrev_vert_wvlt_K;
-      irrev_horz_wvlt_fwd_tx    = wasm_irrev_horz_wvlt_fwd_tx;
-      irrev_horz_wvlt_bwd_tx    = wasm_irrev_horz_wvlt_bwd_tx;
+        rev_vert_step             = wasm_rev_vert_step;
+        rev_horz_ana              = wasm_rev_horz_ana;
+        rev_horz_syn              = wasm_rev_horz_syn;
+        
+        irv_vert_step             = wasm_irv_vert_step;
+        irv_vert_times_K          = wasm_irv_vert_times_K;
+        irv_horz_ana              = wasm_irv_horz_ana;
+        irv_horz_syn              = wasm_irv_horz_syn;
 #endif // !OJPH_ENABLE_WASM_SIMD
-
-      wavelet_transform_functions_initialized = true;
+      });
     }
     
     //////////////////////////////////////////////////////////////////////////
-    const float LIFTING_FACTORS::steps[8] =
-    {
-      -1.586134342059924f, -0.052980118572961f, +0.882911075530934f,
-      +0.443506852043971f,
-      +1.586134342059924f, +0.052980118572961f, -0.882911075530934f,
-      -0.443506852043971f
-    };
-    const float LIFTING_FACTORS::K = 1.230174104914001f;
-    const float LIFTING_FACTORS::K_inv  = (float)(1.0 / 1.230174104914001);
-
-    //////////////////////////////////////////////////////////////////////////
 
 #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_fwd_predict(const line_buf* line_src1,
-                                       const line_buf* line_src2,
-                                       line_buf *line_dst, ui32 repeat)
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void gen_rev_vert_step32(const lifting_step* s, const line_buf* sig, 
+                             const line_buf* other, const line_buf* aug, 
+                             ui32 repeat, bool synthesis)
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-      for (ui32 i = repeat; i > 0; --i)
-        *dst++ -= (*src1++ + *src2++) >> 1;
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+
+      si32* dst = aug->i32;
+      const si32* src1 = sig->i32, * src2 = other->i32;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different to Part 1, although they are mathematically equivalent;
+      // here, we identify the simpler forms from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + *src1++ + *src2++) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + *src1++ + *src2++) >> e;
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (*src1++ + *src2++) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (*src1++ + *src2++) >> e;
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b - (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b - (*src1++ + *src2++)) >> e;
+      }
+      else { // general case
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
+      }
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_fwd_update(const line_buf* line_src1,
-                                      const line_buf* line_src2,
-                                      line_buf *line_dst, ui32 repeat)
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void gen_rev_vert_step64(const lifting_step* s, const line_buf* sig, 
+                             const line_buf* other, const line_buf* aug, 
+                             ui32 repeat, bool synthesis)
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-      for (ui32 i = repeat; i > 0; --i)
-        *dst++ += (*src1++ + *src2++ + 2) >> 2;
+      const si64 a = s->rev.Aatk;
+      const si64 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different to Part 1, although they are mathematically equivalent;
+      // here, we identify the simpler forms from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + *src1++ + *src2++) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + *src1++ + *src2++) >> e;
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (*src1++ + *src2++) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (*src1++ + *src2++) >> e;
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b - (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b - (*src1++ + *src2++)) >> e;
+      }
+      else { // general case
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
+      }
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst,
-                                  line_buf *line_hdst, ui32 width, bool even)
+    /////////////////////////////////////////////////////////////////////////
+    void gen_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis)
+    {
+      if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || 
+          ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
+          ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) 
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_32BIT) && 
+               (aug == NULL || aug->flags & line_buf::LFT_32BIT));
+        gen_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
+      }
+      else 
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_64BIT) && 
+               (aug == NULL || aug->flags & line_buf::LFT_64BIT));
+        gen_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void gen_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, 
+                            const line_buf* hdst, const line_buf* src, 
+                            ui32 width, bool even)
     {
       if (width > 1)
       {
-        si32 *src = line_src->i32;
-        si32 *ldst = line_ldst->i32, *hdst = line_hdst->i32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        // extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        const si32* sp = src + (even ? 1 : 0);
-        si32 *dph = hdst;
-        for (ui32 i = H_width; i > 0; --i, sp+=2)
-          *dph++ = sp[0] - ((sp[-1] + sp[1]) >> 1);
-
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        sp = src + (even ? 0 : 1);
-        const si32* sph = hdst + (even ? 0 : 1);
-        si32 *dpl = ldst;
-        for (ui32 i = L_width; i > 0; --i, sp+=2, sph++)
-          *dpl++ = *sp + ((2 + sph[-1] + sph[0]) >> 2);
+        // deinterleave src into ldst and hdst
+        si32* dph = hdst->i32;
+        si32* dpl = ldst->i32;
+        si32* sp = src->i32;
+        ui32 w = width;
+        if (!even)
+        {
+          *dph++ = *sp++; --w;
+        }
+        for (; w > 1; w -= 2)
+        {
+          *dpl++ = *sp++; *dph++ = *sp++;
+        }
+        if (w)
+        {
+          *dpl++ = *sp++; --w;
+        }
+
+        si32* hp = hdst->i32, * lp = ldst->i32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // lifting step j - 1; the loop visits steps from last to first
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si32* sp = lp + (even ? 1 : 0);
+          si32* dp = hp;
+          if (a == 1) 
+          { // 5/3 update and any case with a == 1
+            for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+              *dp += (b + (sp[-1] + sp[0])) >> e;
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+              *dp -= (sp[-1] + sp[0]) >> e;
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+              *dp += (b - (sp[-1] + sp[0])) >> e;
+          }
+          else {
+            // general case
+            for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+              *dp += (b + a * (sp[-1] + sp[0])) >> e;
+          }
+
+          // swap buffers
+          si32* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+      }
+      else {
+        if (even)
+          ldst->i32[0] = src->i32[0];
+        else
+          hdst->i32[0] = src->i32[0] << 1;
       }
-      else
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void gen_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, 
+                            const line_buf* hdst, const line_buf* src, 
+                            ui32 width, bool even)
+    {
+      if (width > 1)
       {
+        // deinterleave src into ldst and hdst
+        si64* dph = hdst->i64;
+        si64* dpl = ldst->i64;
+        si64* sp = src->i64;
+        ui32 w = width;
+        if (!even)
+        {
+          *dph++ = *sp++; --w;
+        }
+        for (; w > 1; w -= 2)
+        {
+          *dpl++ = *sp++; *dph++ = *sp++;
+        }
+        if (w)
+        {
+          *dpl++ = *sp++; --w;
+        }
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // lifting step j - 1; the loop visits steps from last to first
+          const lifting_step* s = atk->get_step(j - 1);
+          const si64 a = s->rev.Aatk;
+          const si64 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si64* sp = lp + (even ? 1 : 0);
+          si64* dp = hp;
+          if (a == 1) 
+          { // 5/3 update and any case with a == 1
+            for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+              *dp += (b + (sp[-1] + sp[0])) >> e;
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+              *dp -= (sp[-1] + sp[0]) >> e;
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+              *dp += (b - (sp[-1] + sp[0])) >> e;
+          }
+          else {
+            // general case
+            for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+              *dp += (b + a * (sp[-1] + sp[0])) >> e;
+          }
+
+          // swap buffers
+          si64* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+      }
+      else {
         if (even)
-          line_ldst->i32[0] = line_src->i32[0];
+          ldst->i64[0] = src->i64[0];
         else
-          line_hdst->i32[0] = line_src->i32[0] << 1;
+          hdst->i64[0] = src->i64[0] << 1;
       }
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_bwd_predict(const line_buf* line_src1,
-                                       const line_buf* line_src2,
-                                       line_buf *line_dst, ui32 repeat)
+    /////////////////////////////////////////////////////////////////////////
+    void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even)
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-      for (ui32 i = repeat; i > 0; --i)
-        *dst++ += (*src1++ + *src2++) >> 1;
+      if (src->flags & line_buf::LFT_32BIT) 
+      {
+        assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) &&
+               (hdst == NULL || hdst->flags & line_buf::LFT_32BIT));
+        gen_rev_horz_ana32(atk, ldst, hdst, src, width, even);
+      }
+      else 
+      {
+        assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) &&
+               (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && 
+               (src == NULL || src->flags & line_buf::LFT_64BIT));
+        gen_rev_horz_ana64(atk, ldst, hdst, src, width, even);
+      }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_bwd_update(const line_buf* line_src1,
-                                      const line_buf* line_src2,
-                                      line_buf *line_dst, ui32 repeat)
+    static
+    void gen_rev_horz_syn32(const param_atk* atk, const line_buf* dst, 
+                            const line_buf* lsrc, const line_buf* hsrc, 
+                            ui32 width, bool even)
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-      for (ui32 i = repeat; i > 0; --i)
-        *dst++ -= (2 + *src1++ + *src2++) >> 2;
+      if (width > 1)
+      {
+        bool ev = even;
+        si32* oth = hsrc->i32, * aug = lsrc->i32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const si32* sp = oth + (ev ? 0 : 1);
+          si32* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b + (sp[-1] + sp[0])) >> e;
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp += (sp[-1] + sp[0]) >> e;
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b - (sp[-1] + sp[0])) >> e;
+          }
+          else {
+            // general case
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b + a * (sp[-1] + sp[0])) >> e;
+          }
+
+          // swap buffers
+          si32* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        si32* sph = hsrc->i32;
+        si32* spl = lsrc->i32;
+        si32* dp = dst->i32;
+        ui32 w = width;
+        if (!even)
+        {
+          *dp++ = *sph++; --w;
+        }
+        for (; w > 1; w -= 2)
+        {
+          *dp++ = *spl++; *dp++ = *sph++;
+        }
+        if (w)
+        {
+          *dp++ = *spl++; --w;
+        }
+      }
+      else {
+        if (even)
+          dst->i32[0] = lsrc->i32[0];
+        else
+          dst->i32[0] = hsrc->i32[0] >> 1;
+      }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
-                                  line_buf *line_hsrc, ui32 width, bool even)
+    static
+    void gen_rev_horz_syn64(const param_atk* atk, const line_buf* dst, 
+                            const line_buf* lsrc, const line_buf* hsrc, 
+                            ui32 width, bool even)
     {
       if (width > 1)
       {
-        si32 *lsrc = line_lsrc->i32, *hsrc = line_hsrc->i32;
-        si32 *dst = line_dst->i32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        // extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        const si32 *sph = hsrc + (even ? 0 : 1);
-        si32 *spl = lsrc;
-        for (ui32 i = L_width; i > 0; --i, sph++, spl++)
-          *spl -= ((2 + sph[-1] + sph[0]) >> 2);
-
-        // extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width - 1];
-        // inverse predict and combine
-        si32 *dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        for (ui32 i = L_width + (even ? 0 : 1); i > 0; --i, spl++, sph++)
+        bool ev = even;
+        si64* oth = hsrc->i64, * aug = lsrc->i64;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          const lifting_step* s = atk->get_step(j);
+          const si64 a = s->rev.Aatk;
+          const si64 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const si64* sp = oth + (ev ? 0 : 1);
+          si64* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b + (sp[-1] + sp[0])) >> e;
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp += (sp[-1] + sp[0]) >> e;
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b - (sp[-1] + sp[0])) >> e;
+          }
+          else {
+            // general case
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b + a * (sp[-1] + sp[0])) >> e;
+          }
+
+          // swap buffers
+          si64* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        si64* sph = hsrc->i64;
+        si64* spl = lsrc->i64;
+        si64* dp = dst->i64;
+        ui32 w = width;
+        if (!even)
+        {
+          *dp++ = *sph++; --w;
+        }
+        for (; w > 1; w -= 2)
+        {
+          *dp++ = *spl++; *dp++ = *sph++;
+        }
+        if (w)
         {
-          *dp++ = *spl;
-          *dp++ = *sph + ((spl[0] + spl[1]) >> 1);
+          *dp++ = *spl++; --w;
         }
       }
-      else
-      {
+      else {
         if (even)
-          line_dst->i32[0] = line_lsrc->i32[0];
+          dst->i64[0] = lsrc->i64[0];
         else
-          line_dst->i32[0] = line_hsrc->i32[0] >> 1;
+          dst->i64[0] = hsrc->i64[0] >> 1;
       }
     }
 
+    /////////////////////////////////////////////////////////////////////////
+    void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
+                          const line_buf* lsrc, const line_buf* hsrc, 
+                          ui32 width, bool even)
+    { // dispatcher: forwards to the 32-bit or 64-bit worker based on dst->flags
+      if (dst->flags & line_buf::LFT_32BIT) // dst is dereferenced unconditionally
+      {
+        assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && // sources may be NULL;
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));  // if given, must be 32-bit
+        gen_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
+      }
+      else // dst is not flagged LFT_32BIT
+      {
+        assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && // NOTE(review): dst->flags was already read above, so the NULL test cannot protect it
+               (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && 
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
+        gen_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
+      }
+    }    
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_irrev_vert_wvlt_step(const line_buf* line_src1,
-                                  const line_buf* line_src2,
-                                  line_buf *line_dst,
-                                  int step_num, ui32 repeat)
+    void gen_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis)
     {
-      float *dst = line_dst->f32;
-      const float *src1 = line_src1->f32, *src2 = line_src2->f32;
-      float factor = LIFTING_FACTORS::steps[step_num];
+      float a = s->irv.Aatk; // coefficient of this lifting step
+      // synthesis applies the step with the negated coefficient
+      if (synthesis)
+        a = -a;
+      // aug[i] += a * (sig[i] + other[i]); aug's samples are written although the line_buf is const
+      float* dst = aug->f32;
+      const float* src1 = sig->f32, * src2 = other->f32;
       for (ui32 i = repeat; i > 0; --i)
-        *dst++ += factor * (*src1++ + *src2++);
+        *dst++ += a * (*src1++ + *src2++); // exactly `repeat` floats are updated
     }
 
-    /////////////////////////////////////////////////////////////////////////
-    void gen_irrev_vert_wvlt_K(const line_buf* line_src,
-                               line_buf* line_dst,
-                               bool L_analysis_or_H_synthesis, ui32 repeat)
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat) // scales aug->f32 in place
     {
-      float *dst = line_dst->f32;
-      const float *src = line_src->f32;
-      float factor = LIFTING_FACTORS::K_inv;
-      factor = L_analysis_or_H_synthesis ? factor : LIFTING_FACTORS::K;
+      float* dst = aug->f32; // the pointed-to samples are modified even though the line_buf is const
       for (ui32 i = repeat; i > 0; --i)
-        *dst++ = *src++ * factor;
+        *dst++ *= K; // the caller chooses the factor; the first `repeat` floats are scaled
     }
 
-
     /////////////////////////////////////////////////////////////////////////
-    void gen_irrev_horz_wvlt_fwd_tx(line_buf* line_src,
-                                    line_buf *line_ldst,
-                                    line_buf *line_hdst,
-                                    ui32 width, bool even)
+    void gen_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even) // even: src[0] goes to ldst; otherwise to hdst
     {
       if (width > 1)
       {
-        float *src = line_src->f32;
-        float *ldst = line_ldst->f32, *hdst = line_hdst->f32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        //extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        float factor = LIFTING_FACTORS::steps[0];
-        const float* sp = src + (even ? 1 : 0);
-        float *dph = hdst;
-        for (ui32 i = H_width; i > 0; --i, sp+=2)
-          *dph++ = sp[0] + factor * (sp[-1] + sp[1]);
-
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        factor = LIFTING_FACTORS::steps[1];
-        sp = src + (even ? 0 : 1);
-        const float* sph = hdst + (even ? 0 : 1);
-        float *dpl = ldst;
-        for (ui32 i = L_width; i > 0; --i, sp+=2, sph++)
-          *dpl++ = sp[0] + factor * (sph[-1] + sph[0]);
-
-        //extension
-        ldst[-1] = ldst[0];
-        ldst[L_width] = ldst[L_width-1];
-        //predict
-        factor = LIFTING_FACTORS::steps[2];
-        const float* spl = ldst + (even ? 1 : 0);
-        dph = hdst;
-        for (ui32 i = H_width; i > 0; --i, spl++)
-          *dph++ += factor * (spl[-1] + spl[0]);
-
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        factor = LIFTING_FACTORS::steps[3];
-        sph = hdst + (even ? 0 : 1);
-        dpl = ldst;
-        for (ui32 i = L_width; i > 0; --i, sph++)
-          *dpl++ += factor * (sph[-1] + sph[0]);
-
-        //multipliers
-        float *dp = ldst;
-        for (ui32 i = L_width; i > 0; --i, dp++)
-          *dp *= LIFTING_FACTORS::K_inv;
-        dp = hdst;
-        for (ui32 i = H_width; i > 0; --i, dp++)
-          *dp *= LIFTING_FACTORS::K;
+        // split src into ldst and hdst by de-interleaving alternating samples
+        float* dph = hdst->f32;
+        float* dpl = ldst->f32;
+        float* sp = src->f32;
+        ui32 w = width;
+        if (!even) // odd start: the first sample belongs to the high band
+        {
+          *dph++ = *sp++; --w;
+        }
+        for (; w > 1; w -= 2) // remaining samples alternate low, high
+        {
+          *dpl++ = *sp++; *dph++ = *sp++;
+        }
+        if (w) // one sample left over: it goes to the low band
+        {
+          *dpl++ = *sp++; --w;
+        }
+        // lifting steps are applied in place, from the last ATK step down to the first
+        float* hp = hdst->f32, * lp = ldst->f32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          const lifting_step* s = atk->get_step(j - 1);
+          const float a = s->irv.Aatk;
+
+          // extension: copy the edge samples of lp one slot past each end
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step: even -> hp[i] += a*(lp[i]+lp[i+1]); odd -> hp[i] += a*(lp[i-1]+lp[i])
+          const float* sp = lp + (even ? 1 : 0);
+          float* dp = hp;
+          for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+            *dp += a * (sp[-1] + sp[0]);
+
+          // swap buffers: lp/hp, their widths and the parity trade places for the next step
+          float* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+        // final scaling uses lp/hp as left by the last swap: lp by 1/K, hp by K
+        {
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          float* dp;
+
+          dp = lp;
+          for (ui32 i = l_width; i > 0; --i)
+            *dp++ *= K_inv;
+
+          dp = hp;
+          for (ui32 i = h_width; i > 0; --i)
+            *dp++ *= K;
+        }
       }
-      else
-      {
+      else { // width <= 1: no lifting, only src->f32[0] is read
         if (even)
-          line_ldst->f32[0] = line_src->f32[0];
+          ldst->f32[0] = src->f32[0]; // copied unchanged into the low band
         else
-          line_hdst->f32[0] = line_src->f32[0] + line_src->f32[0];
+          hdst->f32[0] = src->f32[0] * 2.0f; // doubled into the high band
       }
     }
-
-    /////////////////////////////////////////////////////////////////////////
-    void gen_irrev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
-                                    line_buf *line_hsrc, ui32 width,
-                                    bool even)
+    
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_horz_syn(const param_atk* atk, const line_buf* dst, 
+                          const line_buf* lsrc, const line_buf* hsrc, 
+                          ui32 width, bool even) // even: dst[0] comes from lsrc; otherwise from hsrc
     {
       if (width > 1)
       {
-        float *lsrc = line_lsrc->f32, *hsrc = line_hsrc->f32;
-        float *dst = line_dst->f32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        //multipliers
-        float *dp = lsrc;
-        for (ui32 i = L_width; i > 0; --i, dp++)
-          *dp *= LIFTING_FACTORS::K;
-        dp = hsrc;
-        for (ui32 i = H_width; i > 0; --i, dp++)
-          *dp *= LIFTING_FACTORS::K_inv;
-
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        float factor = LIFTING_FACTORS::steps[7];
-        const float *sph = hsrc + (even ? 0 : 1);
-        float *dpl = lsrc;
-        for (ui32 i = L_width; i > 0; --i, dpl++, sph++)
-          *dpl += factor * (sph[-1] + sph[0]);
-
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict
-        factor = LIFTING_FACTORS::steps[6];
-        const float *spl = lsrc + (even ? 0 : -1);
-        float *dph = hsrc;
-        for (ui32 i = H_width; i > 0; --i, dph++, spl++)
-          *dph += factor * (spl[0] + spl[1]);
-
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        factor = LIFTING_FACTORS::steps[5];
-        sph = hsrc + (even ? 0 : 1);
-        dpl = lsrc;
-        for (ui32 i = L_width; i > 0; --i, dpl++, sph++)
-          *dpl += factor * (sph[-1] + sph[0]);
-
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict and combine
-        factor = LIFTING_FACTORS::steps[4];
-        dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        for (ui32 i = L_width+(even?0:1); i > 0; --i, spl++, sph++)
+        bool ev = even; // ev flips after every step; even is kept for the final interleave
+        float* oth = hsrc->f32, * aug = lsrc->f32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        // scaling comes first: the low band (lsrc) is multiplied by K, the high band (hsrc) by 1/K
+        {
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          float* dp;
+
+          dp = aug;
+          for (ui32 i = aug_width; i > 0; --i)
+            *dp++ *= K;
+
+          dp = oth;
+          for (ui32 i = oth_width; i > 0; --i)
+            *dp++ *= K_inv;
+        }
+        // lifting steps are applied in place in ascending order, each one subtracted
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
         {
-          *dp++ = *spl;
-          *dp++ = *sph + factor * (spl[0] + spl[1]);
+          const lifting_step* s = atk->get_step(j);
+          const float a = s->irv.Aatk;
+
+          // extension: copy the edge samples of oth one slot past each end
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step: ev -> aug[i] -= a*(oth[i-1]+oth[i]); !ev -> aug[i] -= a*(oth[i]+oth[i+1])
+          const float* sp = oth + (ev ? 0 : 1);
+          float* dp = aug;
+          for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+            *dp -= a * (sp[-1] + sp[0]);
+
+          // swap buffers: aug/oth, their widths and the parity trade places for the next step
+          float* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
         }
+
+        // combine both lsrc and hsrc into dst by interleaving them sample by sample
+        float* sph = hsrc->f32;
+        float* spl = lsrc->f32;
+        float* dp = dst->f32;
+        ui32 w = width;
+        if (!even) // odd start: dst[0] is a high-band sample
+        { *dp++ = *sph++; --w; }
+        for (; w > 1; w -= 2)
+        { *dp++ = *spl++; *dp++ = *sph++; }
+        if (w) // one sample left over: it comes from the low band
+        { *dp++ = *spl++; --w; }
       }
-      else
-      {
+      else { // width <= 1: no lifting, a single sample is produced
         if (even)
-          line_dst->f32[0] = line_lsrc->f32[0];
+          dst->f32[0] = lsrc->f32[0]; // copied unchanged from the low band
         else
-          line_dst->f32[0] = line_hsrc->f32[0] * 0.5f;
+          dst->f32[0] = hsrc->f32[0] * 0.5f; // halved from the high band
       }
     }
 
diff --git a/src/core/transform/ojph_transform.h b/src/core/transform/ojph_transform.h
index 002235d3..f7576a1c 100644
--- a/src/core/transform/ojph_transform.h
+++ b/src/core/transform/ojph_transform.h
@@ -42,8 +42,13 @@
 #include "ojph_defs.h"
 
 namespace ojph {
-  struct line_buf;
+
+  // defined elsewhere
+  class line_buf;
+
   namespace local {
+    union lifting_step;
+    struct param_atk;
 
     //////////////////////////////////////////////////////////////////////////
     void init_wavelet_transform_functions();
@@ -53,54 +58,42 @@ namespace ojph {
     /////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    extern void (*rev_vert_wvlt_fwd_predict)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat);
-
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*rev_vert_wvlt_fwd_update)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat);
-
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*rev_horz_wvlt_fwd_tx)
-      (line_buf* src, line_buf *ldst, line_buf *hdst, ui32 width, bool even);
-
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*rev_vert_wvlt_bwd_predict)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat);
+    extern void (*rev_vert_step)
+      (const lifting_step* s, const line_buf* sig, const line_buf* other,
+        const line_buf* aug, ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
-    extern void (*rev_vert_wvlt_bwd_update)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat);
+    extern void (*rev_horz_ana)
+      (const param_atk* atk, const line_buf* ldst, const line_buf* hdst,
+        const line_buf* src, ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
-    extern void (*rev_horz_wvlt_bwd_tx)
-      (line_buf* dst, line_buf *lsrc, line_buf *hsrc, ui32 width, bool even);
+    extern void (*rev_horz_syn)
+      (const param_atk* atk, const line_buf* dst, const line_buf* lsrc,
+        const line_buf* hsrc, ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     /////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    extern void (*irrev_vert_wvlt_step)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       int step_num, ui32 repeat);
+    extern void (*irv_vert_step)
+      (const lifting_step* s, const line_buf* sig, const line_buf* other, 
+        const line_buf* aug, ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
-    extern void (*irrev_vert_wvlt_K)
-      (const line_buf *src, line_buf *dst, bool L_analysis_or_H_synthesis,
-       ui32 repeat);
+    extern void (*irv_vert_times_K)
+      (float K, const line_buf* aug, ui32 repeat);
 
     /////////////////////////////////////////////////////////////////////////
-    extern void (*irrev_horz_wvlt_fwd_tx)
-      (line_buf* src, line_buf *ldst, line_buf *hdst, ui32 width, bool even);
+    extern void (*irv_horz_ana)
+      (const param_atk* atk, const line_buf* ldst, const line_buf* hdst, 
+        const line_buf* src, ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
-    extern void (*irrev_horz_wvlt_bwd_tx)
-      (line_buf* src, line_buf *ldst, line_buf *hdst, ui32 width, bool even);
+    extern void (*irv_horz_syn)
+      (const param_atk* atk, const line_buf* dst, const line_buf* lsrc, 
+        const line_buf* hsrc, ui32 width, bool even);
 
   }
 }
diff --git a/src/core/transform/ojph_transform_avx.cpp b/src/core/transform/ojph_transform_avx.cpp
index 725d7ce8..353b08f2 100644
--- a/src/core/transform/ojph_transform_avx.cpp
+++ b/src/core/transform/ojph_transform_avx.cpp
@@ -35,30 +35,83 @@
 // Date: 28 August 2019
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
+
 #include <cstdio>
+#include <immintrin.h>
 
 #include "ojph_defs.h"
-#include "ojph_arch.h"
 #include "ojph_mem.h"
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
+
 #include "ojph_transform.h"
 #include "ojph_transform_local.h"
 
-#include <immintrin.h>
-
 namespace ojph {
   namespace local {
 
     //////////////////////////////////////////////////////////////////////////
-    void avx_irrev_vert_wvlt_step(const line_buf* line_src1,
-                                  const line_buf* line_src2,
-                                  line_buf *line_dst, int step_num,
-                                  ui32 repeat)
+    static inline void avx_multiply_const(float* p, float f, int width) // p[i] *= f, in place
     {
-      float *dst = line_dst->f32;
-      const float *src1 = line_src1->f32, *src2 = line_src2->f32;
-    
-      __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::steps[step_num]);
-      for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
+      __m256 factor = _mm256_set1_ps(f); // f broadcast to all 8 lanes
+      for (; width > 0; width -= 8, p += 8) // whole 8-float vectors; touches up to 7 floats past width
+      {
+        __m256 s = _mm256_load_ps(p); // aligned load/store: p must be 32-byte aligned
+        _mm256_store_ps(p, _mm256_mul_ps(factor, s));
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline // splits sp into even-indexed (dpl) and odd-indexed (dph) floats
+    void avx_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) // 16 inputs per pass, rounded up
+      {
+        __m256 a = _mm256_load_ps(sp);     // s0..s7 (aligned load)
+        __m256 b = _mm256_load_ps(sp + 8); // s8..s15
+        __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); // s0..s3 | s8..s11
+        __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); // s4..s7 | s12..s15
+        __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); // s0,s2,...,s14
+        __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); // s1,s3,...,s15
+        _mm256_store_ps(dpl, e);
+        _mm256_store_ps(dph, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline // merges spl and sph into dp as spl[0], sph[0], spl[1], sph[1], ...
+    void avx_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) // 16 outputs per pass, rounded up
+      {
+        __m256 a = _mm256_load_ps(spl); // l0..l7 (aligned load)
+        __m256 b = _mm256_load_ps(sph); // h0..h7
+        __m256 c = _mm256_unpacklo_ps(a, b); // l0,h0,l1,h1 | l4,h4,l5,h5
+        __m256 d = _mm256_unpackhi_ps(a, b); // l2,h2,l3,h3 | l6,h6,l7,h7
+        __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); // l0,h0,...,l3,h3
+        __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); // l4,h4,...,l7,h7
+        _mm256_store_ps(dp, e);
+        _mm256_store_ps(dp + 8, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis)
+    {
+      float a = s->irv.Aatk;
+      if (synthesis)
+        a = -a;
+
+      __m256 factor = _mm256_set1_ps(a);
+
+      float* dst = aug->f32;
+      const float* src1 = sig->f32, * src2 = other->f32;
+      int i = (int)repeat;
+      for ( ; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
       {
         __m256 s1 = _mm256_load_ps(src1);
         __m256 s2 = _mm256_load_ps(src2);
@@ -68,262 +121,171 @@ namespace ojph {
       }
     }
 
-    /////////////////////////////////////////////////////////////////////////
-    void avx_irrev_vert_wvlt_K(const line_buf* line_src, line_buf* line_dst,
-                               bool L_analysis_or_H_synthesis, ui32 repeat)
+    //////////////////////////////////////////////////////////////////////////
+    void avx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat) // scales aug->f32 in place by K
     {
-      float *dst = line_dst->f32;
-      const float *src = line_src->f32;
-
-      float f = LIFTING_FACTORS::K_inv;
-      f = L_analysis_or_H_synthesis ? f : LIFTING_FACTORS::K;
-      __m256 factor = _mm256_set1_ps(f);
-      for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src+=8)
-      {
-        __m256 s = _mm256_load_ps(src);
-        _mm256_store_ps(dst, _mm256_mul_ps(factor, s));
-      }
+      avx_multiply_const(aug->f32, K, (int)repeat); // works in whole 8-float vectors, so repeat is effectively rounded up
     }
 
-
     /////////////////////////////////////////////////////////////////////////
-    void avx_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst,
-                                    line_buf *line_hdst, ui32 width,
-                                    bool even)
+    void avx_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even) // even: src[0] goes to ldst; otherwise to hdst
     {
       if (width > 1)
       {
-        float *src = line_src->f32;
-        float *ldst = line_ldst->f32, *hdst = line_hdst->f32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        //extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        const float* sp = src + (even ? 1 : 0);
-        float *dph = hdst;
-        __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::steps[0]);
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i)
-        { //this is doing twice the work it needs to do
-          //it can be definitely written better
-          __m256 s1 = _mm256_loadu_ps(sp - 1);
-          __m256 s2 = _mm256_loadu_ps(sp + 1);
-          __m256 d = _mm256_loadu_ps(sp);
-          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
-          __m256 d1 = _mm256_add_ps(d, s1);
-          sp += 8;
-          __m128 t1 = _mm256_extractf128_ps(d1, 0);
-          __m128 t2 = _mm256_extractf128_ps(d1, 1);
-          __m128 t = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(2, 0, 2, 0));
-          _mm_store_ps(dph, t);
-          dph += 4;
-        }
-
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        __m128 factor128 = _mm_set1_ps(LIFTING_FACTORS::steps[1]);
-        sp = src + (even ? 0 : 1);
-        const float* sph = hdst + (even ? 0 : 1);
-        float *dpl = ldst;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
+        // split src into ldst and hdst; the destinations are swapped when the line starts odd
         {
-          __m256 d1 = _mm256_loadu_ps(sp); //is there an advantage here?
-          __m128 t1 = _mm256_extractf128_ps(d1, 0);
-          __m128 t2 = _mm256_extractf128_ps(d1, 1);
-          __m128 d = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(2, 0, 2, 0));
-
-          __m128 s1 = _mm_loadu_ps(sph - 1);
-          __m128 s2 = _mm_loadu_ps(sph);
-          s1 = _mm_mul_ps(factor128, _mm_add_ps(s1, s2));
-          d = _mm_add_ps(d, s1);
-          _mm_store_ps(dpl, d);
+          float* dpl = even ? ldst->f32 : hdst->f32; // receives src[0], src[2], ...
+          float* dph = even ? hdst->f32 : ldst->f32; // receives src[1], src[3], ...
+          float* sp = src->f32;
+          int w = (int)width;
+          avx_deinterleave32(dpl, dph, sp, w);
         }
 
-        //extension
-        ldst[-1] = ldst[0];
-        ldst[L_width] = ldst[L_width-1];
-        //predict
-        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[2]);
-        const float* spl = ldst + (even ? 1 : 0);
-        dph = hdst;
-        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, spl+=8, dph+=8)
+        // the actual horizontal transform; steps run from the last ATK step down to the first
+        float* hp = hdst->f32, * lp = ldst->f32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
         {
-          __m256 s1 = _mm256_loadu_ps(spl - 1);
-          __m256 s2 = _mm256_loadu_ps(spl);
-          __m256 d = _mm256_loadu_ps(dph);
-          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
-          d = _mm256_add_ps(d, s1);
-          _mm256_store_ps(dph, d);
-        }
+          const lifting_step* s = atk->get_step(j - 1);
+          const float a = s->irv.Aatk;
 
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[3]);
-        sph = hdst + (even ? 0 : 1);
-        dpl = ldst;
-        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, dpl+=8)
-        {
-          __m256 s1 = _mm256_loadu_ps(sph - 1);
-          __m256 s2 = _mm256_loadu_ps(sph);
-          __m256 d = _mm256_loadu_ps(dpl);
-          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
-          d = _mm256_add_ps(d, s1);
-          _mm256_store_ps(dpl, d);
-        }
+          // extension: copy the edge samples of lp one slot past each end
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step, 8 floats per iteration (h_width is effectively rounded up to 8)
+          const float* sp = lp;
+          float* dp = hp;
+          int i = (int)h_width;
+          __m256 f = _mm256_set1_ps(a);
+          if (even) // hp[i] += a * (lp[i] + lp[i + 1])
+          {
+            for (; i > 0; i -= 8, sp += 8, dp += 8)
+            {
+              __m256 m = _mm256_load_ps(sp);
+              __m256 n = _mm256_loadu_ps(sp + 1); // unaligned: shifted by one float
+              __m256 p = _mm256_load_ps(dp);
+              p = _mm256_add_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
+              _mm256_store_ps(dp, p);
+            }
+          }
+          else // hp[i] += a * (lp[i] + lp[i - 1])
+          {
+            for (; i > 0; i -= 8, sp += 8, dp += 8)
+            {
+              __m256 m = _mm256_load_ps(sp);
+              __m256 n = _mm256_loadu_ps(sp - 1); // unaligned: shifted by one float
+              __m256 p = _mm256_load_ps(dp);
+              p = _mm256_add_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
+              _mm256_store_ps(dp, p);
+            }
+          }
 
-        //multipliers
-        float *dp = ldst;
-        factor = _mm256_set1_ps(LIFTING_FACTORS::K_inv);
-        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dp+=8)
-        {
-          __m256 d = _mm256_load_ps(dp);
-          _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
+          // swap buffers: lp/hp, their widths and the parity trade places for the next step
+          float* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
         }
-        dp = hdst;
-        factor = _mm256_set1_ps(LIFTING_FACTORS::K);
-        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dp+=8)
-        {
-          __m256 d = _mm256_load_ps(dp);
-          _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
+
+        { // multiply by K or 1/K: lp and hp are the pointers left by the last swap
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          avx_multiply_const(lp, K_inv, (int)l_width);
+          avx_multiply_const(hp, K, (int)h_width);
         }
       }
-      else
-      {
+      else { // width <= 1: no lifting, only src->f32[0] is read
         if (even)
-          line_ldst->f32[0] = line_src->f32[0];
+          ldst->f32[0] = src->f32[0]; // copied unchanged into the low band
         else
-          line_hdst->f32[0] = line_src->f32[0] + line_src->f32[0];
+          hdst->f32[0] = src->f32[0] * 2.0f; // doubled into the high band
       }
     }
-
-    /////////////////////////////////////////////////////////////////////////
-    void avx_irrev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
-                                    line_buf *line_hsrc, ui32 width,
-                                    bool even)
+    
+    //////////////////////////////////////////////////////////////////////////
+    void avx_irv_horz_syn(const param_atk* atk, const line_buf* dst, 
+                          const line_buf* lsrc, const line_buf* hsrc, 
+                          ui32 width, bool even) // even: dst[0] comes from lsrc; otherwise from hsrc
     {
       if (width > 1)
       {
-        float *lsrc = line_lsrc->f32, *hsrc = line_hsrc->f32;
-        float *dst = line_dst->f32;
-      
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
+        bool ev = even; // ev flips after every step; even is kept for the final interleave
+        float* oth = hsrc->f32, * aug = lsrc->f32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
 
-        //multipliers
-        float *dp = lsrc;
-        __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::K);
-        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dp+=8)
-        {
-          __m256 d = _mm256_load_ps(dp);
-          _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
-        }
-        dp = hsrc;
-        factor = _mm256_set1_ps(LIFTING_FACTORS::K_inv);
-        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dp+=8)
-        {
-          __m256 d = _mm256_load_ps(dp);
-          _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
+        { // multiply by K or 1/K: low band (lsrc) by K, high band (hsrc) by 1/K
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          avx_multiply_const(aug, K, (int)aug_width);
+          avx_multiply_const(oth, K_inv, (int)oth_width);
         }
 
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[7]);
-        const float *sph = hsrc + (even ? 0 : 1);
-        float *dpl = lsrc;
-        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, dpl+=8)
+        // the actual horizontal transform; steps run in ascending order and are subtracted
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
         {
-          __m256 s1 = _mm256_loadu_ps(sph - 1);
-          __m256 s2 = _mm256_loadu_ps(sph);
-          __m256 d = _mm256_loadu_ps(dpl);
-          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
-          d = _mm256_add_ps(d, s1);
-          _mm256_store_ps(dpl, d);
-        }
+          const lifting_step* s = atk->get_step(j);
+          const float a = s->irv.Aatk;
 
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict
-        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[6]);
-        const float *spl = lsrc + (even ? 0 : -1);
-        float *dph = hsrc;
-        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dph+=8, spl+=8)
-        {
-          __m256 s1 = _mm256_loadu_ps(spl);
-          __m256 s2 = _mm256_loadu_ps(spl + 1);
-          __m256 d = _mm256_loadu_ps(dph);
-          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
-          d = _mm256_add_ps(d, s1);
-          _mm256_store_ps(dph, d);
-        }
+          // extension: copy the edge samples of oth one slot past each end
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step, 8 floats per iteration (aug_width is effectively rounded up to 8)
+          const float* sp = oth;
+          float* dp = aug;
+          int i = (int)aug_width;
+          __m256 f = _mm256_set1_ps(a);
+          if (ev) // aug[i] -= a * (oth[i] + oth[i - 1])
+          {
+            for (; i > 0; i -= 8, sp += 8, dp += 8)
+            {
+              __m256 m = _mm256_load_ps(sp);
+              __m256 n = _mm256_loadu_ps(sp - 1); // unaligned: shifted by one float
+              __m256 p = _mm256_load_ps(dp);
+              p = _mm256_sub_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
+              _mm256_store_ps(dp, p);
+            }
+          }
+          else // aug[i] -= a * (oth[i] + oth[i + 1])
+          {
+            for (; i > 0; i -= 8, sp += 8, dp += 8)
+            {
+              __m256 m = _mm256_load_ps(sp);
+              __m256 n = _mm256_loadu_ps(sp + 1); // unaligned: shifted by one float
+              __m256 p = _mm256_load_ps(dp);
+              p = _mm256_sub_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
+              _mm256_store_ps(dp, p);
+            }
+          }
 
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[5]);
-        sph = hsrc + (even ? 0 : 1);
-        dpl = lsrc;
-        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dpl+=8, sph+=8)
-        {
-          __m256 s1 = _mm256_loadu_ps(sph - 1);
-          __m256 s2 = _mm256_loadu_ps(sph);
-          __m256 d = _mm256_loadu_ps(dpl);
-          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
-          d = _mm256_add_ps(d, s1);
-          _mm256_store_ps(dpl, d);
+          // swap buffers: aug/oth, their widths and the parity trade places for the next step
+          float* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
         }
 
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict and combine
-        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[4]);
-        dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        ui32 width = L_width + (even ? 0 : 1);
-        for (ui32 i = (width + 7) >> 3; i > 0; --i, spl+=8, sph+=8)
+        // combine both lsrc and hsrc into dst; the sources are swapped when the line starts odd
         {
-          __m256 s1 = _mm256_loadu_ps(spl);
-          __m256 s2 = _mm256_loadu_ps(spl + 1);
-          __m256 d = _mm256_load_ps(sph);
-          s2 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
-          d = _mm256_add_ps(d, s2);
-
-          __m128 a0 = _mm256_extractf128_ps(s1, 0);
-          __m128 a1 = _mm256_extractf128_ps(s1, 1);
-          __m128 a2 = _mm256_extractf128_ps(d, 0);
-          __m128 a3 = _mm256_extractf128_ps(d, 1);
-          _mm_storeu_ps(dp, _mm_unpacklo_ps(a0, a2)); dp += 4;
-          _mm_storeu_ps(dp, _mm_unpackhi_ps(a0, a2)); dp += 4;
-          _mm_storeu_ps(dp, _mm_unpacklo_ps(a1, a3)); dp += 4;
-          _mm_storeu_ps(dp, _mm_unpackhi_ps(a1, a3)); dp += 4;
-
-//          s2 = _mm256_unpackhi_ps(s1, d);
-//          s1 = _mm256_unpacklo_ps(s1, d);
-//          d = _mm256_permute2f128_ps(s1, s2, (2 << 4) | 0);
-//          _mm256_storeu_ps(dp, d);
-//          d = _mm256_permute2f128_ps(s1, s2, (3 << 4) | 1);
-//          _mm256_storeu_ps(dp + 1, d);
+          float* dp = dst->f32;
+          float* spl = even ? lsrc->f32 : hsrc->f32; // supplies dst[0], dst[2], ...
+          float* sph = even ? hsrc->f32 : lsrc->f32; // supplies dst[1], dst[3], ...
+          int w = (int)width;
+          avx_interleave32(dp, spl, sph, w);
         }
       }
-      else
-      {
+      else { // width <= 1: no lifting, a single sample is produced
         if (even)
-          line_dst->f32[0] = line_lsrc->f32[0];
+          dst->f32[0] = lsrc->f32[0]; // copied unchanged from the low band
         else
-          line_dst->f32[0] = line_hsrc->f32[0] * 0.5f;
+          dst->f32[0] = hsrc->f32[0] * 0.5f; // halved from the high band
       }
     }
-  }
-}
+
+  } // !local
+} // !ojph
+
+#endif
diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp
index 915e246c..51e59877 100644
--- a/src/core/transform/ojph_transform_avx2.cpp
+++ b/src/core/transform/ojph_transform_avx2.cpp
@@ -35,11 +35,17 @@
 // Date: 28 August 2019
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
+
+#include <climits>
 #include <cstdio>
 
 #include "ojph_defs.h"
-#include "ojph_arch.h"
 #include "ojph_mem.h"
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
+
 #include "ojph_transform.h"
 #include "ojph_transform_local.h"
 
@@ -48,218 +54,1026 @@
 namespace ojph {
   namespace local {
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_fwd_predict(const line_buf* line_src1,
-                                        const line_buf* line_src2,
-                                        line_buf *line_dst, ui32 repeat)
+    /////////////////////////////////////////////////////////////////////////
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline 
+    __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m) 
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
+      // note that m must be obtained using
+      // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
+      __m256i x = _mm256_srli_epi64(a, amt);
+      x = _mm256_xor_si256(x, m);
+      __m256i result = _mm256_sub_epi64(x, m);
+      return result;
+    }
 
-      for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
       {
-        __m256i s1 = _mm256_load_si256((__m256i*)src1);
-        __m256i s2 = _mm256_load_si256((__m256i*)src2);
-        __m256i d = _mm256_load_si256((__m256i*)dst);
-        s1 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
-        d = _mm256_sub_epi32(d, s1);
-        _mm256_store_si256((__m256i*)dst, d);
+        __m256 a = _mm256_load_ps(sp);
+        __m256 b = _mm256_load_ps(sp + 8);
+        __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
+        __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
+        __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
+        __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
+        _mm256_store_ps(dpl, e);
+        _mm256_store_ps(dph, f);
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_fwd_update(const line_buf* line_src1,
-                                       const line_buf* line_src2,
-                                       line_buf *line_dst, ui32 repeat)
+    static inline 
+    void avx2_interleave32(float* dp, float* spl, float* sph, int width)
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
+      for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
+      {
+        __m256 a = _mm256_load_ps(spl);
+        __m256 b = _mm256_load_ps(sph);
+        __m256 c = _mm256_unpacklo_ps(a, b);
+        __m256 d = _mm256_unpackhi_ps(a, b);
+        __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
+        __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
+        _mm256_store_ps(dp, e);
+        _mm256_store_ps(dp + 8, f);
+      }
+    }
 
-      __m256i offset = _mm256_set1_epi32(2);
-      for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
+    //////////////////////////////////////////////////////////////////////////
+    static inline 
+    void avx2_deinterleave64(double* dpl, double* dph, double* sp, int width)
+    {
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
       {
-        __m256i s1 = _mm256_load_si256((__m256i*)src1);
-        s1 = _mm256_add_epi32(s1, offset);
-        __m256i s2 = _mm256_load_si256((__m256i*)src2);
-        s2 = _mm256_add_epi32(s2, s1);
-        __m256i d = _mm256_load_si256((__m256i*)dst);
-        d = _mm256_add_epi32(d, _mm256_srai_epi32(s2, 2));
-        _mm256_store_si256((__m256i*)dst, d);
+        __m256d a = _mm256_load_pd(sp);
+        __m256d b = _mm256_load_pd(sp + 4);
+        __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0));
+        __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1));
+        __m256d e = _mm256_shuffle_pd(c, d, 0x0);
+        __m256d f = _mm256_shuffle_pd(c, d, 0xF);
+        _mm256_store_pd(dpl, e);
+        _mm256_store_pd(dph, f);
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_horz_wvlt_fwd_tx(line_buf* line_src, line_buf *line_ldst,
-                                   line_buf *line_hdst,ui32 width, bool even)
+    static inline 
+    void avx2_interleave64(double* dp, double* spl, double* sph, int width)
+    {
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m256d a = _mm256_load_pd(spl);
+        __m256d b = _mm256_load_pd(sph);
+        __m256d c = _mm256_unpacklo_pd(a, b);
+        __m256d d = _mm256_unpackhi_pd(a, b);
+        __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0));
+        __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1));
+        _mm256_store_pd(dp, e);
+        _mm256_store_pd(dp + 4, f);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void avx2_rev_vert_step32(const lifting_step* s, const line_buf* sig, 
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m256i va = _mm256_set1_epi32(a);
+      __m256i vb = _mm256_set1_epi32(b);
+
+      si32* dst = aug->i32;
+      const si32* src1 = sig->i32, * src2 = other->i32;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different to Part 1, although they are mathematically equivalent;
+      // here, we identify the simpler forms from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i v = _mm256_add_epi32(vb, t);
+            __m256i w = _mm256_srai_epi32(v, e);
+            d = _mm256_sub_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i v = _mm256_add_epi32(vb, t);
+            __m256i w = _mm256_srai_epi32(v, e);
+            d = _mm256_add_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i w = _mm256_srai_epi32(t, e);
+            d = _mm256_add_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i w = _mm256_srai_epi32(t, e);
+            d = _mm256_sub_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i v = _mm256_sub_epi32(vb, t);
+            __m256i w = _mm256_srai_epi32(v, e);
+            d = _mm256_sub_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i v = _mm256_sub_epi32(vb, t);
+            __m256i w = _mm256_srai_epi32(v, e);
+            d = _mm256_add_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+      }
+      else { // general case
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i u = _mm256_mullo_epi32(va, t);
+            __m256i v = _mm256_add_epi32(vb, u);
+            __m256i w = _mm256_srai_epi32(v, e);
+            d = _mm256_sub_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i u = _mm256_mullo_epi32(va, t);
+            __m256i v = _mm256_add_epi32(vb, u);
+            __m256i w = _mm256_srai_epi32(v, e);
+            d = _mm256_add_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void avx2_rev_vert_step64(const lifting_step* s, const line_buf* sig, 
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m256i vb = _mm256_set1_epi64x(b);
+      __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));      
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different to Part 1, although they are mathematically equivalent;
+      // here, we identify the simpler forms from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi64(s1, s2);
+            __m256i v = _mm256_add_epi64(vb, t);
+            __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+            d = _mm256_sub_epi64(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi64(s1, s2);
+            __m256i v = _mm256_add_epi64(vb, t);
+            __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+            d = _mm256_add_epi64(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi64(s1, s2);
+            __m256i w = avx2_mm256_srai_epi64(t, e, ve);
+            d = _mm256_add_epi64(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi64(s1, s2);
+            __m256i w = avx2_mm256_srai_epi64(t, e, ve);
+            d = _mm256_sub_epi64(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi64(s1, s2);
+            __m256i v = _mm256_sub_epi64(vb, t);
+            __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+            d = _mm256_sub_epi64(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi64(s1, s2);
+            __m256i v = _mm256_sub_epi64(vb, t);
+            __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+            d = _mm256_add_epi64(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+      }
+      else { // general case
+        // 64bit multiplication is not supported in avx2;
+        // in particular, _mm256_mullo_epi64.
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis)
+    {
+      if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || 
+          ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
+          ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) 
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_32BIT) && 
+               (aug == NULL || aug->flags & line_buf::LFT_32BIT));
+        avx2_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
+      }
+      else 
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_64BIT) && 
+               (aug == NULL || aug->flags & line_buf::LFT_64BIT));
+        avx2_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void avx2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, 
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even)
     {
       if (width > 1)
       {
-        si32 *src = line_src->i32;
-        si32 *ldst = line_ldst->i32, *hdst = line_hdst->i32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        // extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        const si32* sp = src + (even ? 1 : 0);
-        si32 *dph = hdst;
-        const __m256i mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
-        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dph+=8)
-        { //this is doing twice the work it needs to do
-          //it can be definitely written better
-          __m256i s1 = _mm256_loadu_si256((__m256i*)(sp-1));
-          __m256i s2 = _mm256_loadu_si256((__m256i*)(sp+1));
-          __m256i d = _mm256_loadu_si256((__m256i*)sp);
-          s1 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
-          __m256i d1 = _mm256_sub_epi32(d, s1);
-          sp += 8;
-          s1 = _mm256_loadu_si256((__m256i*)(sp-1));
-          s2 = _mm256_loadu_si256((__m256i*)(sp+1));
-          d = _mm256_loadu_si256((__m256i*)sp);
-          s1 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
-          __m256i d2 = _mm256_sub_epi32(d, s1);
-          sp += 8;
-          d1 = _mm256_permutevar8x32_epi32(d1, mask);
-          d2 = _mm256_permutevar8x32_epi32(d2, mask);
-          d = _mm256_permute2x128_si256(d1, d2, (2 << 4) | 0);
-          _mm256_store_si256((__m256i*)dph, d);
+        // split src into ldst and hdst
+        {
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
+          float* sp  = src->f32;
+          int w = (int)width;
+          avx2_deinterleave32(dpl, dph, sp, w);
         }
 
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        sp = src + (even ? 0 : 1);
-        const si32* sph = hdst + (even ? 0 : 1);
-        si32 *dpl = ldst;
-        __m256i offset = _mm256_set1_epi32(2);
-        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sp+=16, sph+=8, dpl+=8)
+        si32* hp = hdst->i32, * lp = ldst->i32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
         {
-          __m256i s1 = _mm256_loadu_si256((__m256i*)(sph-1));
-          s1 = _mm256_add_epi32(s1, offset);
-          __m256i s2 = _mm256_loadu_si256((__m256i*)sph);
-          s2 = _mm256_add_epi32(s2, s1);
-          __m256i d1 = _mm256_loadu_si256((__m256i*)sp);
-          __m256i d2 = _mm256_loadu_si256((__m256i*)sp + 1);
-          d1 = _mm256_permutevar8x32_epi32(d1, mask);
-          d2 = _mm256_permutevar8x32_epi32(d2, mask);
-          __m256i d = _mm256_permute2x128_si256(d1, d2, (2 << 4) | 0);
-          d = _mm256_add_epi32(d, _mm256_srai_epi32(s2, 2));
-          _mm256_store_si256((__m256i*)dpl, d);
+          // current lifting step (visited in descending index order)
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e  = s->rev.Eatk;
+          __m256i va = _mm256_set1_epi32(a);
+          __m256i vb = _mm256_set1_epi32(b);
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si32* sp = lp;
+          si32* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_add_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_add_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i w = _mm256_srai_epi32(t, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i w = _mm256_srai_epi32(t, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_sub_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_sub_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+          else {
+            // general case
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i u = _mm256_mullo_epi32(va, t);
+                __m256i v = _mm256_add_epi32(vb, u);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i u = _mm256_mullo_epi32(va, t);
+                __m256i v = _mm256_add_epi32(vb, u);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+
+          // swap buffers
+          si32* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
         }
       }
-      else
-      {
+      else {
         if (even)
-          line_ldst->i32[0] = line_src->i32[0];
+          ldst->i32[0] = src->i32[0];
         else
-          line_hdst->i32[0] = line_src->i32[0] << 1;
+          hdst->i32[0] = src->i32[0] << 1;
       }
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_bwd_predict(const line_buf* line_src1,
-                                        const line_buf* line_src2,
-                                        line_buf *line_dst, ui32 repeat)
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void avx2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, 
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even)
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-    
-      for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
+      if (width > 1)
       {
-        __m256i s1 = _mm256_load_si256((__m256i*)src1);
-        __m256i s2 = _mm256_load_si256((__m256i*)src2);
-        __m256i d = _mm256_load_si256((__m256i*)dst);
-        s1 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
-        d = _mm256_add_epi32(d, s1);
-        _mm256_store_si256((__m256i*)dst, d);
+        // split src into ldst and hdst
+        {
+          double* dpl = (double*)(even ? ldst->p : hdst->p);
+          double* dph = (double*)(even ? hdst->p : ldst->p);
+          double* sp  = (double*)src->p;
+          int w = (int)width;
+          avx2_deinterleave64(dpl, dph, sp, w);
+        }
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // current lifting step (visited in descending index order)
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e  = s->rev.Eatk;
+          __m256i vb = _mm256_set1_epi64x(b);
+          __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si64* sp = lp;
+          si64* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi64(s1, s2);
+                __m256i v = _mm256_add_epi64(vb, t);
+                __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+                d = _mm256_add_epi64(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi64(s1, s2);
+                __m256i v = _mm256_add_epi64(vb, t);
+                __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+                d = _mm256_add_epi64(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi64(s1, s2);
+                __m256i w = avx2_mm256_srai_epi64(t, e, ve);
+                d = _mm256_sub_epi64(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi64(s1, s2);
+                __m256i w = avx2_mm256_srai_epi64(t, e, ve);
+                d = _mm256_sub_epi64(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi64(s1, s2);
+                __m256i v = _mm256_sub_epi64(vb, t);
+                __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+                d = _mm256_add_epi64(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi64(s1, s2);
+                __m256i v = _mm256_sub_epi64(vb, t);
+                __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+                d = _mm256_add_epi64(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+          else {
+            // general case
+            // 64bit multiplication is not supported in avx2;
+            // in particular, _mm256_mullo_epi64.
+            if (even)
+              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+                *dp += (b + a * (sp[0] + sp[1])) >> e;
+            else
+              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+                *dp += (b + a * (sp[-1] + sp[0])) >> e;
+          }
+
+          // swap buffers
+          si64* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+      }
+      else {
+        if (even)
+          ldst->i64[0] = src->i64[0];
+        else
+          hdst->i64[0] = src->i64[0] << 1;
       }
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_bwd_update(const line_buf* line_src1,
-                                       const line_buf* line_src2,
-                                       line_buf *line_dst, ui32 repeat)
+    /////////////////////////////////////////////////////////////////////////
+    void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even)
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
+      if (src->flags & line_buf::LFT_32BIT) 
+      {
+        assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) &&
+               (hdst == NULL || hdst->flags & line_buf::LFT_32BIT));
+        avx2_rev_horz_ana32(atk, ldst, hdst, src, width, even);
+      }
+      else 
+      {
+        assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) &&
+               (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && 
+               (src == NULL || src->flags & line_buf::LFT_64BIT));
+        avx2_rev_horz_ana64(atk, ldst, hdst, src, width, even);
+      }
+    } 
     
-      __m256i offset = _mm256_set1_epi32(2);
-      for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
+    //////////////////////////////////////////////////////////////////////////
+    static
+    void avx2_rev_horz_syn32(const param_atk* atk, const line_buf* dst, 
+                             const line_buf* lsrc, const line_buf* hsrc, 
+                             ui32 width, bool even)
+    {
+      if (width > 1)
       {
-        __m256i s1 = _mm256_load_si256((__m256i*)src1);
-        s1 = _mm256_add_epi32(s1, offset);
-        __m256i s2 = _mm256_load_si256((__m256i*)src2);
-        s2 = _mm256_add_epi32(s2, s1);
-        __m256i d = _mm256_load_si256((__m256i*)dst);
-        d = _mm256_sub_epi32(d, _mm256_srai_epi32(s2, 2));
-        _mm256_store_si256((__m256i*)dst, d);
+        bool ev = even;
+        si32* oth = hsrc->i32, * aug = lsrc->i32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e  = s->rev.Eatk;
+          __m256i va = _mm256_set1_epi32(a);
+          __m256i vb = _mm256_set1_epi32(b);
+
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const si32* sp = oth;
+          si32* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)aug_width;
+            if (ev)
+            {
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_add_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_add_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i w = _mm256_srai_epi32(t, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i w = _mm256_srai_epi32(t, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_sub_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_sub_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+          else {
+            // general case
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i u = _mm256_mullo_epi32(va, t);
+                __m256i v = _mm256_add_epi32(vb, u);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i u = _mm256_mullo_epi32(va, t);
+                __m256i v = _mm256_add_epi32(vb, u);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+
+          // swap buffers
+          si32* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        {
+          float* dp  = dst->f32;
+          float* spl = even ? lsrc->f32 : hsrc->f32;
+          float* sph = even ? hsrc->f32 : lsrc->f32;
+          int w = (int)width;
+          avx2_interleave32(dp, spl, sph, w);
+        }
+      }
+      else {
+        if (even)
+          dst->i32[0] = lsrc->i32[0];
+        else
+          dst->i32[0] = hsrc->i32[0] >> 1;
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
-                                   line_buf *line_hsrc, ui32 width, bool even)
+    static
+    void avx2_rev_horz_syn64(const param_atk* atk, const line_buf* dst, 
+                             const line_buf* lsrc, const line_buf* hsrc, 
+                             ui32 width, bool even)
     {
       if (width > 1)
       {
-        si32 *lsrc = line_lsrc->i32, *hsrc = line_hsrc->i32;
-        si32 *dst = line_dst->i32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        // extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        const si32 *sph = hsrc + (even ? 0 : 1);
-        si32 *spl = lsrc;
-        __m256i offset = _mm256_set1_epi32(2);
-        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, spl+=8)
+        bool ev = even;
+        si64* oth = hsrc->i64, * aug = lsrc->i64;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
         {
-          __m256i s1 = _mm256_loadu_si256((__m256i*)(sph-1));
-          s1 = _mm256_add_epi32(s1, offset);
-          __m256i s2 = _mm256_loadu_si256((__m256i*)sph);
-          s2 = _mm256_add_epi32(s2, s1);
-          __m256i d = _mm256_load_si256((__m256i*)spl);
-          d = _mm256_sub_epi32(d, _mm256_srai_epi32(s2, 2));
-          _mm256_store_si256((__m256i*)spl, d);
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e  = s->rev.Eatk;
+          __m256i vb = _mm256_set1_epi64x(b);
+          __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));      
+
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const si64* sp = oth;
+          si64* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)aug_width;
+            if (ev)
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi64(s1, s2);
+                __m256i v = _mm256_add_epi64(vb, t);
+                __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+                d = _mm256_sub_epi64(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi64(s1, s2);
+                __m256i v = _mm256_add_epi64(vb, t);
+                __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+                d = _mm256_sub_epi64(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi64(s1, s2);
+                __m256i w = avx2_mm256_srai_epi64(t, e, ve);
+                d = _mm256_add_epi64(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi64(s1, s2);
+                __m256i w = avx2_mm256_srai_epi64(t, e, ve);
+                d = _mm256_add_epi64(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi64(s1, s2);
+                __m256i v = _mm256_sub_epi64(vb, t);
+                __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+                d = _mm256_sub_epi64(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi64(s1, s2);
+                __m256i v = _mm256_sub_epi64(vb, t);
+                __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+                d = _mm256_sub_epi64(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+          else {
+            // general case
+            // 64bit multiplication is not supported in avx2;
+            // in particular, _mm256_mullo_epi64.
+            if (ev)
+              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+                *dp -= (b + a * (sp[-1] + sp[0])) >> e;
+            else
+              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+                *dp -= (b + a * (sp[0] + sp[1])) >> e;
+          }
+
+          // swap buffers
+          si64* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
         }
 
-        // extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width - 1];
-        // inverse predict and combine
-        si32 *dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        ui32 width = L_width + (even ? 0 : 1);
-        for (ui32 i = (width + 7) >> 3; i > 0; --i, sph+=8, spl+=8, dp+=16)
+        // combine both lsrc and hsrc into dst
         {
-          __m256i s1 = _mm256_loadu_si256((__m256i*)spl);
-          __m256i s2 = _mm256_loadu_si256((__m256i*)(spl+1));
-          __m256i d = _mm256_load_si256((__m256i*)sph);
-          s2 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
-          d = _mm256_add_epi32(d, s2);
-          s2 = _mm256_unpackhi_epi32(s1, d);
-          s1 = _mm256_unpacklo_epi32(s1, d);
-          d = _mm256_permute2x128_si256(s1, s2, (2 << 4) | 0);
-          _mm256_storeu_si256((__m256i*)dp, d);
-          d = _mm256_permute2x128_si256(s1, s2, (3 << 4) | 1);
-          _mm256_storeu_si256((__m256i*)dp + 1, d);
+          double* dp  = (double*)dst->p;
+          double* spl = (double*)(even ? lsrc->p : hsrc->p);
+          double* sph = (double*)(even ? hsrc->p : lsrc->p);
+          int w = (int)width;
+          avx2_interleave64(dp, spl, sph, w);
         }
       }
-      else
-      {
+      else {
         if (even)
-          line_dst->i32[0] = line_lsrc->i32[0];
+          dst->i64[0] = lsrc->i64[0];
         else
-          line_dst->i32[0] = line_hsrc->i32[0] >> 1;
+          dst->i64[0] = hsrc->i64[0] >> 1;
+      }
+    }    
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even)
+    {
+      if (dst->flags & line_buf::LFT_32BIT) 
+      {
+        assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && 
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
+        avx2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
+      }
+      else 
+      {
+        assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
+               (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && 
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
+        avx2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
       }
     }
-  }
-}
+
+  } // !local
+} // !ojph
+
+#endif
diff --git a/src/core/transform/ojph_transform_avx512.cpp b/src/core/transform/ojph_transform_avx512.cpp
new file mode 100644
index 00000000..36276e48
--- /dev/null
+++ b/src/core/transform/ojph_transform_avx512.cpp
@@ -0,0 +1,1429 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2019-2024, Aous Naman 
+// Copyright (c) 2019-2024, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2019-2024, The University of New South Wales, Australia
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_transform_avx512.cpp
+// Author: Aous Naman
+// Date: 13 April 2024
+//***************************************************************************/
+
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_X86_64)
+
+#include <cstdio>
+
+#include "ojph_defs.h"
+#include "ojph_mem.h"
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
+
+#include "ojph_transform.h"
+#include "ojph_transform_local.h"
+
+#include <immintrin.h>
+
+namespace ojph {
+  namespace local {
+
+    //////////////////////////////////////////////////////////////////////////
+    // De-interleaves sp: even-indexed samples go to dpl, odd-indexed to dph.
+    // Works on 32 samples, then on 16, because byte_alignment == 64 is assumed
+    static
+    void avx512_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      __m512i pick_even = _mm512_set_epi32(
+        0x1E, 0x1C, 0x1A, 0x18, 0x16, 0x14, 0x12, 0x10,
+        0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00
+      );
+      __m512i pick_odd = _mm512_set_epi32(
+        0x1F, 0x1D, 0x1B, 0x19, 0x17, 0x15, 0x13, 0x11,
+        0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01
+      );
+      for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16)
+      { // 512-bit path: 32 inputs -> 16 + 16 outputs
+        __m512 in0 = _mm512_load_ps(sp);
+        __m512 in1 = _mm512_load_ps(sp + 16);
+        __m512 evens = _mm512_permutex2var_ps(in0, pick_even, in1);
+        __m512 odds = _mm512_permutex2var_ps(in0, pick_odd, in1);
+        _mm512_store_ps(dpl, evens);
+        _mm512_store_ps(dph, odds);
+      }
+      for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
+      { // 256-bit path: 16 inputs -> 8 + 8 outputs
+        __m256 in0 = _mm256_load_ps(sp);
+        __m256 in1 = _mm256_load_ps(sp + 8);
+        __m256 lo = _mm256_permute2f128_ps(in0, in1, 0x20);
+        __m256 hi = _mm256_permute2f128_ps(in0, in1, 0x31);
+        __m256 evens = _mm256_shuffle_ps(lo, hi, _MM_SHUFFLE(2, 0, 2, 0));
+        __m256 odds = _mm256_shuffle_ps(lo, hi, _MM_SHUFFLE(3, 1, 3, 1));
+        _mm256_store_ps(dpl, evens);
+        _mm256_store_ps(dph, odds);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // Interleaves spl and sph into dp: dp = spl[0], sph[0], spl[1], sph[1] ...
+    // Works on 32 outputs, then on 16, because byte_alignment == 64 is assumed
+    static
+    void avx512_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      __m512i zip_lo = _mm512_set_epi32(
+        0x17, 0x7, 0x16, 0x6, 0x15, 0x5, 0x14, 0x4,
+        0x13, 0x3, 0x12, 0x2, 0x11, 0x1, 0x10, 0x0
+      );
+      __m512i zip_hi = _mm512_set_epi32(
+        0x1F, 0xF, 0x1E, 0xE, 0x1D, 0xD, 0x1C, 0xC,
+        0x1B, 0xB, 0x1A, 0xA, 0x19, 0x9, 0x18, 0x8
+      );
+      for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16)
+      { // 512-bit path: 16 + 16 inputs -> 32 outputs
+        __m512 first = _mm512_load_ps(spl);
+        __m512 second = _mm512_load_ps(sph);
+        __m512 out0 = _mm512_permutex2var_ps(first, zip_lo, second);
+        __m512 out1 = _mm512_permutex2var_ps(first, zip_hi, second);
+        _mm512_store_ps(dp, out0);
+        _mm512_store_ps(dp + 16, out1);
+      }
+      for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
+      { // 256-bit path: 8 + 8 inputs -> 16 outputs
+        __m256 first = _mm256_load_ps(spl);
+        __m256 second = _mm256_load_ps(sph);
+        __m256 zl = _mm256_unpacklo_ps(first, second);
+        __m256 zh = _mm256_unpackhi_ps(first, second);
+        __m256 out0 = _mm256_permute2f128_ps(zl, zh, 0x20);
+        __m256 out1 = _mm256_permute2f128_ps(zl, zh, 0x31);
+        _mm256_store_ps(dp, out0);
+        _mm256_store_ps(dp + 8, out1);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // De-interleaves sp: even-indexed samples go to dpl, odd-indexed to dph.
+    // Works on 16 samples, then on 8, because byte_alignment == 64 is assumed
+    static void avx512_deinterleave64(double* dpl, double* dph, double* sp,
+                                      int width)
+    {
+      __m512i idx1 = _mm512_set_epi64(  // selects the even-indexed samples
+        0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00
+      );
+      __m512i idx2 = _mm512_set_epi64(  // selects the odd-indexed samples
+        0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01
+      );
+      for (; width > 8; width -= 16, sp += 16, dpl += 8, dph += 8)
+      { // 512-bit path: 16 inputs -> 8 + 8 outputs
+        __m512d a = _mm512_load_pd(sp);
+        __m512d b = _mm512_load_pd(sp + 8);  // a register holds 8 doubles
+        __m512d c = _mm512_permutex2var_pd(a, idx1, b);
+        __m512d d = _mm512_permutex2var_pd(a, idx2, b);
+        _mm512_store_pd(dpl, c);
+        _mm512_store_pd(dph, d);
+      }
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      { // 256-bit path: 8 inputs -> 4 + 4 outputs
+        __m256d a = _mm256_load_pd(sp);
+        __m256d b = _mm256_load_pd(sp + 4);
+        __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0));
+        __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1));
+        __m256d e = _mm256_shuffle_pd(c, d, 0x0);
+        __m256d f = _mm256_shuffle_pd(c, d, 0xF);
+        _mm256_store_pd(dpl, e);
+        _mm256_store_pd(dph, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // Interleaves spl and sph into dp: dp = spl[0], sph[0], spl[1], sph[1] ...
+    // Works on 16 outputs, then on 8, because byte_alignment == 64 is assumed
+    static void avx512_interleave64(double* dp, double* spl, double* sph,
+                                    int width)
+    {
+      __m512i idx1 = _mm512_set_epi64(  // zips samples 0..3 of both inputs
+        0xB, 0x3, 0xA, 0x2, 0x9, 0x1, 0x8, 0x0
+      );
+      __m512i idx2 = _mm512_set_epi64(  // zips samples 4..7 of both inputs
+        0xF, 0x7, 0xE, 0x6, 0xD, 0x5, 0xC, 0x4
+      );
+      for (; width > 8; width -= 16, dp += 16, spl += 8, sph += 8)
+      { // 512-bit path: 8 + 8 inputs -> 16 outputs
+        __m512d a = _mm512_load_pd(spl);
+        __m512d b = _mm512_load_pd(sph);
+        __m512d c = _mm512_permutex2var_pd(a, idx1, b);
+        __m512d d = _mm512_permutex2var_pd(a, idx2, b);
+        _mm512_store_pd(dp, c);
+        _mm512_store_pd(dp + 8, d);  // a register holds 8 doubles
+      }
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      { // 256-bit path: 4 + 4 inputs -> 8 outputs
+        __m256d a = _mm256_load_pd(spl);
+        __m256d b = _mm256_load_pd(sph);
+        __m256d c = _mm256_unpacklo_pd(a, b);
+        __m256d d = _mm256_unpackhi_pd(a, b);
+        __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0));
+        __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1));
+        _mm256_store_pd(dp, e);
+        _mm256_store_pd(dp + 4, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // Multiplies p[] in place by f, 16 floats per pass.
+    static inline void avx512_multiply_const(float* p, float f, int width)
+    {
+      __m512 scale = _mm512_set1_ps(f);
+      for (int left = width; left > 0; left -= 16, p += 16) {
+        __m512 v = _mm512_load_ps(p);
+        _mm512_store_ps(p, _mm512_mul_ps(scale, v));
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx512_irv_vert_step(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
+    {
+      // aug += c * (sig + other); c is the step's Aatk, negated for synthesis
+      float c = s->irv.Aatk;
+      c = synthesis ? -c : c;
+      const __m512 vc = _mm512_set1_ps(c);
+
+      const float* in_a = sig->f32;
+      const float* in_b = other->f32;
+      float* out = aug->f32;
+      for (int left = (int)repeat; left > 0; left -= 16)
+      { // 16 floats per pass
+        __m512 va = _mm512_load_ps(in_a);
+        __m512 vb = _mm512_load_ps(in_b);
+        __m512 acc = _mm512_load_ps(out);
+        acc = _mm512_add_ps(acc, _mm512_mul_ps(vc, _mm512_add_ps(va, vb)));
+        _mm512_store_ps(out, acc);
+        in_a += 16; in_b += 16; out += 16;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx512_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
+    { // scales the float samples of aug in place by K
+      avx512_multiply_const(aug->f32, K, (int)repeat);
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    // Irreversible horizontal analysis (forward transform) of one line.
+    //   atk   - provides the lifting steps and the scaling factor K
+    //   ldst  - receives the low-pass samples
+    //   hdst  - receives the high-pass samples
+    //   src   - interleaved input line
+    //   width - number of samples in src
+    //   even  - when true src[0], src[2], ... feed ldst, otherwise hdst
+    // The line is de-interleaved, lifted with every step of atk (last step
+    // first), and the two resulting bands are scaled by 1/K and K.
+    void avx512_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src,
+                             ui32 width, bool even)
+    {
+      if (width <= 1)
+      { // a single sample: plain copy, or doubling for the high-pass band
+        if (even)
+          ldst->f32[0] = src->f32[0];
+        else
+          hdst->f32[0] = src->f32[0] * 2.0f;
+        return;
+      }
+
+      // split src into ldst and hdst
+      {
+        float* first = even ? ldst->f32 : hdst->f32;
+        float* second = even ? hdst->f32 : ldst->f32;
+        avx512_deinterleave32(first, second, src->f32, (int)width);
+      }
+
+      // the actual horizontal transform; "from" is only read and "to" is
+      // updated, and the two exchange roles after every lifting step
+      float* from = ldst->f32;
+      float* to = hdst->f32;
+      ui32 from_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+      ui32 to_width = (width + (even ? 0 : 1)) >> 1;    // high pass
+      for (ui32 j = atk->get_num_steps(); j > 0; --j)
+      {
+        // steps are visited from the last one down to step 0
+        const float a = atk->get_step(j - 1)->irv.Aatk;
+        const __m512 coeff = _mm512_set1_ps(a);
+
+        // extension: replicate the boundary samples one position outwards
+        // (writes one sample before and one after the band)
+        from[-1] = from[0];
+        from[from_width] = from[from_width - 1];
+
+        // lifting step: to[i] += a * (from[i] + from[i + side])
+        const int side = even ? 1 : -1;
+        const float* sp = from;
+        float* dp = to;
+        for (int left = (int)to_width; left > 0; left -= 16)
+        { // 16 floats per pass
+          __m512 cur = _mm512_load_ps(sp);
+          __m512 nbr = _mm512_loadu_ps(sp + side);  // unaligned neighbours
+          __m512 acc = _mm512_load_ps(dp);
+          __m512 sum = _mm512_add_ps(cur, nbr);
+          acc = _mm512_add_ps(acc, _mm512_mul_ps(coeff, sum));
+          _mm512_store_ps(dp, acc);
+          sp += 16;
+          dp += 16;
+        }
+
+        // swap buffers
+        // the neighbour side alternates between steps
+        float* tp = from; from = to; to = tp;
+        ui32 tw = from_width; from_width = to_width; to_width = tw;
+        even = !even;
+      }
+
+      { // multiply by K or 1/K
+        const float K = atk->get_K();
+        const float K_inv = 1.0f / K;
+        avx512_multiply_const(from, K_inv, (int)from_width);
+        avx512_multiply_const(to, K, (int)to_width);
+      }
+    }
+    
+    //////////////////////////////////////////////////////////////////////////
+    // Irreversible horizontal synthesis (inverse transform) of one line.
+    //   atk   - provides the lifting steps and the scaling factor K
+    //   dst   - receives the interleaved output line
+    //   lsrc  - low-pass samples; modified in place
+    //   hsrc  - high-pass samples; modified in place
+    //   width - number of samples written to dst
+    //   even  - when true lsrc feeds dst[0], dst[2], ..., otherwise hsrc does
+    // The two bands are scaled by K and 1/K, lifted with every step of atk
+    // (step 0 first), and finally interleaved into dst.
+    void avx512_irv_horz_syn(const param_atk* atk, const line_buf* dst,
+                             const line_buf* lsrc, const line_buf* hsrc,
+                             ui32 width, bool even)
+    {
+      if (width <= 1)
+      { // a single sample: plain copy, or halving for the high-pass band
+        if (even)
+          dst->f32[0] = lsrc->f32[0];
+        else
+          dst->f32[0] = hsrc->f32[0] * 0.5f;
+        return;
+      }
+
+      // "to" is updated from "from"; the two exchange roles after each step
+      float* from = hsrc->f32;
+      float* to = lsrc->f32;
+      ui32 to_width = (width + (even ? 1 : 0)) >> 1;    // low pass
+      ui32 from_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+
+      { // multiply by K or 1/K
+        const float K = atk->get_K();
+        const float K_inv = 1.0f / K;
+        avx512_multiply_const(to, K, (int)to_width);
+        avx512_multiply_const(from, K_inv, (int)from_width);
+      }
+
+      // the actual horizontal transform
+      bool ev = even;  // working copy; even itself is needed again below
+      const ui32 num_steps = atk->get_num_steps();
+      for (ui32 j = 0; j < num_steps; ++j)
+      {
+        const float a = atk->get_step(j)->irv.Aatk;
+        const __m512 coeff = _mm512_set1_ps(a);
+
+        // extension: replicate the boundary samples one position outwards
+        // (writes one sample before and one after the band)
+        from[-1] = from[0];
+        from[from_width] = from[from_width - 1];
+
+        // lifting step: to[i] -= a * (from[i] + from[i + side])
+        const int side = ev ? -1 : 1;
+        const float* sp = from;
+        float* dp = to;
+        for (int left = (int)to_width; left > 0; left -= 16)
+        { // 16 floats per pass
+          __m512 cur = _mm512_load_ps(sp);
+          __m512 nbr = _mm512_loadu_ps(sp + side);  // unaligned neighbours
+          __m512 acc = _mm512_load_ps(dp);
+          __m512 sum = _mm512_add_ps(cur, nbr);
+          acc = _mm512_sub_ps(acc, _mm512_mul_ps(coeff, sum));
+          _mm512_store_ps(dp, acc);
+          sp += 16;
+          dp += 16;
+        }
+
+        // swap buffers
+        // the neighbour side alternates between steps
+        float* tp = to; to = from; from = tp;
+        ui32 tw = to_width; to_width = from_width; from_width = tw;
+        ev = !ev;
+      }
+
+      // combine both lsrc and hsrc into dst
+      {
+        float* first = even ? lsrc->f32 : hsrc->f32;
+        float* second = even ? hsrc->f32 : lsrc->f32;
+        avx512_interleave32(dst->f32, first, second, (int)width);
+      }
+    }
+
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_vert_step32(const lifting_step* s, const line_buf* sig, 
+                                const line_buf* other, const line_buf* aug, 
+                                ui32 repeat, bool synthesis)
+    { /* One vertical reversible lifting step on 32-bit integer lines.
+       * For every sample:
+       *   analysis : aug += (b + a * (sig + other)) >> e
+       *   synthesis: aug -= (b + a * (sig + other)) >> e
+       * with a, b, e read from s->rev (Aatk, Batk, Eatk).  The branches
+       * below are special cases of this formula selected by the value of a.
+       * Each loop handles 16 samples per iteration with aligned loads and
+       * stores on all three buffers, so `repeat` is effectively rounded up
+       * to a multiple of 16; assumes the buffers are 64-byte aligned and
+       * padded accordingly -- TODO confirm against the line_buf allocation.
+       */
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m512i va = _mm512_set1_epi32(a);
+      __m512i vb = _mm512_set1_epi32(b);
+
+      si32* dst = aug->i32;
+      const si32* src1 = sig->i32, * src2 = other->i32;
+      // The general definition of the wavelet in Part 2 is slightly 
+      // different to Part 1, although they are mathematically equivalent;
+      // here, we identify the simpler forms from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i v = _mm512_add_epi32(vb, t);
+            __m512i w = _mm512_srai_epi32(v, e);
+            d = _mm512_sub_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i v = _mm512_add_epi32(vb, t);
+            __m512i w = _mm512_srai_epi32(v, e);
+            d = _mm512_add_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict: (1 - t) >> 1 equals -(t >> 1), hence add/sub swap
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i w = _mm512_srai_epi32(t, e);
+            d = _mm512_add_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i w = _mm512_srai_epi32(t, e);
+            d = _mm512_sub_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i v = _mm512_sub_epi32(vb, t);
+            __m512i w = _mm512_srai_epi32(v, e);
+            d = _mm512_sub_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i v = _mm512_sub_epi32(vb, t);
+            __m512i w = _mm512_srai_epi32(v, e);
+            d = _mm512_add_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else { // general case: the only branch that needs the multiply by a
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i u = _mm512_mullo_epi32(va, t);
+            __m512i v = _mm512_add_epi32(vb, u);
+            __m512i w = _mm512_srai_epi32(v, e);
+            d = _mm512_sub_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i u = _mm512_mullo_epi32(va, t);
+            __m512i v = _mm512_add_epi32(vb, u);
+            __m512i w = _mm512_srai_epi32(v, e);
+            d = _mm512_add_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_vert_step64(const lifting_step* s, const line_buf* sig, 
+                                const line_buf* other, const line_buf* aug, 
+                                ui32 repeat, bool synthesis)
+    { /* One vertical reversible lifting step on 64-bit integer lines.
+       * For every sample:
+       *   analysis : aug += (b + a * (sig + other)) >> e
+       *   synthesis: aug -= (b + a * (sig + other)) >> e
+       * with a, b, e read from s->rev (Aatk, Batk, Eatk).  The a == 1 and
+       * a == -1 branches are vectorised, 8 samples per iteration with
+       * aligned loads and stores, so `repeat` is effectively rounded up to
+       * a multiple of 8 there; assumes the buffers are 64-byte aligned and
+       * padded accordingly -- TODO confirm against the line_buf allocation.
+       * The general case is a scalar loop over exactly `repeat` samples.
+       */
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m512i vb = _mm512_set1_epi64(b);
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly 
+      // different to Part 1, although they are mathematically equivalent;
+      // here, we identify the simpler forms from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i v = _mm512_add_epi64(vb, t);
+            __m512i w = _mm512_srai_epi64(v, e);
+            d = _mm512_sub_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i v = _mm512_add_epi64(vb, t);
+            __m512i w = _mm512_srai_epi64(v, e);
+            d = _mm512_add_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict: (1 - t) >> 1 equals -(t >> 1), hence add/sub swap
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i w = _mm512_srai_epi64(t, e);
+            d = _mm512_add_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i w = _mm512_srai_epi64(t, e);
+            d = _mm512_sub_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i v = _mm512_sub_epi64(vb, t);
+            __m512i w = _mm512_srai_epi64(v, e);
+            d = _mm512_sub_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i v = _mm512_sub_epi64(vb, t);
+            __m512i w = _mm512_srai_epi64(v, e);
+            d = _mm512_add_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else { 
+        // general case, done one sample at a time because the
+        // 64bit multiplication is not supported in AVX512F + AVX512CD;
+        // in particular, _mm512_mullo_epi64 (it needs AVX512DQ).
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
+      }
+
+      // This can only be used if you have AVX512DQ
+      // { // general case
+      //   __m512i va = _mm512_set1_epi64(a);
+      //   int i = (int)repeat;
+      //   if (synthesis)
+      //     for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+      //     {
+      //       __m512i s1 = _mm512_load_si512((__m512i*)src1);
+      //       __m512i s2 = _mm512_load_si512((__m512i*)src2);
+      //       __m512i d = _mm512_load_si512((__m512i*)dst);
+      //       __m512i t = _mm512_add_epi64(s1, s2);
+      //       __m512i u = _mm512_mullo_epi64(va, t);
+      //       __m512i v = _mm512_add_epi64(vb, u);
+      //       __m512i w = _mm512_srai_epi64(v, e);
+      //       d = _mm512_sub_epi64(d, w);
+      //       _mm512_store_si512((__m512i*)dst, d);
+      //     }
+      //   else
+      //     for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+      //     {
+      //       __m512i s1 = _mm512_load_si512((__m512i*)src1);
+      //       __m512i s2 = _mm512_load_si512((__m512i*)src2);
+      //       __m512i d = _mm512_load_si512((__m512i*)dst);
+      //       __m512i t = _mm512_add_epi64(s1, s2);
+      //       __m512i u = _mm512_mullo_epi64(va, t);
+      //       __m512i v = _mm512_add_epi64(vb, u);
+      //       __m512i w = _mm512_srai_epi64(v, e);
+      //       d = _mm512_add_epi64(d, w);
+      //       _mm512_store_si512((__m512i*)dst, d);
+      //     }
+      // }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis)
+    { /* Dispatches one vertical reversible lifting step by sample width.
+       * If any non-NULL buffer carries LFT_32BIT, the 32-bit kernel is
+       * used; otherwise the 64-bit kernel is used.  The asserts check that
+       * every non-NULL buffer agrees on the width.  All arguments are
+       * forwarded unchanged.
+       */
+      if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || 
+          ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
+          ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) 
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_32BIT) && 
+               (aug == NULL || aug->flags & line_buf::LFT_32BIT));
+        avx512_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
+      }
+      else 
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_64BIT) && 
+               (aug == NULL || aug->flags & line_buf::LFT_64BIT));
+        avx512_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, 
+                               const line_buf* hdst, const line_buf* src, 
+                               ui32 width, bool even)
+    { /* Horizontal analysis (forward lifting) on 32-bit integer samples.
+       * Splits src into ldst and hdst with avx512_deinterleave32, then
+       * applies the lifting steps of atk from the last one down to step 0;
+       * each step computes
+       *   hp[i] += (b + a * (lp[i] + lp[i +/- 1])) >> e
+       * and then swaps the roles of the two half-lines.  For width <= 1
+       * the sample at index 0 is copied to ldst (even) or doubled into
+       * hdst (odd).  `even` is a by-value parameter that is toggled after
+       * every step.
+       * The vector loops handle 16 samples per iteration with aligned
+       * accesses on sp and dp, so the count is effectively rounded up to a
+       * multiple of 16; assumes 64-byte aligned, padded buffers that also
+       * have room for the writes at index -1 and index l_width
+       * -- TODO confirm against the line_buf allocation.
+       */
+      if (width > 1)
+      {
+        /* split src into ldst and hdst; the f32 views are what the helper
+         * takes -- presumably a plain 32-bit data move, verify in helper */
+        {
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
+          float* sp  = src->f32;
+          int w = (int)width;
+          avx512_deinterleave32(dpl, dph, sp, w);
+        }        
+
+        si32* hp = hdst->i32, * lp = ldst->i32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // current lifting step (steps run from the last one to step 0)
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          __m512i va = _mm512_set1_epi32(a);
+          __m512i vb = _mm512_set1_epi32(b);
+
+          /* extension: copy the first and last samples of lp one position
+           * outward, so the shifted loads below find a neighbour there */
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step; even: neighbours lp[i], lp[i+1]; odd: lp[i], lp[i-1]
+          const si32* sp = lp;
+          si32* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_add_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_add_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict: (1 - t) >> 1 equals -(t >> 1), hence the sub
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i w = _mm512_srai_epi32(t, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i w = _mm512_srai_epi32(t, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_sub_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_sub_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+          else {
+            // general case: the only branch that needs the multiply by a
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i u = _mm512_mullo_epi32(va, t);
+                __m512i v = _mm512_add_epi32(vb, u);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i u = _mm512_mullo_epi32(va, t);
+                __m512i v = _mm512_add_epi32(vb, u);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+
+          // swap buffers: the half just updated is the source of the next step
+          si32* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+      }
+      else { // width <= 1: no filtering, only index 0 is written
+        if (even)
+          ldst->i32[0] = src->i32[0];
+        else
+          hdst->i32[0] = src->i32[0] << 1;  // high-pass sample is doubled
+      }
+    }
+    
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, 
+                               const line_buf* hdst, const line_buf* src, 
+                               ui32 width, bool even)
+    { /* Horizontal analysis (forward lifting) on 64-bit integer samples.
+       * Splits src into ldst and hdst with avx512_deinterleave64, then
+       * applies the lifting steps of atk from the last one down to step 0;
+       * each step computes
+       *   hp[i] += (b + a * (lp[i] + lp[i +/- 1])) >> e
+       * and then swaps the roles of the two half-lines.  For width <= 1
+       * the sample at index 0 is copied to ldst (even) or doubled into
+       * hdst (odd).  `even` is a by-value parameter that is toggled after
+       * every step.
+       * The a == 1 and a == -1 branches are vectorised, 8 samples per
+       * iteration with aligned accesses on sp and dp, so the count is
+       * effectively rounded up to a multiple of 8 there; assumes 64-byte
+       * aligned, padded buffers that also have room for the writes at
+       * index -1 and index l_width -- TODO confirm against the line_buf
+       * allocation.  The general case is a scalar loop over h_width.
+       */
+      if (width > 1)
+      {
+        /* split src into ldst and hdst; the double views are what the helper
+         * takes -- presumably a plain 64-bit data move, verify in helper */
+        {
+          double* dpl = (double*)(even ? ldst->p : hdst->p);
+          double* dph = (double*)(even ? hdst->p : ldst->p);
+          double* sp  = (double*)(src->p);
+          int w = (int)width;
+          avx512_deinterleave64(dpl, dph, sp, w);
+        }        
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // current lifting step (steps run from the last one to step 0)
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          __m512i vb = _mm512_set1_epi64(b);
+
+          /* extension: copy the first and last samples of lp one position
+           * outward, so the shifted loads below find a neighbour there */
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step; even: neighbours lp[i], lp[i+1]; odd: lp[i], lp[i-1]
+          const si64* sp = lp;
+          si64* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi64(s1, s2);
+                __m512i v = _mm512_add_epi64(vb, t);
+                __m512i w = _mm512_srai_epi64(v, e);
+                d = _mm512_add_epi64(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi64(s1, s2);
+                __m512i v = _mm512_add_epi64(vb, t);
+                __m512i w = _mm512_srai_epi64(v, e);
+                d = _mm512_add_epi64(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict: (1 - t) >> 1 equals -(t >> 1), hence the sub
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi64(s1, s2);
+                __m512i w = _mm512_srai_epi64(t, e);
+                d = _mm512_sub_epi64(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi64(s1, s2);
+                __m512i w = _mm512_srai_epi64(t, e);
+                d = _mm512_sub_epi64(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi64(s1, s2);
+                __m512i v = _mm512_sub_epi64(vb, t);
+                __m512i w = _mm512_srai_epi64(v, e);
+                d = _mm512_add_epi64(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi64(s1, s2);
+                __m512i v = _mm512_sub_epi64(vb, t);
+                __m512i w = _mm512_srai_epi64(v, e);
+                d = _mm512_add_epi64(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+          else 
+          {
+            // general case, done one sample at a time because the
+            // 64bit multiplication is not supported in AVX512F + AVX512CD;
+            // in particular, _mm512_mullo_epi64 (it needs AVX512DQ).
+            if (even)
+              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+                *dp += (b + a * (sp[0] + sp[1])) >> e;
+            else
+              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+                *dp += (b + a * (sp[-1] + sp[0])) >> e;
+          }
+
+          // This can only be used if you have AVX512DQ
+          // {
+          //   // general case
+          //   __m512i va = _mm512_set1_epi64(a);
+          //   int i = (int)h_width;
+          //   if (even)
+          //     for (; i > 0; i -= 8, sp += 8, dp += 8)
+          //     {
+          //       __m512i s1 = _mm512_load_si512((__m512i*)sp);
+          //       __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+          //       __m512i d = _mm512_load_si512((__m512i*)dp);
+          //       __m512i t = _mm512_add_epi64(s1, s2);
+          //       __m512i u = _mm512_mullo_epi64(va, t);
+          //       __m512i v = _mm512_add_epi64(vb, u);
+          //       __m512i w = _mm512_srai_epi64(v, e);
+          //       d = _mm512_add_epi64(d, w);
+          //       _mm512_store_si512((__m512i*)dp, d);
+          //     }
+          //   else
+          //     for (; i > 0; i -= 8, sp += 8, dp += 8)
+          //     {
+          //       __m512i s1 = _mm512_load_si512((__m512i*)sp);
+          //       __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+          //       __m512i d = _mm512_load_si512((__m512i*)dp);
+          //       __m512i t = _mm512_add_epi64(s1, s2);
+          //       __m512i u = _mm512_mullo_epi64(va, t);
+          //       __m512i v = _mm512_add_epi64(vb, u);
+          //       __m512i w = _mm512_srai_epi64(v, e);
+          //       d = _mm512_add_epi64(d, w);
+          //       _mm512_store_si512((__m512i*)dp, d);
+          //     }
+          // }
+
+          // swap buffers: the half just updated is the source of the next step
+          si64* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+      }
+      else { // width <= 1: no filtering, only index 0 is written
+        if (even)
+          ldst->i64[0] = src->i64[0];
+        else
+          hdst->i64[0] = src->i64[0] << 1;  // high-pass sample is doubled
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even)
+    { /* Dispatches reversible horizontal analysis by sample width.
+       * The LFT_32BIT flag of src selects the 32-bit kernel; otherwise the
+       * 64-bit kernel is used.  The asserts check that the non-NULL
+       * destination buffers carry the matching flag.  All arguments are
+       * forwarded unchanged.
+       * src is dereferenced unconditionally below, so it must be non-NULL
+       * even though the second assert tolerates a NULL src.
+       */
+      if (src->flags & line_buf::LFT_32BIT) 
+      {
+        assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) &&
+               (hdst == NULL || hdst->flags & line_buf::LFT_32BIT));
+        avx512_rev_horz_ana32(atk, ldst, hdst, src, width, even);
+      }
+      else 
+      {
+        assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) &&
+               (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && 
+               (src == NULL || src->flags & line_buf::LFT_64BIT));
+        avx512_rev_horz_ana64(atk, ldst, hdst, src, width, even);
+      }
+    } 
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx512_rev_horz_syn32(const param_atk* atk, const line_buf* dst, 
+                               const line_buf* lsrc, const line_buf* hsrc, 
+                               ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        bool ev = even;
+        si32* oth = hsrc->i32, * aug = lsrc->i32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          __m512i va = _mm512_set1_epi32(a);
+          __m512i vb = _mm512_set1_epi32(b);
+
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const si32* sp = oth;
+          si32* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)aug_width;
+            if (ev)
+            {
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_add_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_add_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i w = _mm512_srai_epi32(t, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i w = _mm512_srai_epi32(t, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_sub_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_sub_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+          else {
+            // general case
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i u = _mm512_mullo_epi32(va, t);
+                __m512i v = _mm512_add_epi32(vb, u);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i u = _mm512_mullo_epi32(va, t);
+                __m512i v = _mm512_add_epi32(vb, u);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+
+          // swap buffers
+          si32* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        {
+          float* dp  = dst->f32;
+          float* spl = even ? lsrc->f32 : hsrc->f32;
+          float* sph = even ? hsrc->f32 : lsrc->f32;
+          int w = (int)width;
+          avx512_interleave32(dp, spl, sph, w);
+        }          
+      }
+      else {
+        if (even)
+          dst->i32[0] = lsrc->i32[0];
+        else
+          dst->i32[0] = hsrc->i32[0] >> 1;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx512_rev_horz_syn64(const param_atk* atk, const line_buf* dst, 
+                               const line_buf* lsrc, const line_buf* hsrc, 
+                               ui32 width, bool even)
+    { // reversible horizontal synthesis on si64 samples: lift, then interleave
+      if (width > 1) // width == aug_width + oth_width, computed below
+      {
+        bool ev = even; // toggled after every lifting step
+        si64* oth = hsrc->i64, * aug = lsrc->i64; // aug is modified from oth
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk; // multiplier of the neighbor sum
+          const si32 b = s->rev.Batk; // offset added before the shift
+          const ui8 e = s->rev.Eatk;  // arithmetic right-shift amount
+          __m512i vb = _mm512_set1_epi64(b); // b in each of the 8 lanes
+
+          // extension: replicate the edge samples just outside oth[0..width)
+          oth[-1] = oth[0]; // assumes a spare slot before oth -- TODO confirm
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step: aug[i] -= (b + a * (oth[i] + oth[i -/+ 1])) >> e
+          const si64* sp = oth;
+          si64* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)aug_width; // samples left; 8 are handled per pass
+            if (ev)
+            {
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              { // NOTE(review): the last pass may touch lanes past aug_width
+                __m512i s1 = _mm512_load_si512((__m512i*)sp); // 64B-aligned
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); // left
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi64(s1, s2);
+                __m512i v = _mm512_add_epi64(vb, t);
+                __m512i w = _mm512_srai_epi64(v, e);
+                d = _mm512_sub_epi64(d, w); // d -= (b + s1 + s2) >> e
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); // right
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi64(s1, s2);
+                __m512i v = _mm512_add_epi64(vb, t);
+                __m512i w = _mm512_srai_epi64(v, e);
+                d = _mm512_sub_epi64(d, w); // d -= (b + s1 + s2) >> e
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict: -((1 - t) >> 1) == t >> 1, hence the add below
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); // left
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi64(s1, s2);
+                __m512i w = _mm512_srai_epi64(t, e);
+                d = _mm512_add_epi64(d, w); // d += (s1 + s2) >> 1
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); // right
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi64(s1, s2);
+                __m512i w = _mm512_srai_epi64(t, e);
+                d = _mm512_add_epi64(d, w); // d += (s1 + s2) >> 1
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); // left
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi64(s1, s2);
+                __m512i v = _mm512_sub_epi64(vb, t); // multiply by -1 as b - t
+                __m512i w = _mm512_srai_epi64(v, e);
+                d = _mm512_sub_epi64(d, w); // d -= (b - (s1 + s2)) >> e
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); // right
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi64(s1, s2);
+                __m512i v = _mm512_sub_epi64(vb, t); // multiply by -1 as b - t
+                __m512i w = _mm512_srai_epi64(v, e);
+                d = _mm512_sub_epi64(d, w); // d -= (b - (s1 + s2)) >> e
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+          else 
+           {
+            // general case
+            // 64bit multiplication is not supported in AVX512F + AVX512CD;
+            // _mm512_mullo_epi64 needs AVX512DQ, so this path is scalar.
+            if (ev)
+              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+                *dp -= (b + a * (sp[-1] + sp[0])) >> e; // exactly aug_width
+            else
+              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+                *dp -= (b + a * (sp[0] + sp[1])) >> e; // samples are touched
+          }
+
+          // This can only be used if you have AVX512DQ
+          // {
+          //   // general case
+          //   __m512i va = _mm512_set1_epi64(a);
+          //   int i = (int)aug_width;
+          //   if (ev)
+          //     for (; i > 0; i -= 8, sp += 8, dp += 8)
+          //     {
+          //       __m512i s1 = _mm512_load_si512((__m512i*)sp);
+          //       __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+          //       __m512i d = _mm512_load_si512((__m512i*)dp);
+          //       __m512i t = _mm512_add_epi64(s1, s2);
+          //       __m512i u = _mm512_mullo_epi64(va, t);
+          //       __m512i v = _mm512_add_epi64(vb, u);
+          //       __m512i w = _mm512_srai_epi64(v, e);
+          //       d = _mm512_sub_epi64(d, w);
+          //       _mm512_store_si512((__m512i*)dp, d);
+          //     }
+          //   else
+          //     for (; i > 0; i -= 8, sp += 8, dp += 8)
+          //     {
+          //       __m512i s1 = _mm512_load_si512((__m512i*)sp);
+          //       __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+          //       __m512i d = _mm512_load_si512((__m512i*)dp);
+          //       __m512i t = _mm512_add_epi64(s1, s2);
+          //       __m512i u = _mm512_mullo_epi64(va, t);
+          //       __m512i v = _mm512_add_epi64(vb, u);
+          //       __m512i w = _mm512_srai_epi64(v, e);
+          //       d = _mm512_sub_epi64(d, w);
+          //       _mm512_store_si512((__m512i*)dp, d);
+          //     }
+          // }
+
+          // swap buffers: the line just updated feeds the next lifting step
+          si64* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        {
+          double* dp  = (double*)(dst->p); // presumably cast only to match
+          double* spl = (double*)(even ? lsrc->p : hsrc->p); // the helper's
+          double* sph = (double*)(even ? hsrc->p : lsrc->p); // pointer type
+          int w = (int)width;
+          avx512_interleave64(dp, spl, sph, w); // lsrc goes first when even
+        }          
+      }
+      else { // width <= 1: no lifting steps, only dst[0] is written
+        if (even)
+          dst->i64[0] = lsrc->i64[0];
+        else
+          dst->i64[0] = hsrc->i64[0] >> 1; // lone high-pass sample is halved
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
+                             const line_buf* lsrc, const line_buf* hsrc, 
+                             ui32 width, bool even)
+    { // picks the 32-bit or 64-bit synthesis kernel from dst->flags
+      if (dst->flags & line_buf::LFT_32BIT) // dst is dereferenced here
+      {
+        assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && // any
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); // source
+        avx512_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); // is 32-bit
+      }
+      else // every buffer that is present is expected to be 64-bit
+      {
+        assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && // NOTE(review): dst was already dereferenced above
+               (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && 
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
+        avx512_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
+      }
+    }
+
+  } // !local
+} // !ojph
+
+#endif
diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h
index 2bf041c8..acf9ee6d 100644
--- a/src/core/transform/ojph_transform_local.h
+++ b/src/core/transform/ojph_transform_local.h
@@ -42,16 +42,13 @@
 #include "ojph_defs.h"
 
 namespace ojph {
-  struct line_buf;
-  namespace local {
 
-    //////////////////////////////////////////////////////////////////////////
-    struct LIFTING_FACTORS
-    {
-      static const float steps[8];
-      static const float K;
-      static const float K_inv;
-    };
+  // defined elsewhere
+  class line_buf;
+
+  namespace local {
+    struct param_atk;
+    union lifting_step;
 
     //////////////////////////////////////////////////////////////////////////
     //
@@ -62,56 +59,45 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    // Reversible functions
+    // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_fwd_predict(const line_buf* src1,
-                                       const line_buf* src2,
-                                       line_buf *dst, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void gen_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis);
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_fwd_update(const line_buf* src1,
-                                      const line_buf* src2,
-                                      line_buf *dst, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void gen_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_wvlt_fwd_tx(line_buf* src, line_buf *ldst,
-                                  line_buf *hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void gen_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even);
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_bwd_predict(const line_buf* src1,
-                                       const line_buf* src2,
-                                       line_buf *dst, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void gen_irv_horz_syn(const param_atk *atk, const line_buf* dst, 
+                          const line_buf *lsrc, const line_buf *hsrc, 
+                          ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_bwd_update(const line_buf* src1,
-                                      const line_buf* src2,
-                                      line_buf *dst, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_wvlt_bwd_tx(line_buf* dst, line_buf *lsrc,
-                                  line_buf *hsrc, ui32 width, bool even);
-
-    //////////////////////////////////////////////////////////////////////////
-    // Irreversible functions
-    //////////////////////////////////////////////////////////////////////////
-
+    // Reversible functions
     //////////////////////////////////////////////////////////////////////////
-    void gen_irrev_vert_wvlt_step(const line_buf* src1, const line_buf* src2,
-                                  line_buf *dst, int step_num, ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_irrev_vert_wvlt_K(const line_buf *src, line_buf *dst,
-                               bool L_analysis_or_H_synthesis, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void gen_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis);
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_irrev_horz_wvlt_fwd_tx(line_buf* src, line_buf *ldst,
-                                    line_buf *hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even);
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_irrev_horz_wvlt_bwd_tx(line_buf* src, line_buf *ldst,
-                                    line_buf *hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
+                          const line_buf* lsrc, const line_buf* hsrc, 
+                          ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
     //
@@ -125,21 +111,23 @@ namespace ojph {
     // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse_irrev_vert_wvlt_step(const line_buf* src1, const line_buf* src2,
-                                  line_buf *dst, int step_num, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void sse_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis);
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse_irrev_vert_wvlt_K(const line_buf *src, line_buf *dst,
-                               bool L_analysis_or_H_synthesis, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void sse_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse_irrev_horz_wvlt_fwd_tx(line_buf* src, line_buf *ldst,
-                                    line_buf *hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void sse_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even);
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse_irrev_horz_wvlt_bwd_tx(line_buf* src, line_buf *ldst,
-                                    line_buf *hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void sse_irv_horz_syn(const param_atk *atk, const line_buf* dst,
+                          const line_buf *lsrc, const line_buf *hsrc, 
+                          ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
     //
@@ -153,33 +141,20 @@ namespace ojph {
     // Reversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_fwd_predict(const line_buf* src1,
-                                        const line_buf* src2,
-                                        line_buf *dst, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis);
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_fwd_update(const line_buf* src1,
-                                       const line_buf* src2,
-                                       line_buf *dst, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even);
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_horz_wvlt_fwd_tx(line_buf* src, line_buf *ldst,
-                                   line_buf *hdst, ui32 width, bool even);
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_bwd_predict(const line_buf* src1,
-                                        const line_buf* src2,
-                                        line_buf *dst, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_bwd_update(const line_buf* src1,
-                                       const line_buf* src2,
-                                       line_buf *dst, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_horz_wvlt_bwd_tx(line_buf* dst, line_buf *lsrc,
-                                   line_buf *hsrc, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even);
 
 
     //////////////////////////////////////////////////////////////////////////
@@ -194,21 +169,23 @@ namespace ojph {
     // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx_irrev_vert_wvlt_step(const line_buf* src1, const line_buf* src2,
-                                  line_buf *dst, int step_num, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void avx_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis);
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx_irrev_vert_wvlt_K(const line_buf *src, line_buf *dst,
-                               bool L_analysis_or_H_synthesis, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void avx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx_irrev_horz_wvlt_fwd_tx(line_buf* src, line_buf *ldst,
-                                    line_buf *hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void avx_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even);
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx_irrev_horz_wvlt_bwd_tx(line_buf* src, line_buf *ldst,
-                                    line_buf *hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void avx_irv_horz_syn(const param_atk *atk, const line_buf* dst,
+                          const line_buf *lsrc, const line_buf *hsrc, 
+                          ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
     //
@@ -222,97 +199,119 @@ namespace ojph {
     // Reversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_fwd_predict(const line_buf* src1,
-                                        const line_buf* src2,
-                                        line_buf *dst, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_fwd_update(const line_buf* src1,
-                                       const line_buf* src2,
-                                       line_buf *dst, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_horz_wvlt_fwd_tx(line_buf* src, line_buf *ldst,
-                                   line_buf *hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis);
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_bwd_predict(const line_buf* src1,
-                                        const line_buf* src2,
-                                        line_buf *dst, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even);
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_bwd_update(const line_buf* src1,
-                                       const line_buf* src2,
-                                       line_buf *dst, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_horz_wvlt_bwd_tx(line_buf* dst, line_buf *lsrc,
-                                   line_buf *hsrc, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
     //
     //
-    //                          WASM Functions
+    //                        AVX512 Functions
     //
     //
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    // Reversible functions
+    // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_fwd_predict(const line_buf *line_src1, 
-                                        const line_buf *line_src2,
-                                        line_buf *line_dst, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis);
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_fwd_update(const line_buf *line_src1, 
-                                       const line_buf *line_src2,
-                                       line_buf *line_dst, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, 
-                                   line_buf *line_hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even);
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_irv_horz_syn(const param_atk *atk, const line_buf* dst,
+                             const line_buf *lsrc, const line_buf *hsrc, 
+                             ui32 width, bool even);
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_bwd_predict(const line_buf *line_src1, 
-                                        const line_buf *line_src2,
-                                        line_buf *line_dst, ui32 repeat);
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_bwd_update(const line_buf *line_src1, 
-                                       const line_buf *line_src2,
-                                       line_buf *line_dst, ui32 repeat);
+    // Reversible functions
+    //////////////////////////////////////////////////////////////////////////
 
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis);
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even);
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                             const line_buf* lsrc, const line_buf* hsrc, 
+                             ui32 width, bool even);
+
+    //////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //                          WASM Functions
+    //
+    //
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc, 
-                                   line_buf *line_hsrc, ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_vert_wvlt_step(const line_buf* line_src1, 
-                                   const line_buf* line_src2,
-                                   line_buf *line_dst, int step_num, 
-                                   ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis);
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_vert_wvlt_K(const line_buf *line_src, line_buf *line_dst,
-                                bool L_analysis_or_H_synthesis, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, 
-                                     line_buf *line_hdst, ui32 width, 
-                                     bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even);
+
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_irv_horz_syn(const param_atk *atk, const line_buf* dst,
+                           const line_buf *lsrc, const line_buf *hsrc, 
+                           ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_horz_wvlt_bwd_tx(line_buf *line_src, line_buf *line_ldst, 
-                                     line_buf *line_hdst, ui32 width, 
-                                     bool even);
+    // Reversible functions
+    //////////////////////////////////////////////////////////////////////////
+
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig,
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis);
+
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even);
+
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even);
   }
 }
 
diff --git a/src/core/transform/ojph_transform_sse.cpp b/src/core/transform/ojph_transform_sse.cpp
index c299bc8d..aaa8c327 100644
--- a/src/core/transform/ojph_transform_sse.cpp
+++ b/src/core/transform/ojph_transform_sse.cpp
@@ -35,281 +35,253 @@
 // Date: 28 August 2019
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
+
 #include <cstdio>
+#include <xmmintrin.h>
 
 #include "ojph_defs.h"
-#include "ojph_arch.h"
 #include "ojph_mem.h"
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
+
 #include "ojph_transform.h"
 #include "ojph_transform_local.h"
 
-#include <immintrin.h>
-
 namespace ojph {
   namespace local {
 
     //////////////////////////////////////////////////////////////////////////
-    void sse_irrev_vert_wvlt_step(const line_buf* line_src1,
-                                  const line_buf* line_src2,
-                                  line_buf *line_dst,
-                                  int step_num, ui32 repeat)
+    static inline
+    void sse_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    { // split sp: even-indexed samples go to dpl, odd-indexed ones to dph
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      { // 8 inputs per pass, so width is effectively rounded up to 8
+        __m128 a = _mm_load_ps(sp);      // sp[0..3]; aligned load
+        __m128 b = _mm_load_ps(sp + 4);  // sp[4..7]
+        __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); // 0,2,4,6
+        __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); // 1,3,5,7
+        _mm_store_ps(dpl, c);
+        _mm_store_ps(dph, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse_interleave32(float* dp, float* spl, float* sph, int width)
+    { // merge two half-lines: dp[2k] = spl[k], dp[2k + 1] = sph[k]
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      { // 8 outputs per pass, so width is effectively rounded up to 8
+        __m128 a = _mm_load_ps(spl);      // spl[0..3]; aligned load
+        __m128 b = _mm_load_ps(sph);      // sph[0..3]
+        __m128 c = _mm_unpacklo_ps(a, b); // a0, b0, a1, b1
+        __m128 d = _mm_unpackhi_ps(a, b); // a2, b2, a3, b3
+        _mm_store_ps(dp, c);
+        _mm_store_ps(dp + 4, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline void sse_multiply_const(float* p, float f, int width)
+    { // in-place scaling: p[k] *= f
+      __m128 factor = _mm_set1_ps(f);  // f replicated into all four lanes
+      for (; width > 0; width -= 4, p += 4)
+      { // 4 floats per pass, so width is effectively rounded up to 4
+        __m128 s = _mm_load_ps(p);     // aligned load
+        _mm_store_ps(p, _mm_mul_ps(factor, s));
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis)
     {
-      float *dst = line_dst->f32;
-      const float *src1 = line_src1->f32, *src2 = line_src2->f32;
+      float a = s->irv.Aatk;  // coefficient of this lifting step
+      if (synthesis)
+        a = -a;  // synthesis applies the negated coefficient
+
+      __m128 factor = _mm_set1_ps(a);  // a replicated into all four lanes
 
-      __m128 factor = _mm_set1_ps(LIFTING_FACTORS::steps[step_num]);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
+      float* dst = aug->f32;  // updated in place: aug += a * (sig + other)
+      const float* src1 = sig->f32, * src2 = other->f32;
+      int i = (int)repeat;  // 4 floats per pass; rounded up to 4
+      for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
       {
         __m128 s1 = _mm_load_ps(src1);
         __m128 s2 = _mm_load_ps(src2);
-        __m128 d = _mm_load_ps(dst);
+        __m128 d  = _mm_load_ps(dst);  // aligned load, like s1 and s2
         d = _mm_add_ps(d, _mm_mul_ps(factor, _mm_add_ps(s1, s2)));
         _mm_store_ps(dst, d);
       }
     }
 
-    /////////////////////////////////////////////////////////////////////////
-    void sse_irrev_vert_wvlt_K(const line_buf* line_src, line_buf* line_dst,
-                               bool L_analysis_or_H_synthesis, ui32 repeat)
+    //////////////////////////////////////////////////////////////////////////
+    void sse_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
     {
-      float *dst = line_dst->f32;
-      const float *src = line_src->f32;
-
-      float f = LIFTING_FACTORS::K_inv;
-      f = L_analysis_or_H_synthesis ? f : LIFTING_FACTORS::K;
-      __m128 factor = _mm_set1_ps(f);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src+=4)
-      {
-        __m128 s = _mm_load_ps(src);
-        _mm_store_ps(dst, _mm_mul_ps(factor, s));
-      }
+      sse_multiply_const(aug->f32, K, (int)repeat);  // aug *= K, in place
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void sse_irrev_horz_wvlt_fwd_tx(line_buf* line_src, line_buf *line_ldst,
-                                    line_buf *line_hdst, ui32 width,
-                                    bool even)
+    void sse_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even)
     {
       if (width > 1)
       {
-        float *src = line_src->f32;
-        float *ldst = line_ldst->f32, *hdst = line_hdst->f32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        //extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        const float* sp = src + (even ? 1 : 0);
-        float *dph = hdst;
-        __m128 factor = _mm_set1_ps(LIFTING_FACTORS::steps[0]);
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
-        { //this is doing twice the work it needs to do
-          //it can be definitely written better
-          __m128 s1 = _mm_loadu_ps(sp - 1);
-          __m128 s2 = _mm_loadu_ps(sp + 1);
-          __m128 d = _mm_loadu_ps(sp);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          __m128 d1 = _mm_add_ps(d, s1);
-          sp += 4;
-          s1 = _mm_loadu_ps(sp - 1);
-          s2 = _mm_loadu_ps(sp + 1);
-          d = _mm_loadu_ps(sp);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          __m128 d2 = _mm_add_ps(d, s1);
-          sp += 4;
-          d = _mm_shuffle_ps(d1, d2, _MM_SHUFFLE(2, 0, 2, 0));
-          _mm_store_ps(dph, d);
-        }
-
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        factor = _mm_set1_ps(LIFTING_FACTORS::steps[1]);
-        sp = src + (even ? 0 : 1);
-        const float* sph = hdst + (even ? 0 : 1);
-        float *dpl = ldst;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
+        // split src into ldst and hdst; src[0] lands in ldst iff even
         {
-          __m128 s1 = _mm_loadu_ps(sph - 1);
-          __m128 s2 = _mm_loadu_ps(sph);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          __m128 d1 = _mm_loadu_ps(sp);
-          __m128 d2 = _mm_loadu_ps(sp + 4);
-          __m128 d = _mm_shuffle_ps(d1, d2, _MM_SHUFFLE(2, 0, 2, 0));
-          d = _mm_add_ps(d, s1);
-          _mm_store_ps(dpl, d);
+          float* dpl = even ? ldst->f32 : hdst->f32;  // gets src[0], src[2]..
+          float* dph = even ? hdst->f32 : ldst->f32;  // gets src[1], src[3]..
+          float* sp = src->f32;
+          int w = (int)width;
+          sse_deinterleave32(dpl, dph, sp, w);
         }
 
-        //extension
-        ldst[-1] = ldst[0];
-        ldst[L_width] = ldst[L_width-1];
-        //predict
-        factor = _mm_set1_ps(LIFTING_FACTORS::steps[2]);
-        const float* spl = ldst + (even ? 1 : 0);
-        dph = hdst;
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, spl+=4, dph+=4)
+        // the actual horizontal transform
+        float* hp = hdst->f32, * lp = ldst->f32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)  // steps run from last to first
         {
-          __m128 s1 = _mm_loadu_ps(spl - 1);
-          __m128 s2 = _mm_loadu_ps(spl);
-          __m128 d = _mm_loadu_ps(dph);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          d = _mm_add_ps(d, s1);
-          _mm_store_ps(dph, d);
-        }
+          const lifting_step* s = atk->get_step(j - 1);
+          const float a = s->irv.Aatk;
 
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        factor = _mm_set1_ps(LIFTING_FACTORS::steps[3]);
-        sph = hdst + (even ? 0 : 1);
-        dpl = ldst;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, dpl+=4)
-        {
-          __m128 s1 = _mm_loadu_ps(sph - 1);
-          __m128 s2 = _mm_loadu_ps(sph);
-          __m128 d = _mm_loadu_ps(dpl);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          d = _mm_add_ps(d, s1);
-          _mm_store_ps(dpl, d);
-        }
+          // extension: copy the edge samples to lp[-1] and lp[l_width]
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step: hp[i] += a * (lp[i] + lp[i +/- 1])
+          const float* sp = lp;
+          float* dp = hp;
+          int i = (int)h_width;  // 4 floats per pass; rounded up to 4
+          __m128 f = _mm_set1_ps(a);
+          if (even)
+          {
+            for (; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              __m128 m = _mm_load_ps(sp);
+              __m128 n = _mm_loadu_ps(sp + 1);  // unaligned: lp[i + 1]
+              __m128 p = _mm_load_ps(dp);
+              p = _mm_add_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
+              _mm_store_ps(dp, p);
+            }
+          }
+          else
+          {
+            for (; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              __m128 m = _mm_load_ps(sp);
+              __m128 n = _mm_loadu_ps(sp - 1);  // unaligned: lp[i - 1]
+              __m128 p = _mm_load_ps(dp);
+              p = _mm_add_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
+              _mm_store_ps(dp, p);
+            }
+          }
 
-        //multipliers
-        float *dp = ldst;
-        factor = _mm_set1_ps(LIFTING_FACTORS::K_inv);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
-        {
-          __m128 d = _mm_load_ps(dp);
-          _mm_store_ps(dp, _mm_mul_ps(factor, d));
+          // swap buffers: the next step updates what was just read
+          float* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
         }
-        dp = hdst;
-        factor = _mm_set1_ps(LIFTING_FACTORS::K);
-        for (int i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
-        {
-          __m128 d = _mm_load_ps(dp);
-          _mm_store_ps(dp, _mm_mul_ps(factor, d));
+
+        { // multiply by K or 1/K
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          sse_multiply_const(lp, K_inv, (int)l_width);
+          sse_multiply_const(hp, K, (int)h_width);
         }
       }
-      else
-      {
+      else { // width <= 1: no lifting; the sample is copied or doubled
         if (even)
-          line_ldst->f32[0] = line_src->f32[0];
+          ldst->f32[0] = src->f32[0];
         else
-          line_hdst->f32[0] = line_src->f32[0] + line_src->f32[0];
+          hdst->f32[0] = src->f32[0] * 2.0f;
       }
     }
-
-    /////////////////////////////////////////////////////////////////////////
-    void sse_irrev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
-                                    line_buf *line_hsrc, ui32 width,
-                                    bool even)
+    
+    //////////////////////////////////////////////////////////////////////////
+    void sse_irv_horz_syn(const param_atk* atk, const line_buf* dst, 
+                          const line_buf* lsrc, const line_buf* hsrc, 
+                          ui32 width, bool even)
     {
       if (width > 1)
       {
-        float *lsrc = line_lsrc->f32, *hsrc = line_hsrc->f32;
-        float *dst = line_dst->f32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
+        bool ev = even;  // working copy; even is read again when combining
+        float* oth = hsrc->f32, * aug = lsrc->f32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
 
-        //multipliers
-        float *dp = lsrc;
-        __m128 factor = _mm_set1_ps(LIFTING_FACTORS::K);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
-        {
-          __m128 d = _mm_load_ps(dp);
-          _mm_store_ps(dp, _mm_mul_ps(factor, d));
-        }
-        dp = hsrc;
-        factor = _mm_set1_ps(LIFTING_FACTORS::K_inv);
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
-        {
-          __m128 d = _mm_load_ps(dp);
-          _mm_store_ps(dp, _mm_mul_ps(factor, d));
+        { // multiply by K or 1/K
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          sse_multiply_const(aug, K, (int)aug_width);
+          sse_multiply_const(oth, K_inv, (int)oth_width);
         }
 
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        factor = _mm_set1_ps(LIFTING_FACTORS::steps[7]);
-        const float *sph = hsrc + (even ? 0 : 1);
-        float *dpl = lsrc;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
+        // the actual horizontal transform
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)  // steps run from first to last
         {
-          __m128 s1 = _mm_loadu_ps(sph - 1);
-          __m128 s2 = _mm_loadu_ps(sph);
-          __m128 d = _mm_loadu_ps(dpl);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          d = _mm_add_ps(d, s1);
-          _mm_store_ps(dpl, d);
-        }
+          const lifting_step* s = atk->get_step(j);
+          const float a = s->irv.Aatk;
 
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict
-        factor = _mm_set1_ps(LIFTING_FACTORS::steps[6]);
-        const float *spl = lsrc + (even ? 0 : -1);
-        float *dph = hsrc;
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4, spl+=4)
-        {
-          __m128 s1 = _mm_loadu_ps(spl);
-          __m128 s2 = _mm_loadu_ps(spl + 1);
-          __m128 d = _mm_loadu_ps(dph);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          d = _mm_add_ps(d, s1);
-          _mm_store_ps(dph, d);
-        }
+          // extension: copy the edge samples to oth[-1] and oth[oth_width]
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step: aug[i] -= a * (oth[i] + oth[i -/+ 1])
+          const float* sp = oth;
+          float* dp = aug;
+          int i = (int)aug_width;  // 4 floats per pass; rounded up to 4
+          __m128 f = _mm_set1_ps(a);
+          if (ev)
+          {
+            for ( ; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              __m128 m = _mm_load_ps(sp);
+              __m128 n = _mm_loadu_ps(sp - 1);  // unaligned: oth[i - 1]
+              __m128 p = _mm_load_ps(dp);
+              p = _mm_sub_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
+              _mm_store_ps(dp, p);
+            }
+          }
+          else
+          {
+            for ( ; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              __m128 m = _mm_load_ps(sp);
+              __m128 n = _mm_loadu_ps(sp + 1);  // unaligned: oth[i + 1]
+              __m128 p = _mm_load_ps(dp);
+              p = _mm_sub_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
+              _mm_store_ps(dp, p);
+            }
+          }
 
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        factor = _mm_set1_ps(LIFTING_FACTORS::steps[5]);
-        sph = hsrc + (even ? 0 : 1);
-        dpl = lsrc;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
-        {
-          __m128 s1 = _mm_loadu_ps(sph - 1);
-          __m128 s2 = _mm_loadu_ps(sph);
-          __m128 d = _mm_loadu_ps(dpl);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          d = _mm_add_ps(d, s1);
-          _mm_store_ps(dpl, d);
+          // swap buffers: the next step updates what was just read
+          float* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
         }
 
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict and combine
-        factor = _mm_set1_ps(LIFTING_FACTORS::steps[4]);
-        dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        ui32 width = L_width + (even ? 0 : 1);
-        for (ui32 i = (width + 3) >> 2; i > 0; --i, spl+=4, sph+=4, dp+=8)
+        // combine both lsrc and hsrc into dst; dst[0] is lsrc[0] iff even
         {
-          __m128 s1 = _mm_loadu_ps(spl);
-          __m128 s2 = _mm_loadu_ps(spl + 1);
-          __m128 d = _mm_load_ps(sph);
-          s2 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          d = _mm_add_ps(d, s2);
-          _mm_storeu_ps(dp, _mm_unpacklo_ps(s1, d));
-          _mm_storeu_ps(dp + 4, _mm_unpackhi_ps(s1, d));
+          float* dp = dst->f32;
+          float* spl = even ? lsrc->f32 : hsrc->f32;  // feeds dst[0], dst[2]..
+          float* sph = even ? hsrc->f32 : lsrc->f32;  // feeds dst[1], dst[3]..
+          int w = (int)width;
+          sse_interleave32(dp, spl, sph, w);
         }
       }
-      else
-      {
+      else { // width <= 1: no lifting; the sample is copied or halved
         if (even)
-          line_dst->f32[0] = line_lsrc->f32[0];
+          dst->f32[0] = lsrc->f32[0];
         else
-          line_dst->f32[0] = line_hsrc->f32[0] * 0.5f;
+          dst->f32[0] = hsrc->f32[0] * 0.5f;
       }
     }
-  }
-}
+
+  } // !local
+} // !ojph
+
+#endif
diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp
index a607441a..54770ff6 100644
--- a/src/core/transform/ojph_transform_sse2.cpp
+++ b/src/core/transform/ojph_transform_sse2.cpp
@@ -35,224 +35,972 @@
 // Date: 28 August 2019
 //***************************************************************************/
 
+#include "ojph_arch.h"
+#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
+
+#include <climits>
 #include <cstdio>
 
 #include "ojph_defs.h"
-#include "ojph_arch.h"
 #include "ojph_mem.h"
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
+
 #include "ojph_transform.h"
 #include "ojph_transform_local.h"
 
-#include <immintrin.h>
+#include <emmintrin.h>
 
 namespace ojph {
   namespace local {
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_fwd_predict(const line_buf* line_src1,
-                                        const line_buf* line_src2,
-                                        line_buf *line_dst, ui32 repeat)
+    /////////////////////////////////////////////////////////////////////////
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m) 
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
+      // note that m must be obtained using
+      // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
+      __m128i x = _mm_srli_epi64(a, amt);   // logical shift, zero-filled
+      x = _mm_xor_si128(x, m);              // flip the shifted-down sign bit
+      __m128i result = _mm_sub_epi64(x, m); // subtracting m sign-extends it
+      return result;
+    }
 
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    { // split sp: even-indexed samples go to dpl, odd-indexed ones to dph
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
       {
-        __m128i s1 = _mm_load_si128((__m128i*)src1);
-        __m128i s2 = _mm_load_si128((__m128i*)src2);
-        __m128i d = _mm_load_si128((__m128i*)dst);
-        s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
-        d = _mm_sub_epi32(d, s1);
-        _mm_store_si128((__m128i*)dst, d);
+        __m128 a = _mm_load_ps(sp);      // sp[0..3]; aligned load
+        __m128 b = _mm_load_ps(sp + 4);  // sp[4..7]
+        __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); // 0,2,4,6
+        __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); // 1,3,5,7
+        _mm_store_ps(dpl, c);  // 8 inputs per pass; width rounds up to 8
+        _mm_store_ps(dph, d);
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_fwd_update(const line_buf* line_src1,
-                                       const line_buf* line_src2,
-                                       line_buf *line_dst, ui32 repeat)
+    static inline
+    void sse2_interleave32(float* dp, float* spl, float* sph, int width)
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-    
-      __m128i offset = _mm_set1_epi32(2);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
       {
-        __m128i s1 = _mm_load_si128((__m128i*)src1);
-        s1 = _mm_add_epi32(s1, offset);
-        __m128i s2 = _mm_load_si128((__m128i*)src2);
-        s2 = _mm_add_epi32(s2, s1);
-        __m128i d = _mm_load_si128((__m128i*)dst);
-        d = _mm_add_epi32(d, _mm_srai_epi32(s2, 2));
-        _mm_store_si128((__m128i*)dst, d);
+        __m128 a = _mm_load_ps(spl);      // spl[0..3]; aligned load
+        __m128 b = _mm_load_ps(sph);      // sph[0..3]
+        __m128 c = _mm_unpacklo_ps(a, b); // a0, b0, a1, b1
+        __m128 d = _mm_unpackhi_ps(a, b); // a2, b2, a3, b3
+        _mm_store_ps(dp, c);      // merge: dp[2k] = spl[k], dp[2k+1] = sph[k]
+        _mm_store_ps(dp + 4, d);  // 8 outputs per pass; width rounds up to 8
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst,
-                                   line_buf *line_hdst, ui32 width, bool even)
+    static inline 
+    void sse2_deinterleave64(double* dpl, double* dph, double* sp, int width)
+    { // split sp: even-indexed samples go to dpl, odd-indexed ones to dph
+      for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2)
+      { // 4 inputs per pass, so width is effectively rounded up to 4
+        __m128d a = _mm_load_pd(sp);      // sp[0..1]; aligned load
+        __m128d b = _mm_load_pd(sp + 2);  // sp[2..3]
+        __m128d c = _mm_shuffle_pd(a, b, 0);  // a0, b0 -> sp[0], sp[2]
+        __m128d d = _mm_shuffle_pd(a, b, 3);  // a1, b1 -> sp[1], sp[3]
+        _mm_store_pd(dpl, c);
+        _mm_store_pd(dph, d);
+      }
+    }    
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline 
+    void sse2_interleave64(double* dp, double* spl, double* sph, int width)
+    { // merge two half-lines: dp[2k] = spl[k], dp[2k + 1] = sph[k]
+      for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2)
+      { // 4 outputs per pass, so width is effectively rounded up to 4
+        __m128d a = _mm_load_pd(spl);      // spl[0..1]; aligned load
+        __m128d b = _mm_load_pd(sph);      // sph[0..1]
+        __m128d c = _mm_unpacklo_pd(a, b); // a0, b0
+        __m128d d = _mm_unpackhi_pd(a, b); // a1, b1
+        _mm_store_pd(dp, c);
+        _mm_store_pd(dp + 2, d);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_vert_step32(const lifting_step* s, const line_buf* sig, 
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis)
+    { // analysis: aug += (b + a * (sig + other)) >> e; synthesis subtracts
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m128i vb = _mm_set1_epi32(b);  // b replicated into all four lanes
+
+      si32* dst = aug->i32;
+      const si32* src1 = sig->i32, * src2 = other->i32;
+      // The general definition of the wavelet in Part 2 is slightly 
+      // different to Part 1, although they are mathematically equivalent
+      // here, we identify the simpler form from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;  // SIMD paths round repeat up to 4 per pass
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi32(s1, s2);
+            __m128i v = _mm_add_epi32(vb, t);
+            __m128i w = _mm_srai_epi32(v, e);  // (b + s1 + s2) >> e
+            d = _mm_sub_epi32(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi32(s1, s2);
+            __m128i v = _mm_add_epi32(vb, t);
+            __m128i w = _mm_srai_epi32(v, e);  // (b + s1 + s2) >> e
+            d = _mm_add_epi32(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict; uses (1 - t) >> 1 == -(t >> 1), hence swapped signs
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi32(s1, s2);
+            __m128i w = _mm_srai_epi32(t, e);  // (s1 + s2) >> 1
+            d = _mm_add_epi32(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi32(s1, s2);
+            __m128i w = _mm_srai_epi32(t, e);  // (s1 + s2) >> 1
+            d = _mm_sub_epi32(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi32(s1, s2);
+            __m128i v = _mm_sub_epi32(vb, t);
+            __m128i w = _mm_srai_epi32(v, e);  // (b - (s1 + s2)) >> e
+            d = _mm_sub_epi32(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi32(s1, s2);
+            __m128i v = _mm_sub_epi32(vb, t);
+            __m128i w = _mm_srai_epi32(v, e);  // (b - (s1 + s2)) >> e
+            d = _mm_add_epi32(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else { // general case
+        // 32bit multiplication is not supported in sse2; we need sse4.1,
+        // where we can use _mm_mullo_epi32, which multiplies 32bit x 32bit,
+        // keeping the LSBs; this scalar loop handles exactly repeat samples
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_vert_step64(const lifting_step* s, const line_buf* sig, 
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis)
+    { // analysis: aug += (b + a * (sig + other)) >> e; synthesis subtracts
+      const si64 a = s->rev.Aatk;
+      const si64 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m128i vb = _mm_set1_epi64x(b);  // b replicated into both lanes
+      __m128i ve = _mm_set1_epi64x((si64)(1ULL << (63 - e))); // srai mask
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly 
+      // different to Part 1, although they are mathematically equivalent
+      // here, we identify the simpler form from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;  // SIMD paths round repeat up to 2 per pass
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_add_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);  // (b + s1 + s2) >> e
+            d = _mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_add_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);  // (b + s1 + s2) >> e
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict; uses (1 - t) >> 1 == -(t >> 1), hence swapped signs
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i w = sse2_mm_srai_epi64(t, e, ve);  // (s1 + s2) >> 1
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i w = sse2_mm_srai_epi64(t, e, ve);  // (s1 + s2) >> 1
+            d = _mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_sub_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);  // (b - s1 - s2) >> e
+            d = _mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_sub_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);  // (b - s1 - s2) >> e
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else { // general case
+        // 64bit multiplication is not supported in sse2; scalar fallback
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis)
+    { // dispatch on sample width, taken from the line_buf flags
+      if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || 
+          ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
+          ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) 
+      { // any 32-bit buffer selects the 32-bit path; all must then agree
+        assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_32BIT) && 
+               (aug == NULL || aug->flags & line_buf::LFT_32BIT));
+        sse2_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
+      } // NOTE(review): callee dereferences all three; confirm none is NULL
+      else 
+      { // otherwise every non-NULL buffer is expected to be 64-bit
+        assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_64BIT) && 
+               (aug == NULL || aug->flags & line_buf::LFT_64BIT));
+        sse2_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, 
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even) // 32-bit samples
     {
       if (width > 1)
       {
-        si32 *src = line_src->i32;
-        si32 *ldst = line_ldst->i32, *hdst = line_hdst->i32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        // extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        const si32* sp = src + (even ? 1 : 0);
-        si32 *dph = hdst;
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
-        { //this is doing twice the work it needs to do
-          //it can be definitely written better
-          __m128i s1 = _mm_loadu_si128((__m128i*)(sp-1));
-          __m128i s2 = _mm_loadu_si128((__m128i*)(sp+1));
-          __m128i d = _mm_loadu_si128((__m128i*)sp);
-          s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
-          __m128i d1 = _mm_sub_epi32(d, s1);
-          sp += 4;
-          s1 = _mm_loadu_si128((__m128i*)(sp-1));
-          s2 = _mm_loadu_si128((__m128i*)(sp+1));
-          d = _mm_loadu_si128((__m128i*)sp);
-          s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
-          __m128i d2 = _mm_sub_epi32(d, s1);
-          sp += 4;
-          d = _mm_castps_si128(_mm_shuffle_ps(
-              _mm_castsi128_ps(d1), _mm_castsi128_ps(d2), 0x88));
-          _mm_store_si128((__m128i*)dph, d);
+        // split src into ldst and hdst; the two swap roles when !even
+        {
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
+          float* sp = src->f32;
+          int w = (int)width;
+          sse2_deinterleave32(dpl, dph, sp, w);
         }
 
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        sp = src + (even ? 0 : 1);
-        const si32* sph = hdst + (even ? 0 : 1);
-        si32 *dpl = ldst;
-        __m128i offset = _mm_set1_epi32(2);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
+        si32* hp = hdst->i32, * lp = ldst->i32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
         {
-          __m128i s1 = _mm_loadu_si128((__m128i*)(sph-1));
-          s1 = _mm_add_epi32(s1, offset);
-          __m128i s2 = _mm_loadu_si128((__m128i*)sph);
-          s2 = _mm_add_epi32(s2, s1);
-          __m128i d1 = _mm_loadu_si128((__m128i*)sp);
-          __m128i d2 = _mm_loadu_si128((__m128i*)sp + 1);
-          __m128i d = _mm_castps_si128(_mm_shuffle_ps(
-              _mm_castsi128_ps(d1), _mm_castsi128_ps(d2), 0x88));
-          d = _mm_add_epi32(d, _mm_srai_epi32(s2, 2));
-          _mm_store_si128((__m128i*)dpl, d);
+          // steps are applied last to first, hence get_step(j - 1)
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          __m128i vb = _mm_set1_epi32(b);
+
+          // extension: replicate lp's first and last samples outward
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step: hp[i] += (b + a * (sum of 2 lp samples)) >> e
+          const si32* sp = lp;
+          si32* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width; // signed; the loop may overshoot 0
+            if (even)
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_add_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_add_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_add_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_add_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict: d -= t >> 1, which equals d += (1 - t) >> 1
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i w = _mm_srai_epi32(t, e);
+                d = _mm_sub_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i w = _mm_srai_epi32(t, e);
+                d = _mm_sub_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_sub_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_add_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_sub_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_add_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+          }
+          else {
+            // general case, scalar fallback
+            // 32bit multiplication is not supported in sse2
+            if (even)
+              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+                *dp += (b + a * (sp[0] + sp[1])) >> e;
+            else
+              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+                *dp += (b + a * (sp[-1] + sp[0])) >> e;
+          }
+
+          // swap buffers, parity and widths for the next step
+          si32* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
         }
       }
-      else
-      {
+      else { // single sample: copied if even, stored doubled if odd
         if (even)
-          line_ldst->i32[0] = line_src->i32[0];
+          ldst->i32[0] = src->i32[0];
         else
-          line_hdst->i32[0] = line_src->i32[0] << 1;
+          hdst->i32[0] = src->i32[0] << 1;
       }
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_bwd_predict(const line_buf* line_src1,
-                                        const line_buf* line_src2,
-                                        line_buf *line_dst, ui32 repeat)
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, 
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even) // 64-bit samples
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-    
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
+      if (width > 1)
       {
-        __m128i s1 = _mm_load_si128((__m128i*)src1);
-        __m128i s2 = _mm_load_si128((__m128i*)src2);
-        __m128i d = _mm_load_si128((__m128i*)dst);
-        s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
-        d = _mm_add_epi32(d, s1);
-        _mm_store_si128((__m128i*)dst, d);
+        // split src into ldst and hdst; the two swap roles when !even
+        {
+          double* dpl = (double*)(even ? ldst->p : hdst->p);
+          double* dph = (double*)(even ? hdst->p : ldst->p);
+          double* sp  = (double*)src->p;
+          int w = (int)width;
+          sse2_deinterleave64(dpl, dph, sp, w);
+        }
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // steps are applied last to first, hence get_step(j - 1)
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          __m128i vb = _mm_set1_epi64x(b);
+          __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
+
+          // extension: replicate lp's first and last samples outward
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step: hp[i] += (b + a * (sum of 2 lp samples)) >> e
+          const si64* sp = lp;
+          si64* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width; // signed; the loop may overshoot 0
+            if (even)
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi64(s1, s2);
+                __m128i v = _mm_add_epi64(vb, t);
+                __m128i w = sse2_mm_srai_epi64(v, e, ve);
+                d = _mm_add_epi64(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi64(s1, s2);
+                __m128i v = _mm_add_epi64(vb, t);
+                __m128i w = sse2_mm_srai_epi64(v, e, ve);
+                d = _mm_add_epi64(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict: d -= t >> 1, which equals d += (1 - t) >> 1
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi64(s1, s2);
+                __m128i w = sse2_mm_srai_epi64(t, e, ve);
+                d = _mm_sub_epi64(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi64(s1, s2);
+                __m128i w = sse2_mm_srai_epi64(t, e, ve);
+                d = _mm_sub_epi64(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi64(s1, s2);
+                __m128i v = _mm_sub_epi64(vb, t);
+                __m128i w = sse2_mm_srai_epi64(v, e, ve);
+                d = _mm_add_epi64(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi64(s1, s2);
+                __m128i v = _mm_sub_epi64(vb, t);
+                __m128i w = sse2_mm_srai_epi64(v, e, ve);
+                d = _mm_add_epi64(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+          }
+          else {
+            // general case, scalar fallback
+            // 64bit multiplication is not supported in sse2
+            if (even)
+              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+                *dp += (b + a * (sp[0] + sp[1])) >> e;
+            else
+              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+                *dp += (b + a * (sp[-1] + sp[0])) >> e;
+          }
+
+          // swap buffers, parity and widths for the next step
+          si64* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+      }
+      else { // single sample: copied if even, stored doubled if odd
+        if (even)
+          ldst->i64[0] = src->i64[0];
+        else
+          hdst->i64[0] = src->i64[0] << 1;
       }
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_bwd_update(const line_buf* line_src1,
-                                       const line_buf* line_src2,
-                                       line_buf *line_dst, ui32 repeat)
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even) // picks 32/64-bit path
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
+      if (src->flags & line_buf::LFT_32BIT) 
+      { // 32-bit source; non-NULL destinations must be 32-bit as well
+        assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) &&
+               (hdst == NULL || hdst->flags & line_buf::LFT_32BIT));
+        sse2_rev_horz_ana32(atk, ldst, hdst, src, width, even);
+      }
+      else 
+      { // 64-bit path (src was dereferenced above, so it is non-NULL)
+        assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) &&
+               (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && 
+               (src == NULL || src->flags & line_buf::LFT_64BIT));
+        sse2_rev_horz_ana64(atk, ldst, hdst, src, width, even);
+      }
+    }    
     
-      __m128i offset = _mm_set1_epi32(2);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_horz_syn32(const param_atk* atk, const line_buf* dst, 
+                             const line_buf* lsrc, const line_buf* hsrc, 
+                             ui32 width, bool even)
+    { // 32-bit path: run the lifting steps, then interleave into dst
+      if (width > 1)
       {
-        __m128i s1 = _mm_load_si128((__m128i*)src1);
-        s1 = _mm_add_epi32(s1, offset);
-        __m128i s2 = _mm_load_si128((__m128i*)src2);
-        s2 = _mm_add_epi32(s2, s1);
-        __m128i d = _mm_load_si128((__m128i*)dst);
-        d = _mm_sub_epi32(d, _mm_srai_epi32(s2, 2));
-        _mm_store_si128((__m128i*)dst, d);
+        bool ev = even; // working copy; even itself is needed at the end
+        si32* oth = hsrc->i32, * aug = lsrc->i32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          __m128i vb = _mm_set1_epi32(b);
+
+          // extension: replicate oth's first and last samples outward
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step: aug[i] -= (b + a * (sum of 2 oth samples)) >> e
+          const si32* sp = oth;
+          si32* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)aug_width; // signed; the loop may overshoot 0
+            if (ev)
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_add_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_sub_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_add_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_sub_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict: d += t >> 1, which equals d -= (1 - t) >> 1
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i w = _mm_srai_epi32(t, e);
+                d = _mm_add_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i w = _mm_srai_epi32(t, e);
+                d = _mm_add_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_sub_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_sub_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_sub_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_sub_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+          }
+          else {
+            // general case
+            // 32bit multiplication is not supported in sse2; we need sse4.1,
+            // where we can use _mm_mullo_epi32, which multiplies
+            // 32bit x 32bit, keeping the LSBs
+            if (ev)
+              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+                *dp -= (b + a * (sp[-1] + sp[0])) >> e;
+            else
+              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+                *dp -= (b + a * (sp[0] + sp[1])) >> e;
+          }
+
+          // swap buffers, parity and widths for the next step
+          si32* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // interleave lsrc and hsrc into dst; the two swap roles when !even
+        {
+          float* dp = dst->f32;
+          float* spl = even ? lsrc->f32 : hsrc->f32;
+          float* sph = even ? hsrc->f32 : lsrc->f32;
+          int w = (int)width;
+          sse2_interleave32(dp, spl, sph, w);
+        }
+      }
+      else { // single sample: copied if even, halved (>> 1) if odd
+        if (even)
+          dst->i32[0] = lsrc->i32[0];
+        else
+          dst->i32[0] = hsrc->i32[0] >> 1;
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc,
-                                   line_buf *line_hsrc, ui32 width, bool even)
+    void sse2_rev_horz_syn64(const param_atk* atk, const line_buf* dst, 
+                             const line_buf* lsrc, const line_buf* hsrc, 
+                             ui32 width, bool even) // 64-bit samples
     {
       if (width > 1)
       {
-        si32 *lsrc = line_lsrc->i32, *hsrc = line_hsrc->i32;
-        si32 *dst = line_dst->i32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        // extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        const si32 *sph = hsrc + (even ? 0 : 1);
-        si32 *spl = lsrc;
-        __m128i offset = _mm_set1_epi32(2);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, spl+=4)
+        bool ev = even; // working copy; even itself is needed at the end
+        si64* oth = hsrc->i64, * aug = lsrc->i64;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
         {
-          __m128i s1 = _mm_loadu_si128((__m128i*)(sph-1));
-          s1 = _mm_add_epi32(s1, offset);
-          __m128i s2 = _mm_loadu_si128((__m128i*)sph);
-          s2 = _mm_add_epi32(s2, s1);
-          __m128i d = _mm_load_si128((__m128i*)spl);
-          d = _mm_sub_epi32(d, _mm_srai_epi32(s2, 2));
-          _mm_store_si128((__m128i*)spl, d);
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          __m128i vb = _mm_set1_epi64x(b);
+          __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
+
+          // extension: replicate oth's first and last samples outward
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step: aug[i] -= (b + a * (sum of 2 oth samples)) >> e
+          const si64* sp = oth;
+          si64* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)aug_width; // signed; the loop may overshoot 0
+            if (ev)
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi64(s1, s2);
+                __m128i v = _mm_add_epi64(vb, t);
+                __m128i w = sse2_mm_srai_epi64(v, e, ve);
+                d = _mm_sub_epi64(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi64(s1, s2);
+                __m128i v = _mm_add_epi64(vb, t);
+                __m128i w = sse2_mm_srai_epi64(v, e, ve);
+                d = _mm_sub_epi64(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict: d += t >> 1, which equals d -= (1 - t) >> 1
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi64(s1, s2);
+                __m128i w = sse2_mm_srai_epi64(t, e, ve);
+                d = _mm_add_epi64(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi64(s1, s2);
+                __m128i w = sse2_mm_srai_epi64(t, e, ve);
+                d = _mm_add_epi64(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi64(s1, s2);
+                __m128i v = _mm_sub_epi64(vb, t);
+                __m128i w = sse2_mm_srai_epi64(v, e, ve);
+                d = _mm_sub_epi64(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi64(s1, s2);
+                __m128i v = _mm_sub_epi64(vb, t);
+                __m128i w = sse2_mm_srai_epi64(v, e, ve);
+                d = _mm_sub_epi64(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+          }
+          else {
+            // general case, scalar fallback
+            // 64bit multiplication is not supported in sse2
+            if (ev)
+              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+                *dp -= (b + a * (sp[-1] + sp[0])) >> e;
+            else
+              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+                *dp -= (b + a * (sp[0] + sp[1])) >> e;
+          }
+
+          // swap buffers, parity and widths for the next step
+          si64* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
         }
 
-        // extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width - 1];
-        // inverse predict and combine
-        si32 *dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        ui32 width = L_width + (even ? 0 : 1);
-        for (ui32 i = (width + 3) >> 2; i > 0; --i, sph+=4, spl+=4, dp+=8)
+        // interleave lsrc and hsrc into dst; the two swap roles when !even
         {
-          __m128i s1 = _mm_loadu_si128((__m128i*)spl);
-          __m128i s2 = _mm_loadu_si128((__m128i*)(spl+1));
-          __m128i d = _mm_load_si128((__m128i*)sph);
-          s2 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
-          d = _mm_add_epi32(d, s2);
-          _mm_storeu_si128((__m128i*)dp, _mm_unpacklo_epi32(s1, d));
-          _mm_storeu_si128((__m128i*)dp + 1, _mm_unpackhi_epi32(s1, d));
+          double* dp  = (double*)dst->p;
+          double* spl = (double*)(even ? lsrc->p : hsrc->p);
+          double* sph = (double*)(even ? hsrc->p : lsrc->p);
+          int w = (int)width;
+          sse2_interleave64(dp, spl, sph, w);
         }
       }
-      else
-      {
+      else { // single sample: copied if even, halved (>> 1) if odd
         if (even)
-          line_dst->i32[0] = line_lsrc->i32[0];
+          dst->i64[0] = lsrc->i64[0];
         else
-          line_dst->i32[0] = line_hsrc->i32[0] >> 1;
+          dst->i64[0] = hsrc->i64[0] >> 1;
       }
     }
-  }
-}
+
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even)
+    { // picks the 32-bit or 64-bit path from dst's flags
+      if (dst->flags & line_buf::LFT_32BIT) 
+      { // 32-bit dst; non-NULL sources must be 32-bit as well
+        assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && 
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
+        sse2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
+      }
+      else 
+      { // 64-bit path (dst was dereferenced above, so it is non-NULL)
+        assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
+               (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && 
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
+        sse2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
+      }
+    }    
+
+  } // !local
+} // !ojph
+
+#endif
diff --git a/src/core/transform/ojph_transform_wasm.cpp b/src/core/transform/ojph_transform_wasm.cpp
index 8f48e352..341cfc32 100644
--- a/src/core/transform/ojph_transform_wasm.cpp
+++ b/src/core/transform/ojph_transform_wasm.cpp
@@ -41,6 +41,9 @@
 #include "ojph_defs.h"
 #include "ojph_arch.h"
 #include "ojph_mem.h"
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
+
 #include "ojph_transform.h"
 #include "ojph_transform_local.h"
 
@@ -48,473 +51,1260 @@ namespace ojph {
   namespace local {
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_fwd_predict(const line_buf* line_src1, 
-                                        const line_buf* line_src2,
-                                        line_buf *line_dst, ui32 repeat)
+    static inline
+    void wasm_deinterleave32(float* dpl, float* dph, float* sp, int width)
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
       {
-        v128_t s1 = wasm_v128_load(src1);
-        v128_t s2 = wasm_v128_load(src2);
-        v128_t d = wasm_v128_load(dst);
-        s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
-        d = wasm_i32x4_sub(d, s1);
-        wasm_v128_store(dst, d);
+        v128_t a = wasm_v128_load(sp);
+        v128_t b = wasm_v128_load(sp + 4);
+        v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2);
+        v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3);
+        // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+        // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+        wasm_v128_store(dpl, c);
+        wasm_v128_store(dph, d);
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_fwd_update(const line_buf* line_src1, 
-                                       const line_buf* line_src2,
-                                       line_buf *line_dst, ui32 repeat)
+    static inline
+    void wasm_interleave32(float* dp, float* spl, float* sph, int width)
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-
-      v128_t offset = wasm_i32x4_splat(2);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
       {
-        v128_t s1 = wasm_v128_load(src1);
-        s1 = wasm_i32x4_add(s1, offset);
-        v128_t s2 = wasm_v128_load(src2);
-        s2 = wasm_i32x4_add(s2, s1);
-        v128_t d = wasm_v128_load(dst);
-        d = wasm_i32x4_add(d, wasm_i32x4_shr(s2, 2));
-        wasm_v128_store(dst, d);
+        v128_t a = wasm_v128_load(spl);
+        v128_t b = wasm_v128_load(sph);
+        v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1);
+        v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3);
+        // v128_t c = _mm_unpacklo_ps(a, b);
+        // v128_t d = _mm_unpackhi_ps(a, b);
+        wasm_v128_store(dp, c);
+        wasm_v128_store(dp + 4, d);
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, 
-                                   line_buf *line_hdst, ui32 width, bool even)
+    static inline 
+    void wasm_deinterleave64(double* dpl, double* dph, double* sp, int width)
     {
-      if (width > 1)
+      for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2)
       {
-        si32 *src = line_src->i32;
-        si32 *ldst = line_ldst->i32, *hdst = line_hdst->i32;
-      
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        // extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        const si32* sp = src + (even ? 1 : 0);
-        si32 *dph = hdst;
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
-        { //this is doing twice the work it needs to do
-          //it can be definitely written better
-          v128_t s1 = wasm_v128_load(sp - 1);
-          v128_t s2 = wasm_v128_load(sp + 1);
-          v128_t d = wasm_v128_load(sp);
-          s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
-          v128_t d1 = wasm_i32x4_sub(d, s1);
-          sp += 4;
-          s1 = wasm_v128_load(sp - 1);
-          s2 = wasm_v128_load(sp + 1);
-          d = wasm_v128_load(sp);
-          s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
-          v128_t d2 = wasm_i32x4_sub(d, s1);
-          sp += 4;
-          d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6);
-          wasm_v128_store(dph, d);
-        }
-
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        sp = src + (even ? 0 : 1);
-        const si32* sph = hdst + (even ? 0 : 1);
-        si32 *dpl = ldst;
-        v128_t offset = wasm_i32x4_splat(2);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
-        {
-          v128_t s1 = wasm_v128_load(sph - 1);
-          s1 = wasm_i32x4_add(s1, offset);
-          v128_t s2 = wasm_v128_load(sph);
-          s2 = wasm_i32x4_add(s2, s1);
-          v128_t d1 = wasm_v128_load(sp);
-          v128_t d2 = wasm_v128_load(sp + 4);
-          v128_t d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6);
-          d = wasm_i32x4_add(d, wasm_i32x4_shr(s2, 2));
-          wasm_v128_store(dpl, d);
-        }
+        v128_t a = wasm_v128_load(sp);
+        v128_t b = wasm_v128_load(sp + 2);
+        v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0);
+        v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1);
+        wasm_v128_store(dpl, c);
+        wasm_v128_store(dph, d);
       }
-      else
+    }    
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline 
+    void wasm_interleave64(double* dp, double* spl, double* sph, int width)
+    {
+      for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2)
       {
-        if (even)
-          line_ldst->i32[0] = line_src->i32[0];
-        else
-          line_hdst->i32[0] = line_src->i32[0] << 1;
+        v128_t a = wasm_v128_load(spl);
+        v128_t b = wasm_v128_load(sph);
+        v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0);
+        v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1);
+        wasm_v128_store(dp, c);
+        wasm_v128_store(dp + 2, d);
       }
-    }
+    }    
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_bwd_predict(const line_buf *line_src1, 
-                                        const line_buf *line_src2,
-                                        line_buf *line_dst, ui32 repeat)
+    static inline void wasm_multiply_const(float* p, float f, int width)
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-    
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
+      v128_t factor = wasm_f32x4_splat(f);
+      for (; width > 0; width -= 4, p += 4)
       {
-        v128_t s1 = wasm_v128_load(src1);
-        v128_t s2 = wasm_v128_load(src2);
-        v128_t d = wasm_v128_load(dst);
-        s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
-        d = wasm_i32x4_add(d, s1);
-        wasm_v128_store(dst, d);
+        v128_t s = wasm_v128_load(p);
+        wasm_v128_store(p, wasm_f32x4_mul(factor, s));
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_bwd_update(const line_buf *line_src1, 
-                                       const line_buf *line_src2,
-                                       line_buf *line_dst, ui32 repeat)
+    void wasm_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis)
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-    
-      v128_t offset = wasm_i32x4_splat(2);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
+      float a = s->irv.Aatk;
+      if (synthesis)
+        a = -a;
+
+      v128_t factor = wasm_f32x4_splat(a);
+
+      float* dst = aug->f32;
+      const float* src1 = sig->f32, * src2 = other->f32;
+      int i = (int)repeat;
+      for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
       {
         v128_t s1 = wasm_v128_load(src1);
-        s1 = wasm_i32x4_add(s1, offset);
         v128_t s2 = wasm_v128_load(src2);
-        s2 = wasm_i32x4_add(s2, s1);
-        v128_t d = wasm_v128_load(dst);
-        d = wasm_i32x4_sub(d, wasm_i32x4_shr(s2, 2));
+        v128_t d  = wasm_v128_load(dst);
+        d = wasm_f32x4_add(d, wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)));
         wasm_v128_store(dst, d);
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc, 
-                                   line_buf *line_hsrc, ui32 width, bool even)
+    void wasm_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
+    { // multiplies the f32 samples of aug by K, in place
+      wasm_multiply_const(aug->f32, K, (int)repeat); // 4 floats per pass
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even)
     {
       if (width > 1)
       {
-        si32 *lsrc = line_lsrc->i32, *hsrc = line_hsrc->i32;
-        si32 *dst = line_dst->i32;
-      
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        // extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        const si32 *sph = hsrc + (even ? 0 : 1);
-        si32 *spl = lsrc;
-        v128_t offset = wasm_i32x4_splat(2);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, spl+=4)
+        // split src into ldst and hdst
         {
-          v128_t s1 = wasm_v128_load(sph - 1);
-          s1 = wasm_i32x4_add(s1, offset);
-          v128_t s2 = wasm_v128_load(sph);
-          s2 = wasm_i32x4_add(s2, s1);
-          v128_t d = wasm_v128_load(spl);
-          d = wasm_i32x4_sub(d, wasm_i32x4_shr(s2, 2));
-          wasm_v128_store(spl, d);
-        }
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
+          float* sp = src->f32;
+          int w = (int)width;
+          wasm_deinterleave32(dpl, dph, sp, w);
+        }        
 
-        // extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width - 1];
-        // inverse predict and combine
-        si32 *dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        ui32 width = L_width + (even ? 0 : 1);
-        for (ui32 i = (width + 3) >> 2; i > 0; --i, sph+=4, spl+=4, dp+=8)
+        // the actual horizontal transform
+        float* hp = hdst->f32, * lp = ldst->f32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
         {
-          v128_t s1 = wasm_v128_load(spl);
-          v128_t s2 = wasm_v128_load(spl + 1);
-          v128_t d = wasm_v128_load(sph);
-          s2 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
-          d = wasm_i32x4_add(d, s2);
-          wasm_v128_store(dp, wasm_i32x4_shuffle(s1, d, 0, 4, 1, 5));
-          wasm_v128_store(dp + 4, wasm_i32x4_shuffle(s1, d, 2, 6, 3, 7));
+          const lifting_step* s = atk->get_step(j - 1);
+          const float a = s->irv.Aatk;
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const float* sp = lp;
+          float* dp = hp;
+          int i = (int)h_width;
+          v128_t f = wasm_f32x4_splat(a);
+          if (even)
+          {
+            for (; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              v128_t m = wasm_v128_load(sp);
+              v128_t n = wasm_v128_load(sp + 1);
+              v128_t p = wasm_v128_load(dp);
+              p = wasm_f32x4_add(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
+              wasm_v128_store(dp, p);
+            }
+          }
+          else
+          {
+            for (; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              v128_t m = wasm_v128_load(sp);
+              v128_t n = wasm_v128_load(sp - 1);
+              v128_t p = wasm_v128_load(dp);
+              p = wasm_f32x4_add(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
+              wasm_v128_store(dp, p);
+            }
+          }
+
+          // swap buffers
+          float* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+
+        { // multiply by K or 1/K
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          wasm_multiply_const(lp, K_inv, (int)l_width);
+          wasm_multiply_const(hp, K, (int)h_width);
         }
       }
-      else
-      {
+      else {
         if (even)
-          line_dst->i32[0] = line_lsrc->i32[0];
+          ldst->f32[0] = src->f32[0];
         else
-          line_dst->i32[0] = line_hsrc->i32[0] >> 1;
+          hdst->f32[0] = src->f32[0] * 2.0f;
       }
     }
     
     //////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_vert_wvlt_step(const line_buf *line_src1, 
-                                   const line_buf *line_src2,
-                                   line_buf *line_dst, int step_num, 
-                                   ui32 repeat)
+    void wasm_irv_horz_syn(const param_atk* atk, const line_buf* dst, 
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even)
     {
-      float *dst = line_dst->f32;
-      const float *src1 = line_src1->f32, *src2 = line_src2->f32;
-    
-      v128_t factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[step_num]);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
+      if (width > 1)
       {
-        v128_t s1 = wasm_v128_load(src1);
-        v128_t s2 = wasm_v128_load(src2);
-        v128_t d = wasm_v128_load(dst);
-        d = wasm_f32x4_add(d, wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)));
-        wasm_v128_store(dst, d);
+        bool ev = even;
+        float* oth = hsrc->f32, * aug = lsrc->f32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+
+        { // multiply by K or 1/K
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          wasm_multiply_const(aug, K, (int)aug_width);
+          wasm_multiply_const(oth, K_inv, (int)oth_width);
+        }
+
+        // the actual horizontal transform
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          const lifting_step* s = atk->get_step(j);
+          const float a = s->irv.Aatk;
+
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const float* sp = oth;
+          float* dp = aug;
+          int i = (int)aug_width;
+          v128_t f = wasm_f32x4_splat(a);
+          if (ev)
+          {
+            for ( ; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              v128_t m = wasm_v128_load(sp);
+              v128_t n = wasm_v128_load(sp - 1);
+              v128_t p = wasm_v128_load(dp);
+              p = wasm_f32x4_sub(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
+              wasm_v128_store(dp, p);
+            }
+          }
+          else
+          {
+            for ( ; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              v128_t m = wasm_v128_load(sp);
+              v128_t n = wasm_v128_load(sp + 1);
+              v128_t p = wasm_v128_load(dp);
+              p = wasm_f32x4_sub(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
+              wasm_v128_store(dp, p);
+            }
+          }
+
+          // swap buffers
+          float* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        {
+          float* dp = dst->f32;
+          float* spl = even ? lsrc->f32 : hsrc->f32;
+          float* sph = even ? hsrc->f32 : lsrc->f32;
+          int w = (int)width;
+          wasm_interleave32(dp, spl, sph, w);
+        }        
+      }
+      else {
+        if (even)
+          dst->f32[0] = lsrc->f32[0];
+        else
+          dst->f32[0] = hsrc->f32[0] * 0.5f;
       }
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_vert_wvlt_K(const line_buf *line_src, line_buf *line_dst,
-                                bool L_analysis_or_H_synthesis, ui32 repeat)
+    void wasm_rev_vert_step32(const lifting_step* s, const line_buf* sig, 
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis)
     {
-      float *dst = line_dst->f32;
-      const float *src = line_src->f32;
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      v128_t va = wasm_i32x4_splat(a);
+      v128_t vb = wasm_i32x4_splat(b);
 
-      float f = LIFTING_FACTORS::K_inv;
-      f = L_analysis_or_H_synthesis ? f : LIFTING_FACTORS::K;
-      v128_t factor = wasm_f32x4_splat(f);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src+=4)
+      si32* dst = aug->i32;
+      const si32* src1 = sig->i32, * src2 = other->i32;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that of Part 1, although they are mathematically
+      // equivalent; here, we identify the simpler Part 1 forms and use them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t v = wasm_i32x4_add(vb, t);
+            v128_t w = wasm_i32x4_shr(v, e);
+            d = wasm_i32x4_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t v = wasm_i32x4_add(vb, t);
+            v128_t w = wasm_i32x4_shr(v, e);
+            d = wasm_i32x4_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t w = wasm_i32x4_shr(t, e);
+            d = wasm_i32x4_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t w = wasm_i32x4_shr(t, e);
+            d = wasm_i32x4_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t v = wasm_i32x4_sub(vb, t);
+            v128_t w = wasm_i32x4_shr(v, e);
+            d = wasm_i32x4_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t v = wasm_i32x4_sub(vb, t);
+            v128_t w = wasm_i32x4_shr(v, e);
+            d = wasm_i32x4_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else 
+      { // general case
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t u = wasm_i32x4_mul(va, t);
+            v128_t v = wasm_i32x4_add(vb, u);
+            v128_t w = wasm_i32x4_shr(v, e);
+            d = wasm_i32x4_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t u = wasm_i32x4_mul(va, t);
+            v128_t v = wasm_i32x4_add(vb, u);
+            v128_t w = wasm_i32x4_shr(v, e);
+            d = wasm_i32x4_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_vert_step64(const lifting_step* s, const line_buf* sig, 
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis)
+    { // 64-bit-sample variant; two si64 lanes are processed per iteration
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk; // used below as the right-shift amount
+      v128_t va = wasm_i64x2_splat(a); // a and b broadcast to both lanes
+      v128_t vb = wasm_i64x2_splat(b);
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that of Part 1, although they are mathematically
+      // equivalent; here, we identify the simpler Part 1 forms and use them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat; // odd repeat => last pass touches 1 extra sample
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_add(vb, t);
+            v128_t w = wasm_i64x2_shr(v, e); // w = (b + s1 + s2) >> e
+            d = wasm_i64x2_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_add(vb, t);
+            v128_t w = wasm_i64x2_shr(v, e);
+            d = wasm_i64x2_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict; (1 - t) >> 1 equals -(t >> 1), hence the flipped add/sub
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t w = wasm_i64x2_shr(t, e);
+            d = wasm_i64x2_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t w = wasm_i64x2_shr(t, e);
+            d = wasm_i64x2_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_sub(vb, t); // b + a * t with a == -1
+            v128_t w = wasm_i64x2_shr(v, e);
+            d = wasm_i64x2_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_sub(vb, t);
+            v128_t w = wasm_i64x2_shr(v, e);
+            d = wasm_i64x2_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else 
+      { // general case
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t u = wasm_i64x2_mul(va, t);
+            v128_t v = wasm_i64x2_add(vb, u);
+            v128_t w = wasm_i64x2_shr(v, e); // w = (b + a * (s1 + s2)) >> e
+            d = wasm_i64x2_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t u = wasm_i64x2_mul(va, t);
+            v128_t v = wasm_i64x2_add(vb, u);
+            v128_t w = wasm_i64x2_shr(v, e);
+            d = wasm_i64x2_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis)
+    {
+      if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || 
+          ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
+          ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) 
       {
-        v128_t s = wasm_v128_load(src);
-        wasm_v128_store(dst, wasm_f32x4_mul(factor, s));
+        assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_32BIT) && 
+               (aug == NULL || aug->flags & line_buf::LFT_32BIT));
+        wasm_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
+      }
+      else 
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_64BIT) && 
+               (aug == NULL || aug->flags & line_buf::LFT_64BIT));
+        wasm_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
       }
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, 
-                                     line_buf *line_hdst, ui32 width, 
-                                     bool even)
+    static
+    void wasm_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, 
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even)
     {
       if (width > 1)
       {
-        float *src = line_src->f32;
-        float *ldst = line_ldst->f32, *hdst = line_hdst->f32;
-      
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        //extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        const float* sp = src + (even ? 1 : 0);
-        float *dph = hdst;
-        v128_t factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[0]);
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
-        { //this is doing twice the work it needs to do
-          //it can be definitely written better
-          v128_t s1 = wasm_v128_load(sp - 1);
-          v128_t s2 = wasm_v128_load(sp + 1);
-          v128_t d = wasm_v128_load(sp);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          v128_t d1 = wasm_f32x4_add(d, s1);
-          sp += 4;
-          s1 = wasm_v128_load(sp - 1);
-          s2 = wasm_v128_load(sp + 1);
-          d = wasm_v128_load(sp);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          v128_t d2 = wasm_f32x4_add(d, s1);
-          sp += 4;
-          d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6);
-          wasm_v128_store(dph, d);
-        }
-
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[1]);
-        sp = src + (even ? 0 : 1);
-        const float* sph = hdst + (even ? 0 : 1);
-        float *dpl = ldst;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
+        // split src into ldst and hdst
         {
-          v128_t s1 = wasm_v128_load(sph - 1);
-          v128_t s2 = wasm_v128_load(sph);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          v128_t d1 = wasm_v128_load(sp);
-          v128_t d2 = wasm_v128_load(sp + 4);
-          v128_t d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6);
-          d = wasm_f32x4_add(d, s1);
-          wasm_v128_store(dpl, d);
-        }
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
+          float* sp = src->f32;
+          int w = (int)width;
+          wasm_deinterleave32(dpl, dph, sp, w);
+        }        
 
-        //extension
-        ldst[-1] = ldst[0];
-        ldst[L_width] = ldst[L_width-1];
-        //predict
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[2]);
-        const float* spl = ldst + (even ? 1 : 0);
-        dph = hdst;
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, spl+=4, dph+=4)
+        si32* hp = hdst->i32, * lp = ldst->i32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
         {
-          v128_t s1 = wasm_v128_load(spl - 1);
-          v128_t s2 = wasm_v128_load(spl);
-          v128_t d = wasm_v128_load(dph);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          d = wasm_f32x4_add(d, s1);
-          wasm_v128_store(dph, d);
-        }
+          // first lifting step
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          v128_t va = wasm_i32x4_splat(a);
+          v128_t vb = wasm_i32x4_splat(b);
 
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[3]);
-        sph = hdst + (even ? 0 : 1);
-        dpl = ldst;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, dpl+=4)
-        {
-          v128_t s1 = wasm_v128_load(sph - 1);
-          v128_t s2 = wasm_v128_load(sph);
-          v128_t d = wasm_v128_load(dpl);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          d = wasm_f32x4_add(d, s1);
-          wasm_v128_store(dpl, d);
-        }
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si32* sp = lp;
+          si32* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_add(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_add(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t w = wasm_i32x4_shr(t, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t w = wasm_i32x4_shr(t, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_sub(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_sub(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else 
+          { // general case
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t u = wasm_i32x4_mul(va, t);
+                v128_t v = wasm_i32x4_add(vb, u);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t u = wasm_i32x4_mul(va, t);                
+                v128_t v = wasm_i32x4_add(vb, u);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
 
-        //multipliers
-        float *dp = ldst;
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::K_inv);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
-        {
-          v128_t d = wasm_v128_load(dp);
-          wasm_v128_store(dp, wasm_f32x4_mul(factor, d));
-        }
-        dp = hdst;
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::K);
-        for (int i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
-        {
-          v128_t d = wasm_v128_load(dp);
-          wasm_v128_store(dp, wasm_f32x4_mul(factor, d));
+          // swap buffers
+          si32* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
         }
       }
-      else
-      {
+      else {
         if (even)
-          line_ldst->f32[0] = line_src->f32[0];
+          ldst->i32[0] = src->i32[0];
         else
-          line_hdst->f32[0] = line_src->f32[0] + line_src->f32[0];
+          hdst->i32[0] = src->i32[0] << 1;
       }
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc, 
-                                     line_buf *line_hsrc, ui32 width, 
-                                     bool even)
+    static // si64 worker used by wasm_rev_horz_ana for non-32-bit src
+    void wasm_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, 
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even)
     {
       if (width > 1)
       {
-        float *lsrc = line_lsrc->f32, *hsrc = line_hsrc->f32;
-        float *dst = line_dst->f32;
-      
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        //multipliers
-        float *dp = lsrc;
-        v128_t factor = wasm_f32x4_splat(LIFTING_FACTORS::K);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
+        // deinterleave src into ldst and hdst; 'even' decides which is dpl
         {
-          v128_t d = wasm_v128_load(dp);
-          wasm_v128_store(dp, wasm_f32x4_mul(factor, d));
-        }
-        dp = hsrc;
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::K_inv);
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
+          double* dpl = (double*)(even ? ldst->p : hdst->p);
+          double* dph = (double*)(even ? hdst->p : ldst->p);
+          double* sp  = (double*)src->p;
+          int w = (int)width;
+          wasm_deinterleave64(dpl, dph, sp, w);
+        }        
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
         {
-          v128_t d = wasm_v128_load(dp);
-          wasm_v128_store(dp, wasm_f32x4_mul(factor, d));
+          // parameters of step j - 1 (steps are applied last to first)
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          v128_t va = wasm_i64x2_splat(a);
+          v128_t vb = wasm_i64x2_splat(b);
+
+          // extension: copy edge samples to lp[-1] and lp[l_width]
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step: hp is modified using pairs of lp neighbours
+          const si64* sp = lp;
+          si64* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1: d += (b + s1 + s2) >> e
+            int i = (int)h_width; // signed: the 2-wide loop can overshoot
+            if (even) // neighbours are sp[k], sp[k+1]; otherwise sp[k-1], sp[k]
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_add(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_add(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict: d -= (s1 + s2) >> e
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t w = wasm_i64x2_shr(t, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t w = wasm_i64x2_shr(t, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // a == -1 but not 5/3 predict: d += (b - (s1 + s2)) >> e
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_sub(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_sub(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else 
+          { // general case: d += (b + a * (s1 + s2)) >> e
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t u = wasm_i64x2_mul(va, t);
+                v128_t v = wasm_i64x2_add(vb, u);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t u = wasm_i64x2_mul(va, t);                
+                v128_t v = wasm_i64x2_add(vb, u);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+
+          // swap lp/hp, flip even and swap widths for the next step
+          si64* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
         }
+      }
+      else { // width <= 1: ldst[0] = src[0] (even) or hdst[0] = src[0] << 1
+        if (even)
+          ldst->i64[0] = src->i64[0];
+        else
+          hdst->i64[0] = src->i64[0] << 1;
+      }
+    }
 
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[7]);
-        const float *sph = hsrc + (even ? 0 : 1);
-        float *dpl = lsrc;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even) // dispatches on src->flags
+    {
+      if (src->flags & line_buf::LFT_32BIT) // 32-bit src -> si32 worker
+      {
+        assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) &&
+               (hdst == NULL || hdst->flags & line_buf::LFT_32BIT));
+        wasm_rev_horz_ana32(atk, ldst, hdst, src, width, even);
+      }
+      else // NOTE(review): src->flags was read above, so src == NULL is moot
+      {
+        assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) &&
+               (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && 
+               (src == NULL || src->flags & line_buf::LFT_64BIT));
+        wasm_rev_horz_ana64(atk, ldst, hdst, src, width, even);
+      }
+    } 
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_rev_horz_syn32(const param_atk* atk, const line_buf* dst, 
+                             const line_buf* lsrc, const line_buf* hsrc, 
+                             ui32 width, bool even) // si32 worker of wasm_rev_horz_syn
+    {
+      if (width > 1)
+      {
+        bool ev = even; // working copy; 'even' is still needed for the interleave
+        si32* oth = hsrc->i32, * aug = lsrc->i32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j) // steps in ascending order
         {
-          v128_t s1 = wasm_v128_load(sph - 1);
-          v128_t s2 = wasm_v128_load(sph);
-          v128_t d = wasm_v128_load(dpl);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          d = wasm_f32x4_add(d, s1);
-          wasm_v128_store(dpl, d);
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          v128_t va = wasm_i32x4_splat(a);
+          v128_t vb = wasm_i32x4_splat(b);
+
+          // extension: copy edge samples to oth[-1] and oth[oth_width]
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step: aug is modified using pairs of oth neighbours
+          const si32* sp = oth;
+          si32* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1: d -= (b + s1 + s2) >> e
+            int i = (int)aug_width; // signed: the 4-wide loop can overshoot
+            if (ev) // neighbours are sp[k-1], sp[k]; otherwise sp[k], sp[k+1]
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_add(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_add(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict: d += (s1 + s2) >> e
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t w = wasm_i32x4_shr(t, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t w = wasm_i32x4_shr(t, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // a == -1 but not 5/3 predict: d -= (b - (s1 + s2)) >> e
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_sub(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_sub(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else 
+          { // general case: d -= (b + a * (s1 + s2)) >> e
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t u = wasm_i32x4_mul(va, t);
+                v128_t v = wasm_i32x4_add(vb, u);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t u = wasm_i32x4_mul(va, t);                
+                v128_t v = wasm_i32x4_add(vb, u);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+
+          // swap aug/oth, flip ev and swap widths for the next step
+          si32* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
         }
 
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[6]);
-        const float *spl = lsrc + (even ? 0 : -1);
-        float *dph = hsrc;
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4, spl+=4)
+        // combine both lsrc and hsrc into dst
         {
-          v128_t s1 = wasm_v128_load(spl);
-          v128_t s2 = wasm_v128_load(spl + 1);
-          v128_t d = wasm_v128_load(dph);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          d = wasm_f32x4_add(d, s1);
-          wasm_v128_store(dph, d);
+          float* dp = dst->f32; // NOTE(review): si32 data goes through float*
+          float* spl = even ? lsrc->f32 : hsrc->f32;
+          float* sph = even ? hsrc->f32 : lsrc->f32;
+          int w = (int)width;
+          wasm_interleave32(dp, spl, sph, w);
         }
-
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[5]);
-        sph = hsrc + (even ? 0 : 1);
-        dpl = lsrc;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
+      }
+      else { // width <= 1: dst[0] = lsrc[0] (even) or hsrc[0] >> 1
+        if (even)
+          dst->i32[0] = lsrc->i32[0];
+        else
+          dst->i32[0] = hsrc->i32[0] >> 1;
+      }
+    }
+    
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_rev_horz_syn64(const param_atk* atk, const line_buf* dst, 
+                             const line_buf* lsrc, const line_buf* hsrc, 
+                             ui32 width, bool even) // si64 worker of wasm_rev_horz_syn
+    {
+      if (width > 1)
+      {
+        bool ev = even; // working copy; 'even' is still needed for the interleave
+        si64* oth = hsrc->i64, * aug = lsrc->i64;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j) // steps in ascending order
         {
-          v128_t s1 = wasm_v128_load(sph - 1);
-          v128_t s2 = wasm_v128_load(sph);
-          v128_t d = wasm_v128_load(dpl);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          d = wasm_f32x4_add(d, s1);
-          wasm_v128_store(dpl, d);
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          v128_t va = wasm_i64x2_splat(a);
+          v128_t vb = wasm_i64x2_splat(b);
+
+          // extension: copy edge samples to oth[-1] and oth[oth_width]
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step: aug is modified using pairs of oth neighbours
+          const si64* sp = oth;
+          si64* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1: d -= (b + s1 + s2) >> e
+            int i = (int)aug_width; // signed: the 2-wide loop can overshoot
+            if (ev) // neighbours are sp[k-1], sp[k]; otherwise sp[k], sp[k+1]
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_add(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_add(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict: d += (s1 + s2) >> e
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t w = wasm_i64x2_shr(t, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t w = wasm_i64x2_shr(t, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // a == -1 but not 5/3 predict: d -= (b - (s1 + s2)) >> e
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_sub(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_sub(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else 
+          { // general case: d -= (b + a * (s1 + s2)) >> e
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t u = wasm_i64x2_mul(va, t);
+                v128_t v = wasm_i64x2_add(vb, u);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t u = wasm_i64x2_mul(va, t);                
+                v128_t v = wasm_i64x2_add(vb, u);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+
+          // swap aug/oth, flip ev and swap widths for the next step
+          si64* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
         }
 
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict and combine
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[4]);
-        dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        ui32 width = L_width + (even ? 0 : 1);
-        for (ui32 i = (width + 3) >> 2; i > 0; --i, spl+=4, sph+=4, dp+=8)
+        // combine both lsrc and hsrc into dst
         {
-          v128_t s1 = wasm_v128_load(spl);
-          v128_t s2 = wasm_v128_load(spl + 1);
-          v128_t d = wasm_v128_load(sph);
-          s2 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          d = wasm_f32x4_add(d, s2);
-          wasm_v128_store(dp, wasm_i32x4_shuffle(s1, d, 0, 4, 1, 5));
-          wasm_v128_store(dp + 4, wasm_i32x4_shuffle(s1, d, 2, 6, 3, 7));
+          double* dp  = (double*)dst->p; // NOTE(review): si64 data goes through double*
+          double* spl = (double*)(even ? lsrc->p : hsrc->p);
+          double* sph = (double*)(even ? hsrc->p : lsrc->p);
+          int w = (int)width;
+          wasm_interleave64(dp, spl, sph, w);
         }
       }
-      else
-      {
+      else { // width <= 1: dst[0] = lsrc[0] (even) or hsrc[0] >> 1
         if (even)
-          line_dst->f32[0] = line_lsrc->f32[0];
+          dst->i64[0] = lsrc->i64[0];
         else
-          line_dst->f32[0] = line_hsrc->f32[0] * 0.5f;
+          dst->i64[0] = hsrc->i64[0] >> 1;
       }
     }
 
-  }
-}
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even) // dispatches on dst->flags
+    {
+      if (dst->flags & line_buf::LFT_32BIT) // 32-bit dst -> si32 worker
+      {
+        assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && 
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
+        wasm_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
+      }
+      else // NOTE(review): dst->flags was read above, so dst == NULL is moot
+      {
+        assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
+               (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && 
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
+        wasm_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
+      }
+    } 
+
+  } // !local
+} // !ojph
diff --git a/src/openjph-config.cmake.in b/src/openjph-config.cmake.in
new file mode 100644
index 00000000..89dcc67e
--- /dev/null
+++ b/src/openjph-config.cmake.in
@@ -0,0 +1,5 @@
+@PACKAGE_INIT@
+
+include("${CMAKE_CURRENT_LIST_DIR}/openjph-targets.cmake")
+
+check_required_components(openjph)
diff --git a/src/openjph.pc.in b/src/openjph.pc.in
new file mode 100644
index 00000000..e146001d
--- /dev/null
+++ b/src/openjph.pc.in
@@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+libdir=@PKG_CONFIG_LIBDIR@
+includedir=@PKG_CONFIG_INCLUDEDIR@
+
+Name: @PROJECT_NAME@
+Description: @PROJECT_DESCRIPTION@
+Version: @PROJECT_VERSION@
+Requires: @PKG_CONFIG_REQUIRES@
+Libs: -L${libdir} -lopenjph
+Cflags: -I${includedir} -D_FILE_OFFSET_BITS=64
diff --git a/src/pkg-config.pc.cmake b/src/pkg-config.pc.cmake
deleted file mode 100644
index 99bec574..00000000
--- a/src/pkg-config.pc.cmake
+++ /dev/null
@@ -1,9 +0,0 @@
-Name: ${PROJECT_NAME}
-Description: ${PROJECT_DESCRIPTION}
-Version: ${PROJECT_VERSION}
-Requires: ${PKG_CONFIG_REQUIRES}
-prefix=${CMAKE_INSTALL_PREFIX}
-includedir=${PKG_CONFIG_INCLUDEDIR}
-libdir=${PKG_CONFIG_LIBDIR}
-Libs: ${PKG_CONFIG_LIBS}
-Cflags: ${PKG_CONFIG_CFLAGS}
diff --git a/subprojects/js/CMakeLists.txt b/subprojects/js/CMakeLists.txt
index 1f79418c..0dba2baf 100644
--- a/subprojects/js/CMakeLists.txt
+++ b/subprojects/js/CMakeLists.txt
@@ -2,18 +2,26 @@ cmake_minimum_required(VERSION 3.10.0)
 
 set(CMAKE_SYSTEM_NAME Generic)
 
-project (openjphwasm DESCRIPTION "Open source implementation of JPH" LANGUAGES CXX)
-
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/../html)
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/../html)
+project (OpenJPH_WASM DESCRIPTION "Open source implementation of JPH" LANGUAGES CXX)
 
 add_subdirectory("../.." openjph EXCLUDE_FROM_ALL)
 add_executable(libopenjph "src/ojph_wrapper.cpp")
-set_target_properties(libopenjph PROPERTIES SUFFIX ".js" LINK_FLAGS "-O3 -s WASM=1 -s EXPORT_ES6=1 -s MODULARIZE=1 -s ENVIRONMENT=web -s EXPORTED_FUNCTIONS=[_free,_malloc] -s EXPORTED_RUNTIME_METHODS=[ccall,cwrap,writeArrayToMemory] -s NO_EXIT_RUNTIME=1 -s ALLOW_MEMORY_GROWTH=1")
+if (OJPH_DISABLE_SIMD)
+else()
+  target_compile_options(libopenjph PRIVATE -DOJPH_ENABLE_WASM_SIMD -msimd128)
+endif()
+set_target_properties(libopenjph PROPERTIES SUFFIX ".js")
+target_link_options(libopenjph PRIVATE
+  -fexceptions
+  -sWASM=1
+  -sEXPORT_ES6=1
+  -sMODULARIZE=1
+  -sENVIRONMENT=web
+  -sEXPORTED_FUNCTIONS=[_free,_malloc]
+  -sEXPORTED_RUNTIME_METHODS=[ccall,cwrap,writeArrayToMemory]
+  -sNO_EXIT_RUNTIME=1
+  -sALLOW_MEMORY_GROWTH=1
+  -sINITIAL_MEMORY=134217728
+)
 target_link_libraries(libopenjph PRIVATE openjph)
 
-add_executable(libopenjph_simd "src/ojph_wrapper.cpp" )
-target_compile_options(libopenjph_simd PRIVATE -DOJPH_ENABLE_WASM_SIMD -msimd128)
-set_target_properties(libopenjph_simd PROPERTIES SUFFIX ".js" LINK_FLAGS "-O3 -s WASM=1 -s EXPORT_ES6=1 -s MODULARIZE=1 -s ENVIRONMENT=web -s EXPORTED_FUNCTIONS=[_free,_malloc] -s EXPORTED_RUNTIME_METHODS=[ccall,cwrap,writeArrayToMemory] -s NO_EXIT_RUNTIME=1 -s ALLOW_MEMORY_GROWTH=1")
-target_link_libraries(libopenjph_simd PRIVATE openjphsimd)
-
diff --git a/subprojects/js/build.sh b/subprojects/js/build.sh
index dd292ad5..0a2dd4ad 100755
--- a/subprojects/js/build.sh
+++ b/subprojects/js/build.sh
@@ -1,5 +1,29 @@
-#!/bin/sh
+#!/bin/bash
+#
+# Builds the WASM module twice, first without and then with WASM SIMD, and
+# moves the results into html/ (the SIMD ones as libopenjph_simd.*).
+# All paths are relative to the current directory, which must contain html/.
+
+# Stop at the first failure, so that a broken build can never be followed by
+# moving or rewriting stale artifacts.
+set -euo pipefail
+
 mkdir -p build
-#(cd build && emcmake cmake -DCMAKE_BUILD_TYPE=Debug ..)
-(cd build && emcmake cmake ..)
-(cd build && emmake make VERBOSE=1 -j)
+cd build
+
+# Non-SIMD build: everything named libopenjph.* goes to html/
+emcmake cmake .. -DCMAKE_BUILD_TYPE=Release -DOJPH_DISABLE_SIMD=ON
+emmake make -j8
+mv libopenjph.* ../html/
+
+# SIMD build: it produces the same file names, so the .wasm is renamed while
+# being moved
+emcmake cmake .. -DCMAKE_BUILD_TYPE=Release -DOJPH_DISABLE_SIMD=OFF
+emmake make -j8
+mv libopenjph.wasm ../html/libopenjph_simd.wasm
+cd ..
+
+# The generated loader refers to its .wasm by name; point the SIMD copy of the
+# loader at the renamed file
+sed 's/libopenjph\.wasm/libopenjph_simd.wasm/g' build/libopenjph.js > html/libopenjph_simd.js
+rm build/libopenjph.js
diff --git a/subprojects/js/standalone/README.md b/subprojects/js/standalone/README.md
index 58a0c482..b9974b0b 100644
--- a/subprojects/js/standalone/README.md
+++ b/subprojects/js/standalone/README.md
@@ -43,6 +43,15 @@ To test the WASM subroutines, I would like to use the same test subroutines for
 
 This is a standard build without any modifications. The modified com_decom.sh and com_decom_yuv.sh files should invoke the WASM versions of the code.  You can modify these files to test with or without WASM SIMD.
 
+## Truncated codestream demo
+
+`truncated_decode_demo.sh` demonstrates decoder behavior with a physically
+truncated codestream. It keeps only the first 10 KiB from a valid `.j2c` input
+and invokes `ojph_expand`.
+
+The expected behavior is graceful process termination (possibly with non-zero
+exit status), rather than an uncaught exception/abort.
+
 There is a small complication with the test.  The test reads uncompressed images from ```..\```, and writes compressed images to ```.\```, while wasmer has access to ```..``` only with the command ```--dir ..```.  It would be convenient if I can specify two folders, but I do not know how to do that.   To overcome this, I also use ```--mapdir ./:./``` that maps the ```.\``` folder to the same location in wasmer, thus giving wasmer access to this folder.
 
 
diff --git a/subprojects/js/standalone/truncated_decode_demo.sh b/subprojects/js/standalone/truncated_decode_demo.sh
new file mode 100644
index 00000000..5457c429
--- /dev/null
+++ b/subprojects/js/standalone/truncated_decode_demo.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+set -euo pipefail
+
+# Demo: verify truncated codestreams fail gracefully (no process abort).
+# Usage:
+#   ./truncated_decode_demo.sh <input.j2c> <output.pgm>
+#
+# The script keeps only the first 10 KiB from <input.j2c>, writes a truncated
+# codestream to a temp file, and runs ojph_expand on it. A non-zero return code
+# is acceptable; the important behavior is graceful termination.
+#
+# Exit status: 0 if ojph_expand terminated by itself (whatever its return
+# code), 1 if it was killed by a signal such as SIGABRT, 2 on usage errors.
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <input.j2c> <output.pgm>" >&2
+  exit 2
+fi
+
+INPUT_J2C="$1"
+OUTPUT_PGM="$2"
+
+if [ ! -r "$INPUT_J2C" ]; then
+  echo "Cannot read input codestream: $INPUT_J2C" >&2
+  exit 2
+fi
+
+# A template with a suffix after the X's is not portable (BSD/macOS mktemp
+# only substitutes trailing X's), so create a private directory instead and
+# keep the .j2c file name inside it.
+WORK_DIR="$(mktemp -d "${TMPDIR:-/tmp}/openjph-truncated-XXXXXX")"
+TRUNCATED_J2C="$WORK_DIR/truncated.j2c"
+
+trap 'rm -rf "$WORK_DIR"' EXIT
+
+dd if="$INPUT_J2C" of="$TRUNCATED_J2C" bs=1024 count=10 status=none
+
+set +e
+./ojph_expand -i "$TRUNCATED_J2C" -o "$OUTPUT_PGM"
+RESULT=$?
+set -e
+
+echo "ojph_expand return code: $RESULT"
+
+# The shell reports a child killed by signal N as 128 + N (134 for SIGABRT).
+# "kill -l" maps such a status back to a signal name and fails for statuses
+# that are not signals, e.g. the 255 produced by exit(-1).
+if [ "$RESULT" -gt 128 ] && SIGNAL_NAME="$(kill -l "$RESULT" 2>/dev/null)"; then
+  echo "ojph_expand was terminated by SIG$SIGNAL_NAME: truncation was NOT handled gracefully." >&2
+  exit 1
+fi
+
+echo "The process exited normally (even if with a non-zero code): the decoder handled truncation without aborting."
diff --git a/target_arch.cmake b/target_arch.cmake
new file mode 100644
index 00000000..727b9978
--- /dev/null
+++ b/target_arch.cmake
@@ -0,0 +1,68 @@
+# Detects the target architecture and reports it as an OJPH_ARCH_* token.
+# The detection relies on the compiler's "#error" preprocessor directive to emit the architecture.
+
+# This is inspired by https://github.com/axr/solar-cmake/blob/master/TargetArch.cmake
+# which is inspired by
+# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h
+
+set(archdetect_c_code "
+#if defined(__arm__) || defined(__TARGET_ARCH_ARM)  \
+  || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+  #error cmake_ARCH OJPH_ARCH_ARM
+#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
+  #error cmake_ARCH OJPH_ARCH_I386
+#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
+  #error cmake_ARCH OJPH_ARCH_X86_64
+#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
+  #error cmake_ARCH OJPH_ARCH_IA64
+#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\
+  || defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC)  \\
+  || defined(_M_MPPC) || defined(_M_PPC)
+  #if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
+    #error cmake_ARCH OJPH_ARCH_PPC64
+  #else
+    #error cmake_ARCH OJPH_ARCH_PPC
+  #endif
+#endif
+
+#error cmake_ARCH OJPH_ARCH_UNKNOWN
+")
+
+function(target_architecture output_var) # output_var: caller variable to set
+
+  file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}")
+
+  enable_language(C) # NOTE(review): documented as file-scope only -- confirm
+
+  # Detect the architecture in a rather creative way...
+  # This compiles a small C program which is a series of ifdefs that selects a
+  # particular #error preprocessor directive whose message string contains the
+  # target architecture. The program will always fail to compile (both because
+  # the file is not a valid C program, and obviously because of the presence of
+  # the #error preprocessor directives), but by exploiting the preprocessor in
+  # this way we can detect the correct target architecture even when
+  # cross-compiling, since the program itself never needs to be run.
+  try_run(
+      run_result_unused
+      compile_result_unused
+      "${CMAKE_BINARY_DIR}"
+      "${CMAKE_BINARY_DIR}/arch.c"
+      COMPILE_OUTPUT_VARIABLE ARCH
+  )
+
+  # Parse the first "cmake_ARCH <token>" marker from the compiler output
+  string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}")
+
+  # Get rid of the value marker leaving just the architecture name
+  string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}")
+
+  # If we are compiling with an unknown architecture this variable should
+  # already be set to OJPH_ARCH_UNKNOWN, but in case it is empty (e.g. no
+  # "cmake_ARCH" marker was found in the output), fall back to that value.
+  if (NOT ARCH)
+      set(ARCH OJPH_ARCH_UNKNOWN)
+  endif()
+
+  set(${output_var} "${ARCH}" PARENT_SCOPE)
+
+endfunction()
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index abcc7ede..56e6ba35 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -3,11 +3,13 @@
 include(FetchContent)
 FetchContent_Declare(
   googletest
-  URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz
+  URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.tar.gz
+  EXCLUDE_FROM_ALL
 )
 # For Windows: Prevent overriding the parent project's compiler/linker settings
 set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
 set(BUILD_GMOCK OFF CACHE BOOL "" FORCE)
+option(INSTALL_GTEST "Enable installation of googletest." OFF)
 FetchContent_MakeAvailable(googletest)
 
 # Fetch test files
@@ -15,9 +17,8 @@ FetchContent_Declare(
   jp2k_test_codestreams
   URL               https://github.com/aous72/jp2k_test_codestreams/archive/refs/heads/main.zip
   SOURCE_DIR        jp2k_test_codestreams/
-  CONFIGURE_COMMAND ""
 )
-FetchContent_Populate(jp2k_test_codestreams)
+FetchContent_MakeAvailable(jp2k_test_codestreams)
 
 # create the mse_pae executable
 include(mse_pae.cmake)
@@ -42,9 +43,42 @@ target_link_libraries(
 include(GoogleTest)
 gtest_add_tests(TARGET test_executables)
 
-if (WIN32)
-	add_custom_command(TARGET test_executables POST_BUILD
-		COMMAND ${CMAKE_COMMAND} -E copy "../bin/Release/gtest.dll" "./Release/"
-		COMMAND ${CMAKE_COMMAND} -E copy "../bin/Release/gtest_main.dll" "./Release/"
-	)
-endif()
+if (MSVC)
+  add_custom_command(TARGET test_executables POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy "../bin/\$(Configuration)/gtest.dll" "./"
+    COMMAND ${CMAKE_COMMAND} -E copy "../bin/\$(Configuration)/gtest_main.dll" "./"
+    COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE:ojph_compress>" "./"
+    COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE:ojph_expand>" "./"
+    COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE:openjph>" "./"
+  )
+  add_custom_command(TARGET compare_files POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy "./\$(Configuration)/compare_files.exe" "./"
+  )
+  add_custom_command(TARGET mse_pae POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy "./\$(Configuration)/mse_pae.exe" "./"
+  )
+  if (OJPH_ENABLE_TIFF_SUPPORT)
+      file(COPY "${TIFF_INCLUDE_DIR}\\..\\bin\\tiff.dll" DESTINATION "./")
+      file(COPY "${TIFF_INCLUDE_DIR}\\..\\bin\\tiffxx.dll" DESTINATION "./")
+      file(COPY "${TIFF_INCLUDE_DIR}\\..\\bin\\tiffd.dll" DESTINATION "./")
+      file(COPY "${TIFF_INCLUDE_DIR}\\..\\bin\\tiffxxd.dll" DESTINATION "./")
+  endif()
+else()
+  add_custom_command(TARGET test_executables POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE:ojph_expand>" "./"
+    COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE:ojph_compress>" "./"
+  )
+  if(EMSCRIPTEN)
+    add_custom_command(TARGET test_executables POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE_DIR:ojph_expand>/ojph_expand.wasm" "./"
+      COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE_DIR:ojph_compress>/ojph_compress.wasm" "./"
+    )
+  endif(EMSCRIPTEN)
+  if(CYGWIN OR MINGW)
+    add_custom_command(TARGET test_executables POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy "../bin/${CMAKE_SHARED_LIBRARY_PREFIX}gtest.dll" "./"
+      COMMAND ${CMAKE_COMMAND} -E copy "../bin/${CMAKE_SHARED_LIBRARY_PREFIX}gtest_main.dll" "./"
+      COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE:openjph>" "./"
+    )
+  endif()
+endif(MSVC)
diff --git a/tests/mse_pae.cmake b/tests/mse_pae.cmake
index 90cea824..8a4642ac 100644
--- a/tests/mse_pae.cmake
+++ b/tests/mse_pae.cmake
@@ -5,32 +5,46 @@
 project (mse_pae DESCRIPTION "A program to find MSE and peak absolute error between two images" LANGUAGES CXX)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
+include_directories(../src/apps/common)
+include_directories(../src/core/openjph)
 
 # Configure source files
-set(mse_pae mse_pae.cpp "../src/apps/others/ojph_img_io.cpp" "../src/core/others/ojph_message.cpp" "../src/core/others/ojph_file.cpp" "../src/core/others/ojph_mem.cpp" "../src/core/others/ojph_arch.cpp")
+set(SOURCES mse_pae.cpp "../src/apps/others/ojph_img_io.cpp" "../src/core/others/ojph_message.cpp" "../src/core/others/ojph_file.cpp" "../src/core/others/ojph_mem.cpp" "../src/core/others/ojph_mem_c.c" "../src/core/others/ojph_arch.cpp")
 set(OJPH_IMG_IO_SSE41 "../src/apps/others/ojph_img_io_sse41.cpp")
 set(OJPH_IMG_IO_AVX2 "../src/apps/others/ojph_img_io_avx2.cpp")
 
 # if SIMD are not disabled
-if(NOT OJPH_DISABLE_INTEL_SIMD)
-  list(APPEND mse_pae ${OJPH_IMG_IO_SSE41})
-  list(APPEND mse_pae ${OJPH_IMG_IO_AVX2})
-endif()
+if (NOT OJPH_DISABLE_SIMD)
+  if (("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_X86_64")
+    OR ("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_I386")
+    OR MULTI_GEN_X86_64)
+
+    if (NOT OJPH_DISABLE_SSE4)
+      list(APPEND SOURCES ${OJPH_IMG_IO_SSE41})
+    endif()
+    if (NOT OJPH_DISABLE_AVX2)
+      list(APPEND SOURCES ${OJPH_IMG_IO_AVX2})
+    endif()
+
+    # Set compilation flags
+    if (MSVC)
+      set_source_files_properties(../src/apps/others/ojph_img_io_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
+    else()
+      set_source_files_properties(../src/apps/others/ojph_img_io_sse41.cpp PROPERTIES COMPILE_FLAGS -msse4.1)
+      set_source_files_properties(../src/apps/others/ojph_img_io_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+    endif()
+  endif()
+
+  if (("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_ARM") OR MULTI_GEN_ARM64)
+
+  endif()
 
-# Set compilation flags
-if (MSVC)
-  set_source_files_properties(../src/apps/others/ojph_img_io_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
-else()
-  set_source_files_properties(../src/apps/others/ojph_img_io_sse41.cpp PROPERTIES COMPILE_FLAGS -msse4.1)
-  set_source_files_properties(../src/apps/others/ojph_img_io_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
 endif()
 
 # Add executable
-add_executable(mse_pae ${mse_pae})
+add_executable(mse_pae ${SOURCES})
 
 # Add tiff library if it is available
-IF( USE_TIFF )
+if( USE_TIFF )
   target_link_libraries (mse_pae ${TIFF_LIBRARIES})
-ELSE()
-  target_link_libraries (mse_pae)
-ENDIF()
\ No newline at end of file
+endif()
diff --git a/tests/mse_pae.cpp b/tests/mse_pae.cpp
index 9924aea8..f1b84e64 100644
--- a/tests/mse_pae.cpp
+++ b/tests/mse_pae.cpp
@@ -40,8 +40,8 @@
 #include <cstdlib>
 #include <stdexcept>
 #include <cctype>
-#include "../common/ojph_img_io.h"
-#include "../common/ojph_mem.h"
+#include "ojph_img_io.h"
+#include "ojph_mem.h"
 
 using namespace ojph;
 using namespace std;
@@ -60,7 +60,8 @@ struct img_info {
     width = height = 0;
     comps[0] = comps[1] = comps[2] = 0;
     format = UNDEFINED;
-    max_val = 0;
+    bit_depth = 0;
+    is_signed = false;
   }
   ~img_info() {
     for (ui32 i = 0; i < num_comps; ++i)
@@ -70,15 +71,16 @@ struct img_info {
     }
   }
   
-  void init(ui32 num_comps, size_t width, size_t height, ui32 max_val,
-            ui32 format=FORMAT444)
+  void init(ui32 num_comps, size_t width, size_t height, ui32 bit_depth,
+            bool is_signed, ui32 format=FORMAT444)
   {
     assert(num_comps <= 3 && comps[0] == NULL);
     this->num_comps = num_comps;
     this->width = width;
     this->height = height;
     this->format = format;
-    this->max_val = max_val;
+    this->bit_depth = bit_depth;
+    this->is_signed = is_signed;
     for (ui32 i = 0; i < num_comps; ++i)
       switch (format)
       {
@@ -114,7 +116,8 @@ struct img_info {
   point downsampling[3];
   si32 *comps[3];
   ui32 format;
-  ui32 max_val;
+  ui32 bit_depth;
+  bool is_signed;
 };
 
 bool is_pnm(const char *filename)
@@ -137,7 +140,7 @@ void load_ppm(const char *filename, img_info& img)
   ui32 num_comps = ppm.get_num_components();
   size_t width = ppm.get_width();
   size_t height = ppm.get_height();
-  img.init(num_comps, width, height, ppm.get_max_val());
+  img.init(num_comps, width, height, ppm.get_bit_depth(0), false);
   
   width = calc_aligned_size<si32, byte_alignment>(width);
   si32 *buffer = new si32[width];
@@ -259,7 +262,7 @@ void load_yuv(const char *filename, img_info& img)
   yuv.set_img_props(s, num_comps, num_comps, downsampling);  
   yuv.open(name_buf);
   
-  img.init(num_comps, s.w, s.h, (1 << bit_depth) - 1, format);
+  img.init(num_comps, s.w, s.h, bit_depth, false, format);
   
   size_t w = calc_aligned_size<si32, byte_alignment>(s.w);
   si32 *buffer = new si32[w];
@@ -281,12 +284,146 @@ void load_yuv(const char *filename, img_info& img)
   delete[] buffer;
 }
 
+// Returns true when filename looks like "<name>.rawl:<spec>", i.e. when the
+// five characters before the first ':' spell ".rawl" in any letter case.
+bool is_rawl(const char *filename)
+{
+  const char *p = strchr(filename, ':'); // p is either NULL or pointing to ':'
+  if (p == NULL || p - filename < 5 || p[-5] != '.')
+    return false;
+  // toupper() is undefined for negative char values, hence the casts
+  return toupper((unsigned char)p[-4]) == 'R' &&
+         toupper((unsigned char)p[-3]) == 'A' &&
+         toupper((unsigned char)p[-2]) == 'W' &&
+         toupper((unsigned char)p[-1]) == 'L';
+}
+
+// Prints the reminder of the .rawl specification that ends every parsing
+// error, optionally spelling out the valid num_comp values, and terminates.
+static void rawl_format_error(bool mention_num_comp)
+{
+  printf("A .rawl that does not have the expected format, which is\n");
+  if (mention_num_comp) {
+    printf(".rawl:widthxheightxbitdepthxsignedxnum_comp, where num_comp is\n");
+    printf("either 1 or 3\n");
+  }
+  else
+    printf(".rawl:widthxheightxbitdepthxsignedxnum_comp\n");
+  exit(-1);
+}
+
+// Returns a pointer to the character following the next 'x' in p.  When
+// there is no 'x', reports that the field called what was expected and
+// terminates.
+static const char *rawl_next_field(const char *p, const char *what,
+                                   bool mention_num_comp)
+{
+  p = strchr(p, 'x'); // p is either NULL or pointing to 'x'
+  if (p == NULL) {
+    printf("Expecting %s.\n", what);
+    rawl_format_error(mention_num_comp);
+  }
+  return p + 1;
+}
+
+// Reads img.num_comps consecutive planes of num_samples samples of type T
+// from the file called name, and widens every sample to si32 in img.comps[].
+template <typename T>
+static void read_rawl_planes(const char *name, img_info& img,
+                             size_t num_samples)
+{
+  T *buffer = new T[num_samples];
+  FILE *f = fopen(name, "rb");
+  if (f == NULL) {
+    printf("Error opening file %s\n", name);
+    exit(-1);
+  }
+
+  for (ui32 i = 0; i < img.num_comps; ++i)
+  {
+    if (fread(buffer, sizeof(T), num_samples, f) != num_samples) {
+      printf("Error reading from file %s\n", name);
+      exit(-1);
+    }
+    si32 *dp = img.comps[i];
+    for (size_t j = 0; j < num_samples; ++j)
+      dp[j] = (si32)buffer[j];
+  }
+  fclose(f);
+  delete[] buffer;
+}
+
+// Loads a headerless raw image described by
+//   <name>.rawl:<width>x<height>x<bitdepth>x<signed>x<num_comp>
+// where <signed> is 0 for unsigned samples and <num_comp> is 1 or 3.  The
+// file holds one plane per component, with 1, 2 or 4 bytes per sample for
+// bit depths of up to 8, up to 16, and above 16 respectively.  Every error
+// is reported on stdout and terminates the program.
+void load_rawl(const char *filename, img_info& img)
+{
+  const char *p = strchr(filename, ':'); // p is either NULL or pointing to ':'
+  const char *name_end = p;
+  if (p == NULL)
+    rawl_format_error(false);
+  ojph::size s;
+  ++p;
+  s.w = (ui32)atoi(p);
+  p = rawl_next_field(p, "image height", false);
+  s.h = (ui32)atoi(p);
+  p = rawl_next_field(p, "image bitdepth", false);
+  ui32 bit_depth = (ui32)atoi(p);
+  p = rawl_next_field(p, "signedness information (either 0 or 1)", true);
+  bool is_signed = *p != '0';
+  p = rawl_next_field(p, "number of components", true);
+  ui32 num_comps = (ui32)atoi(p);
+  if (num_comps != 1 && num_comps != 3)
+  {
+    printf("num_comp must be either 1 or 3, %s was supplied.\n", p);
+    rawl_format_error(true);
+  }
+  if (s.w == 0 || s.h == 0 || bit_depth == 0 || bit_depth > 32)
+  {
+    printf("width, height, and bitdepth must be positive, and bitdepth\n");
+    printf("cannot exceed 32.\n");
+    rawl_format_error(false);
+  }
+
+  char name_buf[2048];
+  ptrdiff_t cpy_len = name_end - filename > 2047 ? 2047 : name_end - filename;
+  strncpy(name_buf, filename, (size_t)cpy_len);
+  name_buf[cpy_len] = 0;
+
+  img.init(num_comps, s.w, s.h, bit_depth, is_signed,
+           num_comps == 3 ? FORMAT444 : FORMAT400);
+
+  // computed in size_t so that large images do not wrap around in 32 bits
+  const size_t num_samples = (size_t)s.w * (size_t)s.h;
+  if (is_signed)
+  {
+    if (bit_depth <= 8)
+      read_rawl_planes<si8>(name_buf, img, num_samples);
+    else if (bit_depth <= 16)
+      read_rawl_planes<si16>(name_buf, img, num_samples);
+    else
+      read_rawl_planes<si32>(name_buf, img, num_samples);
+  }
+  else
+  {
+    if (bit_depth <= 8)
+      read_rawl_planes<ui8>(name_buf, img, num_samples);
+    else if (bit_depth <= 16)
+      read_rawl_planes<ui16>(name_buf, img, num_samples);
+    else
+      read_rawl_planes<ui32>(name_buf, img, num_samples);
+  }
+}
+
 void find_mse_pae(const img_info& img1, const img_info& img2, 
                   float mse[3], ui32 pae[3])
 {
   if (img1.num_comps != img2.num_comps || img1.format != img2.format ||
       img1.width != img2.width || img1.height != img2.height ||
-      img1.max_val != img2.max_val)
+      img1.bit_depth != img2.bit_depth || img1.is_signed != img2.is_signed)
   {
     printf("Error: mismatching images\n");
     exit(-1);
@@ -298,26 +526,99 @@ void find_mse_pae(const img_info& img1, const img_info& img2,
     h = (img1.height + img1.downsampling[c].x - 1) / img1.downsampling[c].x;
     double se = 0;
     ui32 lpae = 0;
-    for (ui32 v = 0; v < h; ++v)
-    {
-      si32 *p0 = img1.comps[c] + w * v;
-      si32 *p1 = img2.comps[c] + w * v;
-      for (ui32 s = 0; s < w; ++s)
+    if (img1.is_signed)
+      for (ui32 v = 0; v < h; ++v)
       {
-        si32 err = *p0++ - *p1++;
-        ui32 ae = (ui32)(err > 0 ? err : -err);
-        lpae = ae > lpae ? ae : lpae;
-        se += (double)err * (double)err;
+        si32 *p0 = img1.comps[c] + w * v;
+        si32 *p1 = img2.comps[c] + w * v;
+        for (ui32 s = 0; s < w; ++s)
+        {
+          si32 err = *p0++ - *p1++;
+          ui32 ae = (ui32)(err > 0 ? err : -err);
+          lpae = ae > lpae ? ae : lpae;
+          se += (double)err * (double)err;
+        }
+      }
+    else
+      for (ui32 v = 0; v < h; ++v)
+      {
+        ui32 *p0 = (ui32*)img1.comps[c] + w * v;
+        ui32 *p1 = (ui32*)img2.comps[c] + w * v;
+        for (ui32 s = 0; s < w; ++s)
+        {
+          ui32 a = *p0++;
+          ui32 b = *p1++;
+          ui32 err = a > b ? a - b : b - a;
+          lpae = err > lpae ? err : lpae;
+          se += (double)err * (double)err;
+        }
       }
-    }
     mse[c] = (float)se / (float)(w * h);
     pae[c] = lpae;
   }
-  // float t = 0;
-  // for (ui32 c = 0; c < img1.num_comps; ++c)
-  //   t += (float)mse[c];
-  // t /= (float)num_pixels;
-  // psnr = 10.0f * log10f((float)img1.max_val * (float)img1.max_val / t);
+}
+
+// Computes, for every component c, the mean squared error mse[c] and the
+// peak absolute error pae[c] between img1 and img2.  For signed images,
+// negative samples v are first remapped to -v - (2^(bit_depth-1) + 1).
+// Mismatching images are reported and terminate the program.
+void find_nlt_mse_pae(const img_info& img1, const img_info& img2,
+                      float mse[3], ui32 pae[3])
+{
+  bool same_geometry = img1.num_comps == img2.num_comps &&
+    img1.format == img2.format &&
+    img1.width == img2.width && img1.height == img2.height;
+  bool same_samples =
+    img1.bit_depth == img2.bit_depth && img1.is_signed == img2.is_signed;
+  if (!same_geometry || !same_samples)
+  {
+    printf("Error: mismatching images\n");
+    exit(-1);
+  }
+
+  for (ui32 c = 0; c < img1.num_comps; ++c)
+  {
+    // NOTE(review): the height is derived from downsampling[c].x, not .y;
+    // kept as is -- confirm against how img_info::init sizes the planes.
+    const size_t w =
+      (img1.width + img1.downsampling[c].x - 1) / img1.downsampling[c].x;
+    const size_t h =
+      (img1.height + img1.downsampling[c].x - 1) / img1.downsampling[c].x;
+    const size_t count = w * h; // rows are contiguous, so scan the plane flat
+    double sq_err_sum = 0;
+    ui32 peak = 0;
+    if (img1.is_signed)
+    {
+      const si32 bias = (si32)((1ULL << (img1.bit_depth - 1)) + 1);
+      const si32 *q0 = img1.comps[c];
+      const si32 *q1 = img2.comps[c];
+      for (size_t k = 0; k < count; ++k)
+      {
+        si32 a = q0[k];
+        si32 b = q1[k];
+        if (a < 0) a = - a - bias;
+        if (b < 0) b = - b - bias;
+        ui32 err = (ui32)(a > b ? a - b : b - a);
+        if (err > peak) peak = err;
+        sq_err_sum += (double)err * (double)err;
+      }
+    }
+    else
+    {
+      const ui32 *q0 = (ui32*)img1.comps[c];
+      const ui32 *q1 = (ui32*)img2.comps[c];
+      for (size_t k = 0; k < count; ++k)
+      {
+        ui32 a = q0[k];
+        ui32 b = q1[k];
+        ui32 err = a > b ? a - b : b - a;
+        if (err > peak) peak = err;
+        sq_err_sum += (double)err * (double)err;
+      }
+    }
+    mse[c] = (float)sq_err_sum / (float)count;
+    pae[c] = peak;
+  }
+}
 
 int main(int argc, char *argv[])
@@ -325,20 +626,36 @@ int main(int argc, char *argv[])
   if (argc < 3)
   {
     printf("mse_pae expects two arguments <filename1, filename2>\n");
+    printf("A third optional argment is \"-nlt\".\n");
     exit(-1);
   }
-    
+
+  bool nlt = false;
+  if (argc == 4)
+  {
+    if (strcmp("-nlt", argv[3]) == 0)
+      nlt = true;
+    else {
+      printf("unknown 4th parameter %s\n", argv[3]);
+      exit(-1);      
+    }
+  }
+
+
   img_info img1, img2;
   try {
     if (is_pnm(argv[1]))
       load_ppm(argv[1], img1);
     else if (is_yuv(argv[1]))
       load_yuv(argv[1], img1);
+    else if (is_rawl(argv[1]))
+      load_rawl(argv[1], img1);
     else {
       printf("mse_pae does not know file format of %s\n", argv[1]);
       printf("or a .yuv that does not have the expected format, which is\n");
       printf(".yuv:widthxheightxbitdepthxformat, where format is\n");
-      printf("either 444, 422, or 420\n");
+      printf("either 444, 422, or 420, or wrongly format .rawl, which has\n");
+      printf(".rawl:widthxheightxbitdepthxsignedxnum_comp format.\n");
       exit(-1);  
     }
   }
@@ -355,11 +672,14 @@ int main(int argc, char *argv[])
       load_ppm(argv[2], img2);
     else if (is_yuv(argv[2]))
       load_yuv(argv[2], img2);
+    else if (is_rawl(argv[2]))
+      load_rawl(argv[2], img2);
     else {
       printf("mse_pae does not know file format of %s\n", argv[2]);
       printf("or a .yuv that does not have the expected format, which is\n");
       printf(".yuv:widthxheightxbitdepthxformat, where format is\n");
-      printf("either 444, 422, or 420\n");
+      printf("either 444, 422, or 420, or wrongly format .rawl, which has\n");
+      printf(".rawl:widthxheightxbitdepthxsignedxnum_comp format.\n");
       exit(-1);  
     }
   }
@@ -372,7 +692,10 @@ int main(int argc, char *argv[])
   }  
   
   float mse[3]; ui32 pae[3];
-  find_mse_pae(img1, img2, mse, pae);
+  if (!nlt)
+    find_mse_pae(img1, img2, mse, pae);
+  else
+    find_nlt_mse_pae(img1, img2, mse, pae);
   
   for (ui32 c = 0; c < img1.num_comps; ++c)
     printf("%f %d\n", mse[c], pae[c]);
diff --git a/tests/test.py b/tests/test.py
deleted file mode 100644
index fb8c0df9..00000000
--- a/tests/test.py
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/python3
-
-import numpy as np
-import cv2
-import matplotlib.pyplot as plt
-
-# import os
-# os.system('')
-# subprocess.run
-
-print('Testing in Python')
-
-im = cv2.imread("../../ARRI_AlexaDrums_3840x2160p_24_12b_P3_444_00000.ppm", cv2.IMREAD_UNCHANGED );
-hist, bin_edges = np.histogram(im.astype('int32'), bins=range(4096));
-_ = plt.hist(hist, bin_edges);
diff --git a/tests/test_executables.cpp b/tests/test_executables.cpp
index 696daadd..22f148e5 100644
--- a/tests/test_executables.cpp
+++ b/tests/test_executables.cpp
@@ -44,7 +44,7 @@
 // STATIC                         ojph_popen
 ////////////////////////////////////////////////////////////////////////////////
 static inline
-FILE *ojph_popen(const char *command, const char *modes) 
+FILE* ojph_popen(const char* command, const char* modes)
 {
 #ifdef OJPH_COMPILER_MSVC
   return _popen(command, modes);
@@ -57,7 +57,7 @@ FILE *ojph_popen(const char *command, const char *modes)
 // STATIC                         ojph_pclose
 ////////////////////////////////////////////////////////////////////////////////
 static inline
-int ojph_pclose(FILE *stream) 
+int ojph_pclose(FILE* stream)
 {
 #ifdef OJPH_COMPILER_MSVC
   return _pclose(stream);
@@ -69,16 +69,16 @@ int ojph_pclose(FILE *stream)
 ////////////////////////////////////////////////////////////////////////////////
 // STATIC                           execute
 ////////////////////////////////////////////////////////////////////////////////
-static 
-int execute(const std::string& cmd, std::string& result) 
+static
+int execute(const std::string& cmd, std::string& result)
 {
   std::array<char, 128> buffer;
   result.clear();
 
   FILE* pipe = ojph_popen(cmd.c_str(), "r");
-  if (!pipe) 
+  if (!pipe)
     throw std::runtime_error("ojph_popen() failed!");
-  
+
   while (!feof(pipe))
     if (fgets(buffer.data(), 128, pipe) != nullptr)
       result += buffer.data();
@@ -94,21 +94,40 @@ int execute(const std::string& cmd, std::string& result)
 ////////////////////////////////////////////////////////////////////////////////
 
 #ifdef OJPH_OS_WINDOWS
-	#define SRC_FILE_DIR ".\\jp2k_test_codestreams\\openjph\\"
-	#define OUT_FILE_DIR ".\\"
-	#define REF_FILE_DIR ".\\jp2k_test_codestreams\\openjph\\references\\"
-	#define MSE_PAE_PATH  ".\\Release\\mse_pae"
-	#define COMPARE_FILES_PATH  ".\\Release\\compare_files"
-	#define EXPAND_EXECUTABLE "..\\..\\bin\\Release\\ojph_expand.exe"
-	#define COMPRESS_EXECUTABLE "..\\..\\bin\\Release\\ojph_compress.exe"
+#define SRC_FILE_DIR ".\\jp2k_test_codestreams\\openjph\\"
+#define OUT_FILE_DIR ".\\"
+#define REF_FILE_DIR ".\\jp2k_test_codestreams\\openjph\\references\\"
+#define MSE_PAE_PATH  ".\\mse_pae"
+#define COMPARE_FILES_PATH  ".\\compare_files"
+#define EXPAND_EXECUTABLE ".\\ojph_expand.exe"
+#define COMPRESS_EXECUTABLE ".\\ojph_compress.exe"
 #else
-	#define SRC_FILE_DIR "./jp2k_test_codestreams/openjph/"
-	#define OUT_FILE_DIR "./"
-	#define REF_FILE_DIR "./jp2k_test_codestreams/openjph/references/"
-	#define MSE_PAE_PATH  "./mse_pae"
-	#define COMPARE_FILES_PATH  "./compare_files"
-	#define EXPAND_EXECUTABLE "../../bin/ojph_expand"
-	#define COMPRESS_EXECUTABLE "../../bin/ojph_compress"
+#define SRC_FILE_DIR "./jp2k_test_codestreams/openjph/"
+#define OUT_FILE_DIR "./"
+#define REF_FILE_DIR "./jp2k_test_codestreams/openjph/references/"
+#define MSE_PAE_PATH  "./mse_pae"
+#define COMPARE_FILES_PATH  "./compare_files"
+
+// This is a comment to me, to help with emscripten testing.
+// This is written after the completion of the tests.
+// 1. Compile for the target platform (Linux), selecting from the following
+//    code the version that suits you; in particular it should be the one
+//    that uses node.  Ideally create two versions of test_executables, one
+//    for WASM SIMD, and one for WASM without SIMD -- use linux cp command to
+//    create test_executables_simd and test_executables_no_simd
+// 2. Compile again, without deleting what compiled; this time compile using
+//    emscripten, targeting WASM.  The compilation is very finicky, do
+//    'make clean && make' after every change in code.
+// 3. cd to tests, and run test_executables_simd or test_executables_no_simd.
+
+#define EXPAND_EXECUTABLE "./ojph_expand"
+#define COMPRESS_EXECUTABLE "./ojph_compress"
+//#define EXPAND_EXECUTABLE "20.18.0_64bit/bin/node ./ojph_expand.js"
+//#define COMPRESS_EXECUTABLE "20.18.0_64bit/bin/node ./ojph_compress.js"
+//#define EXPAND_EXECUTABLE "node-v18.7.0-linux-x64/bin/node ./ojph_expand_simd.js"
+//#define COMPRESS_EXECUTABLE "node-v18.7.0-linux-x64/bin/node ./ojph_compress_simd.js"
+//#define EXPAND_EXECUTABLE "./../../../sde/sde64 -skx -- ./ojph_expand"
+//#define COMPRESS_EXECUTABLE "./../../../sde/sde64 -skx -- ./ojph_compress"
 #endif
 #define TOL_DOUBLE 0.01
 #define TOL_INTEGER 1
@@ -116,22 +135,21 @@ int execute(const std::string& cmd, std::string& result)
 ////////////////////////////////////////////////////////////////////////////////
 //                            run_ojph_compress
 ////////////////////////////////////////////////////////////////////////////////
-void run_ojph_compress(const std::string& ref_filename, 
-                       const std::string& base_filename, 
-                       const std::string& extended_base_fname, 
-                       const std::string& out_ext,
-                       const std::string& extra_options)
+void run_ojph_compress(const std::string& ref_filename,
+  const std::string& base_filename,
+  const std::string& extended_base_fname,
+  const std::string& out_ext,
+  const std::string& extra_options)
 {
   try {
     std::string result, command;
-    command = std::string(COMPRESS_EXECUTABLE) 
+    command = std::string(COMPRESS_EXECUTABLE)
       + " -i " + REF_FILE_DIR + ref_filename
-      + " -o " + OUT_FILE_DIR + base_filename + extended_base_fname + 
+      + " -o " + OUT_FILE_DIR + base_filename + extended_base_fname +
       "." + out_ext + " " + extra_options;
-    std::cerr << command << std::endl;
     EXPECT_EQ(execute(command, result), 0);
   }
-  catch(const std::runtime_error& error) {
+  catch (const std::runtime_error& error) {
     FAIL() << error.what();
   }
 }
@@ -139,18 +157,18 @@ void run_ojph_compress(const std::string& ref_filename,
 ////////////////////////////////////////////////////////////////////////////////
 //                            run_ojph_expand
 ////////////////////////////////////////////////////////////////////////////////
-void run_ojph_expand(const std::string& base_filename, 
-                     const std::string& src_ext,
-                     const std::string& out_ext)
+void run_ojph_expand(const std::string& base_filename,
+  const std::string& src_ext,
+  const std::string& out_ext)
 {
   try {
     std::string result, command;
-    command = std::string(EXPAND_EXECUTABLE) 
+    command = std::string(EXPAND_EXECUTABLE)
       + " -i " + SRC_FILE_DIR + base_filename + "." + src_ext
       + " -o " + OUT_FILE_DIR + base_filename + "." + out_ext;
     EXPECT_EQ(execute(command, result), 0);
   }
-  catch(const std::runtime_error& error) {
+  catch (const std::runtime_error& error) {
     FAIL() << error.what();
   }
 }
@@ -158,34 +176,34 @@ void run_ojph_expand(const std::string& base_filename,
 ////////////////////////////////////////////////////////////////////////////////
 //                            run_ojph_compress
 ////////////////////////////////////////////////////////////////////////////////
-void run_ojph_compress_expand(const std::string& base_filename, 
-                              const std::string& out_ext,
-                              const std::string& decode_ext)
+void run_ojph_compress_expand(const std::string& base_filename,
+  const std::string& out_ext,
+  const std::string& decode_ext)
 {
   try {
     std::string result, command;
-    command = std::string(EXPAND_EXECUTABLE) 
+    command = std::string(EXPAND_EXECUTABLE)
       + " -i " + OUT_FILE_DIR + base_filename + "." + out_ext
       + " -o " + OUT_FILE_DIR + base_filename + "." + decode_ext;
     EXPECT_EQ(execute(command, result), 0);
   }
-  catch(const std::runtime_error& error) {
+  catch (const std::runtime_error& error) {
     FAIL() << error.what();
-  }  
+  }
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 //                             run_mse_pae
 ////////////////////////////////////////////////////////////////////////////////
-void run_mse_pae(const std::string& base_filename, 
-                 const std::string& out_ext, 
-                 const std::string& ref_filename, 
-                 const std::string& yuv_specs,
-                 int num_components, double* mse, int* pae) 
+void run_mse_pae(const std::string& base_filename,
+  const std::string& out_ext,
+  const std::string& ref_filename,
+  const std::string& yuv_specs,
+  int num_components, double* mse, int* pae)
 {
   try {
     std::string result, command;
-    command = std::string(MSE_PAE_PATH) 
+    command = std::string(MSE_PAE_PATH)
       + " " + OUT_FILE_DIR + base_filename + "." + out_ext + yuv_specs
       + " " + REF_FILE_DIR + ref_filename + yuv_specs;
     EXPECT_EQ(execute(command, result), 0);
@@ -214,7 +232,7 @@ void run_mse_pae(const std::string& base_filename,
         ++pos;
     }
   }
-  catch(const std::runtime_error& error) {
+  catch (const std::runtime_error& error) {
     FAIL() << error.what();
   }
 }
@@ -222,20 +240,20 @@ void run_mse_pae(const std::string& base_filename,
 ////////////////////////////////////////////////////////////////////////////////
 //                             compare_files
 ////////////////////////////////////////////////////////////////////////////////
-void compare_files(const std::string& base_filename, 
-                   const std::string& extended_base_fname, 
-                   const std::string& ext) 
+void compare_files(const std::string& base_filename,
+  const std::string& extended_base_fname,
+  const std::string& ext)
 {
   try {
     std::string result, command;
-    command = std::string(COMPARE_FILES_PATH) 
+    command = std::string(COMPARE_FILES_PATH)
       + " " + OUT_FILE_DIR + base_filename + extended_base_fname + "." + ext
       + " " + SRC_FILE_DIR + base_filename + "." + ext;
     EXPECT_EQ(execute(command, result), 0);
   }
-  catch(const std::runtime_error& error) {
+  catch (const std::runtime_error& error) {
     FAIL() << error.what();
-  }  
+  }
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -249,7 +267,7 @@ TEST(TestExecutables, OpenJPHCompressNoArguments) {
     std::string result;
     EXPECT_EQ(execute(COMPRESS_EXECUTABLE, result), 1);
   }
-  catch(const std::runtime_error& error) {
+  catch (const std::runtime_error& error) {
     FAIL() << error.what();
   }
 }
@@ -261,7 +279,7 @@ TEST(TestExecutables, OpenJPHExpandNoArguments) {
     std::string result;
     EXPECT_EQ(execute(EXPAND_EXECUTABLE, result), 1);
   }
-  catch(const std::runtime_error& error) {
+  catch (const std::runtime_error& error) {
     FAIL() << error.what();
   }
 }
@@ -823,6 +841,22 @@ TEST(TestExecutables, SimpleDecRev5364x6416bitGray) {
               "", 1, mse, pae);
 }
 
+///////////////////////////////////////////////////////////////////////////////
+// Test ojph_expand with codeblocks when the irv53 wavelet is used.
+// Command-line options used to obtain this file are:
+// -o simple_dec_irv53_bhvhb_low_latency.jph -quiet Corder=PCRL Clevels=5
+// Cmodes=HT|CAUSAL -rate 2 Catk=2 Kkernels:I2=I5X3
+// Cprecincts={16,8192},{8,8192},{4,8192} Cblk={8,256}
+// Cdecomp=B(-:-:-),H(-),V(-),H(-),B(-:-:-) Qstep=0.0001 -precise -no_weights
+// -tolerance 0
+TEST(TestExecutables, SimpleDecIrv53BhvhbLowLatency) {
+  double mse[3] = { 5.52392, 4.01405, 6.8166};
+  int pae[3] = { 16, 17, 23};
+  run_ojph_expand("simple_dec_irv53_bhvhb_low_latency", "jph", "ppm");
+  run_mse_pae("simple_dec_irv53_bhvhb_low_latency", "ppm", "Malamute.ppm",
+              "", 3, mse, pae);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Test ojph_compress with codeblocks when the irv97 wavelet is used.
 // We test by comparing MSE and PAE of decoded images. 
@@ -1015,6 +1049,40 @@ TEST(TestExecutables, SimpleEncIrv9732x128) {
               "Malamute.ppm", "", 3, mse, pae);
 }
 
+///////////////////////////////////////////////////////////////////////////////
+// Test ojph_compress with codeblocks when the irv97 wavelet is used.
+// We test by comparing MSE and PAE of decoded images. 
+// The compressed file is obtained using these command-line options:
+// -o simple_enc_irv97_64x64_tiles_33x33_d5.j2c -qstep 0.01 -tile_size {33,33}
+// -num_decomps 5
+TEST(TestExecutables, SimpleEncIrv9764x64Tiles33x33D5) {
+  double mse[3] = { 1.88906, 1.30757, 2.5347};
+  int pae[3] = { 9, 6, 10};
+  run_ojph_compress("Malamute.ppm",
+                    "simple_enc_irv97_64x64_tiles_33x33_d5", "", "j2c",
+                    "-qstep 0.01 -tile_size \"{33,33}\" -num_decomps 5");
+  run_ojph_compress_expand("simple_enc_irv97_64x64_tiles_33x33_d5", "j2c", "ppm");
+  run_mse_pae("simple_enc_irv97_64x64_tiles_33x33_d5", "ppm",
+              "Malamute.ppm", "", 3, mse, pae);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Test ojph_compress with codeblocks when the irv97 wavelet is used.
+// We test by comparing MSE and PAE of decoded images. 
+// The compressed file is obtained using these command-line options:
+// -o simple_enc_irv97_64x64_tiles_33x33_d6.j2c -qstep 0.01 -tile_size {33,33}
+// -num_decomps 6
+TEST(TestExecutables, SimpleEncIrv9764x64Tiles33x33D6) {
+  double mse[3] = { 1.88751, 1.30673, 2.53378};
+  int pae[3] = { 8, 6, 10};
+  run_ojph_compress("Malamute.ppm",
+                    "simple_enc_irv97_64x64_tiles_33x33_d6", "", "j2c",
+                    "-qstep 0.01 -tile_size \"{33,33}\" -num_decomps 6");
+  run_ojph_compress_expand("simple_enc_irv97_64x64_tiles_33x33_d6", "j2c", "ppm");
+  run_mse_pae("simple_enc_irv97_64x64_tiles_33x33_d6", "ppm",
+              "Malamute.ppm", "", 3, mse, pae);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Test ojph_compress with codeblocks when the irv97 wavelet is used.
 // We test by comparing MSE and PAE of decoded images. 
@@ -1159,6 +1227,40 @@ TEST(TestExecutables, SimpleEncRev534x1024) {
               "Malamute.ppm", "", 3, mse, pae);
 }
 
+///////////////////////////////////////////////////////////////////////////////
+// Test ojph_compress with codeblocks when the rev53 wavelet is used.
+// We test by comparing MSE and PAE of decoded images. 
+// The compressed file is obtained using these command-line options:
+// -o simple_enc_rev53_64x64_tiles_33x33_d5.j2c -reversible true -tile_size
+// {32,32} -num_decomps 5
+TEST(TestExecutables, SimpleEncRev5364x64Tiles33x33D5) {
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
+  run_ojph_compress("Malamute.ppm",
+                    "simple_enc_rev53_64x64_tiles_33x33_d5", "", "j2c",
+                    "-reversible true -tile_size \"{32,32}\" -num_decomps 5");
+  run_ojph_compress_expand("simple_enc_rev53_64x64_tiles_33x33_d5", "j2c", "ppm");
+  run_mse_pae("simple_enc_rev53_64x64_tiles_33x33_d5", "ppm",
+              "Malamute.ppm", "", 3, mse, pae);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Test ojph_compress with codeblocks when the rev53 wavelet is used.
+// We test by comparing MSE and PAE of decoded images. 
+// The compressed file is obtained using these command-line options:
+// -o simple_enc_rev53_64x64_tiles_33x33_d6.j2c -reversible true -tile_size
+// {32,32} -num_decomps 6
+TEST(TestExecutables, SimpleEncRev5364x64Tiles33x33D6) {
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
+  run_ojph_compress("Malamute.ppm",
+                    "simple_enc_rev53_64x64_tiles_33x33_d6", "", "j2c",
+                    "-reversible true -tile_size \"{32,32}\" -num_decomps 6");
+  run_ojph_compress_expand("simple_enc_rev53_64x64_tiles_33x33_d6", "j2c", "ppm");
+  run_mse_pae("simple_enc_rev53_64x64_tiles_33x33_d6", "ppm",
+              "Malamute.ppm", "", 3, mse, pae);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Test ojph_compress with codeblocks when the irv97 wavelet is used.
 // We test by comparing MSE and PAE of decoded images. 
@@ -1220,8 +1322,8 @@ TEST(TestExecutables, SimpleEncIrv97TallNarrow) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_tall_narrow1.j2c -image_offset {1,0} -qstep 0.1
 TEST(TestExecutables, SimpleEncIrv97TallNarrow1) {
-  double mse[3] = { 96.7935, 69.6824, 66.7822};
-  int pae[3] = { 41, 39, 35};
+  double mse[3] = { 100.906, 76.113, 72.8347};
+  int pae[3] = { 39, 35, 34};
   run_ojph_compress("tall_narrow.ppm",
                     "simple_enc_irv97_tall_narrow1", "", "j2c",
                     "-image_offset \"{1,0}\" -qstep 0.1");
@@ -1361,7 +1463,7 @@ TEST(TestExecutables, DpxEnc1280x72016bitResolve18) {
 ////////////////////////////////////////////////////////////////////////////////
 //                                   main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tests/test_helpers/convert_mse_pae_to_tests.cpp b/tests/test_helpers/convert_mse_pae_to_tests.cpp
index 25bf084c..630b6230 100644
--- a/tests/test_helpers/convert_mse_pae_to_tests.cpp
+++ b/tests/test_helpers/convert_mse_pae_to_tests.cpp
@@ -200,8 +200,11 @@ void process_cmdlines(std::ifstream& file,
 
       start_pos = line.find(":");
       if (start_pos != std::string::npos) {
-        size_t end_pos = line.find("\"", start_pos);
-        yuv_specs = line.substr(start_pos, end_pos - start_pos);
+        if (std::isdigit(line.at(start_pos + 1)))
+        {
+          size_t end_pos = line.find("\"", start_pos);
+          yuv_specs = line.substr(start_pos, end_pos - start_pos);
+        }
       }
       break;
     }
diff --git a/tests/test_helpers/ht_cmdlines.txt b/tests/test_helpers/ht_cmdlines.txt
index 55b8e865..3b94c887 100644
--- a/tests/test_helpers/ht_cmdlines.txt
+++ b/tests/test_helpers/ht_cmdlines.txt
@@ -52,33 +52,38 @@ add_test(NAME simple_dec_irv97_64x64_16bit_gray COMMAND ${CMAKE_CURRENT_SOURCE_D
 add_test(NAME simple_dec_rev53_64x64_16bit COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -rdec      "-i ${images_folder}/mm.ppm -o simple_dec_rev53_64x64_16bit.jph      -precise -quiet Creversible=yes -full"  "-i simple_dec_rev53_64x64_16bit.jph      -o test1.ppm -precise -quiet" "-i simple_dec_rev53_64x64_16bit.jph      -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
 add_test(NAME simple_dec_rev53_64x64_16bit_gray COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -rdec "-i ${images_folder}/mm.pgm -o simple_dec_rev53_64x64_16bit_gray.jph -precise -quiet Creversible=yes -full"  "-i simple_dec_rev53_64x64_16bit_gray.jph -o test1.pgm -precise -quiet" "-i simple_dec_rev53_64x64_16bit_gray.jph -o test2.pgm" "${images_folder}/mm.pgm" "test1.pgm" "test2.pgm")
 
+add_test(NAME simple_dec_irv53_bhvhb_low_latency COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -dec "-i ${images_folder}/mm.ppm -o simple_dec_irv53_bhvhb_low_latency.jph -quiet Corder=PCRL Clevels=5 Cmodes=HT|CAUSAL -rate 2 Catk=2 Kkernels:I2=I5X3 Cprecincts=\{16,8192\},\{8,8192\},\{4,8192\} Cblk=\{8,256\} Cdecomp=B(-:-:-),H(-),V(-),H(-),B(-:-:-) Qstep=0.0001 -precise -no_weights -tolerance 0"  "-i simple_dec_irv53_bhvhb_low_latency.jph -o test1.ppm -precise -quiet" "-i simple_dec_irv53_bhvhb_low_latency.jph -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
 
 #############################################################
 # Encoding
 #############################################################
 
-add_test(NAME simple_enc_irv97_64x64  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_64x64.j2c            -qstep 0.1"                         "-i simple_enc_irv97_64x64.j2c            -o test1.ppm -precise -quiet" "-i simple_enc_irv97_64x64.j2c            -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_32x32  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_32x32.j2c            -qstep 0.01 -block_size \{32,32\}"  "-i simple_enc_irv97_32x32.j2c            -o test1.ppm -precise -quiet" "-i simple_enc_irv97_32x32.j2c            -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_16x16  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_16x16.j2c            -qstep 0.01 -block_size \{16,16\}"  "-i simple_enc_irv97_16x16.j2c            -o test1.ppm -precise -quiet" "-i simple_enc_irv97_16x16.j2c            -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_4x4    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_4x4.j2c              -qstep 0.01 -block_size \{4,4\}"    "-i simple_enc_irv97_4x4.j2c              -o test1.ppm -precise -quiet" "-i simple_enc_irv97_4x4.j2c              -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_1024x4 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_1024x4.j2c           -qstep 0.01 -block_size \{4,1024\}" "-i simple_enc_irv97_1024x4.j2c           -o test1.ppm -precise -quiet" "-i simple_enc_irv97_1024x4.j2c           -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_4x1024 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_4x1024.j2c           -qstep 0.01 -block_size \{1024,4\}" "-i simple_enc_irv97_4x1024.j2c           -o test1.ppm -precise -quiet" "-i simple_enc_irv97_4x1024.j2c           -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_512x8  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_512x8.j2c            -qstep 0.01 -block_size \{8,512\}"  "-i simple_enc_irv97_512x8.j2c            -o test1.ppm -precise -quiet" "-i simple_enc_irv97_512x8.j2c            -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_8x512  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_8x512.j2c            -qstep 0.01 -block_size \{512,8\}"  "-i simple_enc_irv97_8x512.j2c            -o test1.ppm -precise -quiet" "-i simple_enc_irv97_8x512.j2c            -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_256x16 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_256x16.j2c           -qstep 0.01 -block_size \{16,256\}" "-i simple_enc_irv97_256x16.j2c           -o test1.ppm -precise -quiet" "-i simple_enc_irv97_256x16.j2c           -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_16x256 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_16x256.j2c           -qstep 0.01 -block_size \{256,16\}" "-i simple_enc_irv97_16x256.j2c           -o test1.ppm -precise -quiet" "-i simple_enc_irv97_16x256.j2c           -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_128x32 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_128x32.j2c           -qstep 0.01 -block_size \{32,128\}" "-i simple_enc_irv97_128x32.j2c           -o test1.ppm -precise -quiet" "-i simple_enc_irv97_128x32.j2c           -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_32x128 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_32x128.j2c           -qstep 0.01 -block_size \{128,32\}" "-i simple_enc_irv97_32x128.j2c           -o test1.ppm -precise -quiet" "-i simple_enc_irv97_32x128.j2c           -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_64x64_16bit COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc        "-i ${images_folder}/mm.ppm  -o simple_enc_irv97_64x64_16bit.j2c      -qstep 0.01"                        "-i simple_enc_irv97_64x64_16bit.j2c      -o test1.ppm -precise -quiet" "-i simple_enc_irv97_64x64_16bit.j2c      -o test2.ppm" "${images_folder}/mm.ppm"  "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_64x64_16bit_gray COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc   "-i ${images_folder}/mm.pgm  -o simple_enc_irv97_64x64_16bit_gray.j2c -qstep 0.01"                        "-i simple_enc_irv97_64x64_16bit_gray.j2c -o test1.pgm -precise -quiet" "-i simple_enc_irv97_64x64_16bit_gray.j2c -o test2.pgm" "${images_folder}/mm.pgm"  "test1.pgm" "test2.pgm")
-add_test(NAME simple_enc_rev53_64x64_16bit  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc      "-i ${images_folder}/mm.ppm  -o simple_enc_rev53_64x64_16bit.j2c      -reversible true"                   "-i simple_enc_rev53_64x64_16bit.j2c      -o test1.ppm -precise -quiet" "-i simple_enc_rev53_64x64_16bit.j2c      -o test2.ppm" "${images_folder}/mm.ppm"  "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_rev53_64x64_16bit_gray  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.pgm  -o simple_enc_rev53_64x64_16bit_gray.j2c -reversible true"                   "-i simple_enc_rev53_64x64_16bit_gray.j2c -o test1.pgm -precise -quiet" "-i simple_enc_rev53_64x64_16bit_gray.j2c -o test2.pgm" "${images_folder}/mm.pgm"  "test1.pgm" "test2.pgm")
-
-add_test(NAME simple_enc_rev53_64x64  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_64x64.j2c  -reversible true"                        "-i simple_enc_rev53_64x64.j2c  -o test1.ppm -precise -quiet" "-i simple_enc_rev53_64x64.j2c  -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_rev53_32x32  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_32x32.j2c  -reversible true -block_size \{32,32\}"  "-i simple_enc_rev53_32x32.j2c  -o test1.ppm -precise -quiet" "-i simple_enc_rev53_32x32.j2c  -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_rev53_4x4    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_4x4.j2c    -reversible true -block_size \{4,4\}"    "-i simple_enc_rev53_4x4.j2c    -o test1.ppm -precise -quiet" "-i simple_enc_rev53_4x4.j2c    -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_rev53_1024x4 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_1024x4.j2c -reversible true -block_size \{4,1024\}" "-i simple_enc_rev53_1024x4.j2c -o test1.ppm -precise -quiet" "-i simple_enc_rev53_1024x4.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_rev53_4x1024 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_4x1024.j2c -reversible true -block_size \{1024,4\}" "-i simple_enc_rev53_4x1024.j2c -o test1.ppm -precise -quiet" "-i simple_enc_rev53_4x1024.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_64x64  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_64x64.j2c             -qstep 0.1"                                           "-i simple_enc_irv97_64x64.j2c             -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_64x64.j2c                -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_32x32  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_32x32.j2c             -qstep 0.01 -block_size \{32,32\}"                    "-i simple_enc_irv97_32x32.j2c             -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_32x32.j2c                -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_16x16  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_16x16.j2c             -qstep 0.01 -block_size \{16,16\}"                    "-i simple_enc_irv97_16x16.j2c             -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_16x16.j2c                -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_4x4    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_4x4.j2c               -qstep 0.01 -block_size \{4,4\}"                      "-i simple_enc_irv97_4x4.j2c               -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_4x4.j2c                  -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_1024x4 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_1024x4.j2c            -qstep 0.01 -block_size \{4,1024\}"                   "-i simple_enc_irv97_1024x4.j2c            -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_1024x4.j2c               -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_4x1024 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_4x1024.j2c            -qstep 0.01 -block_size \{1024,4\}"                   "-i simple_enc_irv97_4x1024.j2c            -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_4x1024.j2c               -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_512x8  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_512x8.j2c             -qstep 0.01 -block_size \{8,512\}"                    "-i simple_enc_irv97_512x8.j2c             -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_512x8.j2c                -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_8x512  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_8x512.j2c             -qstep 0.01 -block_size \{512,8\}"                    "-i simple_enc_irv97_8x512.j2c             -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_8x512.j2c                -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_256x16 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_256x16.j2c            -qstep 0.01 -block_size \{16,256\}"                   "-i simple_enc_irv97_256x16.j2c            -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_256x16.j2c               -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_16x256 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_16x256.j2c            -qstep 0.01 -block_size \{256,16\}"                   "-i simple_enc_irv97_16x256.j2c            -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_16x256.j2c               -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_128x32 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_128x32.j2c            -qstep 0.01 -block_size \{32,128\}"                   "-i simple_enc_irv97_128x32.j2c            -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_128x32.j2c               -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_32x128 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_32x128.j2c            -qstep 0.01 -block_size \{128,32\}"                   "-i simple_enc_irv97_32x128.j2c            -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_32x128.j2c               -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_64x64_tiles_33x33_d5 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc   "-i ${images_folder}/mm.ppm -o simple_enc_irv97_64x64_tiles_33x33_d5.j2c -qstep 0.01 -tile_size \{33,33\} -num_decomps 5"   "-i simple_enc_irv97_64x64_tiles_33x33_d5.j2c -o test1.ppm -precise -quiet" "-i simple_enc_irv97_64x64_tiles_33x33_d5.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_64x64_tiles_33x33_d6 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc   "-i ${images_folder}/mm.ppm -o simple_enc_irv97_64x64_tiles_33x33_d6.j2c -qstep 0.01 -tile_size \{33,33\} -num_decomps 6"   "-i simple_enc_irv97_64x64_tiles_33x33_d6.j2c -o test1.ppm -precise -quiet" "-i simple_enc_irv97_64x64_tiles_33x33_d6.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_64x64_16bit COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc            "-i ${images_folder}/mm.ppm  -o simple_enc_irv97_64x64_16bit.j2c      -qstep 0.01"                                          "-i simple_enc_irv97_64x64_16bit.j2c       -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_64x64_16bit.j2c          -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_64x64_16bit_gray COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc       "-i ${images_folder}/mm.pgm  -o simple_enc_irv97_64x64_16bit_gray.j2c -qstep 0.01"                                          "-i simple_enc_irv97_64x64_16bit_gray.j2c  -o test1.pgm -precise -quiet"    "-i simple_enc_irv97_64x64_16bit_gray.j2c     -o test2.pgm"    "${images_folder}/mm.pgm" "test1.pgm" "test2.pgm")
+add_test(NAME simple_enc_rev53_64x64_16bit  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc          "-i ${images_folder}/mm.ppm  -o simple_enc_rev53_64x64_16bit.j2c      -reversible true"                                     "-i simple_enc_rev53_64x64_16bit.j2c       -o test1.ppm -precise -quiet"    "-i simple_enc_rev53_64x64_16bit.j2c          -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_64x64_16bit_gray  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc     "-i ${images_folder}/mm.pgm  -o simple_enc_rev53_64x64_16bit_gray.j2c -reversible true"                                     "-i simple_enc_rev53_64x64_16bit_gray.j2c  -o test1.pgm -precise -quiet"    "-i simple_enc_rev53_64x64_16bit_gray.j2c     -o test2.pgm"    "${images_folder}/mm.pgm" "test1.pgm" "test2.pgm")
+
+add_test(NAME simple_enc_rev53_64x64  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc               "-i ${images_folder}/mm.ppm -o simple_enc_rev53_64x64.j2c  -reversible true"                                                 "-i simple_enc_rev53_64x64.j2c  -o test1.ppm -precise -quiet"             "-i simple_enc_rev53_64x64.j2c  -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_32x32  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc               "-i ${images_folder}/mm.ppm -o simple_enc_rev53_32x32.j2c  -reversible true -block_size \{32,32\}"                           "-i simple_enc_rev53_32x32.j2c  -o test1.ppm -precise -quiet"             "-i simple_enc_rev53_32x32.j2c  -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_4x4    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc               "-i ${images_folder}/mm.ppm -o simple_enc_rev53_4x4.j2c    -reversible true -block_size \{4,4\}"                             "-i simple_enc_rev53_4x4.j2c    -o test1.ppm -precise -quiet"             "-i simple_enc_rev53_4x4.j2c    -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_1024x4 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc               "-i ${images_folder}/mm.ppm -o simple_enc_rev53_1024x4.j2c -reversible true -block_size \{4,1024\}"                          "-i simple_enc_rev53_1024x4.j2c -o test1.ppm -precise -quiet"             "-i simple_enc_rev53_1024x4.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_4x1024 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc               "-i ${images_folder}/mm.ppm -o simple_enc_rev53_4x1024.j2c -reversible true -block_size \{1024,4\}"                          "-i simple_enc_rev53_4x1024.j2c -o test1.ppm -precise -quiet"             "-i simple_enc_rev53_4x1024.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_64x64_tiles_33x33_d5 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_64x64_tiles_33x33_d5.j2c  -reversible true -tile_size \{33,33\} -num_decomps 5" "-i simple_enc_rev53_64x64_tiles_33x33_d5.j2c  -o test1.ppm -precise -quiet" "-i simple_enc_rev53_64x64_tiles_33x33_d5.j2c  -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_64x64_tiles_33x33_d6 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_64x64_tiles_33x33_d6.j2c  -reversible true -tile_size \{33,33\} -num_decomps 6" "-i simple_enc_rev53_64x64_tiles_33x33_d6.j2c  -o test1.ppm -precise -quiet" "-i simple_enc_rev53_64x64_tiles_33x33_d6.j2c  -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
 
 add_test(NAME simple_enc_irv97_64x64_yuv COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom_yuv.sh -enc  "-i ${images_folder}/foreman_420.yuv -o simple_enc_irv97_64x64_yuv.j2c -qstep 0.1 -dims \{352,288\} -num_comps 3 -downsamp \{1,1\},\{2,2\},\{2,2\} -bit_depth 8,8,8 -signed false,false,false"                   "-i simple_enc_irv97_64x64_yuv.j2c -o test1y.rawl,test1u.rawl,test1v.rawl -precise -quiet" "-i simple_enc_irv97_64x64_yuv.j2c -o test2.yuv" "${images_folder}/foreman_420.yuv:352x288x8x420" "test1.yuv:352x288x8x420" "test2.yuv:352x288x8x420")
 add_test(NAME simple_enc_rev53_64x64_yuv COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom_yuv.sh -renc "-i ${images_folder}/foreman_420.yuv -o simple_enc_rev53_64x64_yuv.j2c -reversible true -qstep 0.1 -dims \{352,288\} -num_comps 3 -downsamp \{1,1\},\{2,2\},\{2,2\} -bit_depth 8,8,8 -signed false,false,false"  "-i simple_enc_rev53_64x64_yuv.j2c -o test1y.rawl,test1u.rawl,test1v.rawl -precise -quiet" "-i simple_enc_rev53_64x64_yuv.j2c -o test2.yuv" "${images_folder}/foreman_420.yuv:352x288x8x420" "test1.yuv:352x288x8x420" "test2.yuv:352x288x8x420")