From 4225a524b9d84a0ad9c6f96fa261da65aae144ca Mon Sep 17 00:00:00 2001
From: Robert <robert.trololo@gmail.com>
Date: Fri, 23 Apr 2021 15:03:23 +0200
Subject: [PATCH 1/3] restructured everything

---
 .gitmodules  | 3 ---
 3rdparty/SDL | 1 -
 2 files changed, 4 deletions(-)
 delete mode 160000 3rdparty/SDL

diff --git a/.gitmodules b/.gitmodules
index 1803456..e69de29 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "3rdparty/SDL"]
-	path = 3rdparty/SDL
-	url = https://github.com/spurious/SDL-mirror
diff --git a/3rdparty/SDL b/3rdparty/SDL
deleted file mode 160000
index f1e51f7..0000000
--- a/3rdparty/SDL
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit f1e51f71a9ddbc3d4414b35a694f24c0aad8cb9e

From c66cae17f2095bfe0a7bcb4bd3e59f8c55ea5750 Mon Sep 17 00:00:00 2001
From: Robert <robert.trololo@gmail.com>
Date: Fri, 23 Apr 2021 15:08:51 +0200
Subject: [PATCH 2/3] restructured everything

---
 .gitmodules                                   |    0
 CMakeLists.txt                                |   37 +-
 SDLU/CMakeLists.txt                           |   23 -
 SDLU/SDLU.hpp                                 |    7 -
 SDLU/alibi.cpp                                |    1 -
 SDLU/exceptions/Exceptions.hpp                |   12 -
 SDLU/exceptions/ObjectCreationException.hpp   |   31 -
 SDLU/graphics/CMakeLists.txt                  |    9 -
 SDLU/graphics/RenderTarget.cpp                |   86 -
 SDLU/graphics/RenderTarget.hpp                |   81 -
 SDLU/graphics/RenderWindow.cpp                |   40 -
 SDLU/graphics/RenderWindow.hpp                |   72 -
 SDLU/graphics/drawable/CMakeLists.txt         |    7 -
 SDLU/graphics/drawable/Drawable.hpp           |   29 -
 SDLU/graphics/drawable/Transformable.cpp      |   95 -
 SDLU/graphics/drawable/Transformable.hpp      |  158 -
 SDLU/graphics/drawable/shapes/CMakeLists.txt  |    6 -
 SDLU/graphics/drawable/shapes/Shape.hpp       |   46 -
 SDLU/structures/CMakeLists.txt                |    9 -
 SDLU/structures/Color.cpp                     |  163 -
 SDLU/structures/Mouse.cpp                     |   38 -
 SDLU/structures/Mouse.hpp                     |   77 -
 SDLU/structures/Vector2.hpp                   |  152 -
 SDLU/structures/Window.cpp                    |  253 -
 SDLU/structures/Window.hpp                    |  269 -
 SDLU_Example/CMakeLists.txt                   |   21 -
 examples/CMakeLists.txt                       |   15 +
 {SDLU_Example => examples}/header.hpp         |    5 +-
 {SDLU_Example => examples}/main.cpp           |   11 +-
 include/SDLU.hpp                              |   10 +
 {SDLU => include}/Util.hpp                    |    8 +-
 {SDLU => include}/graphics/Graphics.hpp       |    0
 include/graphics/RenderTarget.hpp             |   79 +
 include/graphics/RenderWindow.hpp             |   69 +
 include/graphics/drawable/Drawable.hpp        |   30 +
 include/graphics/drawable/Transformable.hpp   |  157 +
 .../graphics/drawable/shapes/Rectangle.hpp    |    2 -
 include/graphics/drawable/shapes/Shape.hpp    |   45 +
 {SDLU => include}/structures/Color.hpp        |    7 +-
 include/structures/Mouse.hpp                  |   75 +
 include/structures/Vector2.hpp                |  154 +
 include/structures/Window.hpp                 |  279 +
 lib/sdl2_gfx/CMakeLists.txt                   |   20 +
 lib/sdl2_gfx/include/SDL2_framerate.h         |  100 +
 lib/sdl2_gfx/include/SDL2_gfxPrimitives.h     |  241 +
 .../include/SDL2_gfxPrimitives_font.h         | 3106 +++++++
 lib/sdl2_gfx/include/SDL2_imageFilter.h       |  166 +
 lib/sdl2_gfx/include/SDL2_rotozoom.h          |  123 +
 lib/sdl2_gfx/src/SDL2_framerate.c             |  189 +
 lib/sdl2_gfx/src/SDL2_gfxPrimitives.c         | 3790 +++++++++
 lib/sdl2_gfx/src/SDL2_imageFilter.c           | 7371 +++++++++++++++++
 lib/sdl2_gfx/src/SDL2_rotozoom.c              | 1663 ++++
 src/SDLU.cpp                                  |   18 +
 src/graphics/RenderTarget.cpp                 |   79 +
 src/graphics/RenderWindow.cpp                 |   37 +
 src/graphics/drawable/Transformable.cpp       |   94 +
 .../graphics/drawable/shapes/Rectangle.cpp    |    5 +-
 .../graphics/drawable/shapes/Shape.cpp        |    2 +-
 src/structures/Color.cpp                      |  162 +
 src/structures/Mouse.cpp                      |   37 +
 src/structures/Window.cpp                     |  245 +
 61 files changed, 18406 insertions(+), 1710 deletions(-)
 delete mode 100644 .gitmodules
 delete mode 100644 SDLU/CMakeLists.txt
 delete mode 100644 SDLU/SDLU.hpp
 delete mode 100644 SDLU/alibi.cpp
 delete mode 100644 SDLU/exceptions/Exceptions.hpp
 delete mode 100644 SDLU/exceptions/ObjectCreationException.hpp
 delete mode 100644 SDLU/graphics/CMakeLists.txt
 delete mode 100644 SDLU/graphics/RenderTarget.cpp
 delete mode 100644 SDLU/graphics/RenderTarget.hpp
 delete mode 100644 SDLU/graphics/RenderWindow.cpp
 delete mode 100644 SDLU/graphics/RenderWindow.hpp
 delete mode 100644 SDLU/graphics/drawable/CMakeLists.txt
 delete mode 100644 SDLU/graphics/drawable/Drawable.hpp
 delete mode 100644 SDLU/graphics/drawable/Transformable.cpp
 delete mode 100644 SDLU/graphics/drawable/Transformable.hpp
 delete mode 100644 SDLU/graphics/drawable/shapes/CMakeLists.txt
 delete mode 100644 SDLU/graphics/drawable/shapes/Shape.hpp
 delete mode 100644 SDLU/structures/CMakeLists.txt
 delete mode 100644 SDLU/structures/Color.cpp
 delete mode 100644 SDLU/structures/Mouse.cpp
 delete mode 100644 SDLU/structures/Mouse.hpp
 delete mode 100644 SDLU/structures/Vector2.hpp
 delete mode 100644 SDLU/structures/Window.cpp
 delete mode 100644 SDLU/structures/Window.hpp
 delete mode 100644 SDLU_Example/CMakeLists.txt
 create mode 100644 examples/CMakeLists.txt
 rename {SDLU_Example => examples}/header.hpp (91%)
 rename {SDLU_Example => examples}/main.cpp (93%)
 create mode 100644 include/SDLU.hpp
 rename {SDLU => include}/Util.hpp (69%)
 rename {SDLU => include}/graphics/Graphics.hpp (100%)
 create mode 100644 include/graphics/RenderTarget.hpp
 create mode 100644 include/graphics/RenderWindow.hpp
 create mode 100644 include/graphics/drawable/Drawable.hpp
 create mode 100644 include/graphics/drawable/Transformable.hpp
 rename {SDLU => include}/graphics/drawable/shapes/Rectangle.hpp (98%)
 create mode 100644 include/graphics/drawable/shapes/Shape.hpp
 rename {SDLU => include}/structures/Color.hpp (99%)
 create mode 100644 include/structures/Mouse.hpp
 create mode 100644 include/structures/Vector2.hpp
 create mode 100644 include/structures/Window.hpp
 create mode 100644 lib/sdl2_gfx/CMakeLists.txt
 create mode 100644 lib/sdl2_gfx/include/SDL2_framerate.h
 create mode 100644 lib/sdl2_gfx/include/SDL2_gfxPrimitives.h
 create mode 100644 lib/sdl2_gfx/include/SDL2_gfxPrimitives_font.h
 create mode 100644 lib/sdl2_gfx/include/SDL2_imageFilter.h
 create mode 100644 lib/sdl2_gfx/include/SDL2_rotozoom.h
 create mode 100644 lib/sdl2_gfx/src/SDL2_framerate.c
 create mode 100644 lib/sdl2_gfx/src/SDL2_gfxPrimitives.c
 create mode 100644 lib/sdl2_gfx/src/SDL2_imageFilter.c
 create mode 100644 lib/sdl2_gfx/src/SDL2_rotozoom.c
 create mode 100644 src/SDLU.cpp
 create mode 100644 src/graphics/RenderTarget.cpp
 create mode 100644 src/graphics/RenderWindow.cpp
 create mode 100644 src/graphics/drawable/Transformable.cpp
 rename {SDLU => src}/graphics/drawable/shapes/Rectangle.cpp (92%)
 rename {SDLU => src}/graphics/drawable/shapes/Shape.cpp (85%)
 create mode 100644 src/structures/Color.cpp
 create mode 100644 src/structures/Mouse.cpp
 create mode 100644 src/structures/Window.cpp

diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index e69de29..0000000
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e8028f..183cd4c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,12 +2,35 @@ cmake_minimum_required(VERSION 3.8)
 
 project(sdlu)
 
-if(NOT ENABLE_EXAMPLE)
-    set(ENABLE_EXAMPLE CACHE BOOL OFF)
-endif()
+find_package(SDL2 CONFIG REQUIRED)
 
-add_subdirectory(3rdparty/SDL)
-add_subdirectory(SDLU)
-if(ENABLE_EXAMPLE)
-   add_subdirectory(SDLU_Example)
+option(BUILD_EXAMPLES "Builds the example projects" ON)
+
+add_subdirectory(lib/sdl2_gfx)
+
+file(GLOB_RECURSE sdlu_includes
+    "include/*.hpp"
+)
+
+file(GLOB_RECURSE sdlu_sources
+    "src/*.cpp"
+)
+
+add_library(sdlu
+    ${sdlu_includes} ${sdlu_sources}
+)
+
+# SDL2::SDL2 and sdl2_gfx are targets, not directories; the include paths
+# they export are inherited through target_link_libraries() below.
+target_include_directories(sdlu PUBLIC
+    "include"
+)
+
+target_link_libraries(sdlu PUBLIC
+    sdl2_gfx
+    SDL2::SDL2 SDL2::SDL2main
+)
+
+if(BUILD_EXAMPLES)
+    add_subdirectory(examples)
 endif()
\ No newline at end of file
diff --git a/SDLU/CMakeLists.txt b/SDLU/CMakeLists.txt
deleted file mode 100644
index 6b2499d..0000000
--- a/SDLU/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-set(PNAME SDLU)
-
-add_library(${PNAME}
-	alibi.cpp SDLU.hpp Util.hpp
- "structures/Color.cpp" "structures/Mouse.cpp" "structures/Window.cpp" "graphics/RenderTarget.cpp" "graphics/drawable/Transformable.cpp" "graphics/drawable/shapes/Rectangle.cpp")
-
-set_property(TARGET ${PNAME} PROPERTY CXX_STANDARD 17)
-
-target_include_directories(${PNAME} PRIVATE
-	${PROJECT_SOURCE_DIR}/3rdparty/SDL/include
-	${CMAKE_CURRENT_LIST_DIR}
-)
-
-target_link_libraries(${PNAME}
-	SDL2
-)
-
-if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-	target_link_libraries(${PNAME} m)
-endif()
-
-add_subdirectory(structures)
-add_subdirectory(graphics)
\ No newline at end of file
diff --git a/SDLU/SDLU.hpp b/SDLU/SDLU.hpp
deleted file mode 100644
index 0a5c61f..0000000
--- a/SDLU/SDLU.hpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#pragma once
-
-#include "graphics/Graphics.hpp"
-
-#include "structures/Mouse.hpp"
-
-#include "exceptions/Exceptions.hpp"
\ No newline at end of file
diff --git a/SDLU/alibi.cpp b/SDLU/alibi.cpp
deleted file mode 100644
index 7a35a5a..0000000
--- a/SDLU/alibi.cpp
+++ /dev/null
@@ -1 +0,0 @@
-// CMake needs a .cpp file to build
\ No newline at end of file
diff --git a/SDLU/exceptions/Exceptions.hpp b/SDLU/exceptions/Exceptions.hpp
deleted file mode 100644
index 1e2fdf5..0000000
--- a/SDLU/exceptions/Exceptions.hpp
+++ /dev/null
@@ -1,12 +0,0 @@
-/**
- * @file Exceptions.hpp
- * @brief Provides utility and includes all exceptions
- * @author Lauchmelder23
- * @date 16.05.2020
-*/
-#pragma once
-
-#include "ObjectCreationException.hpp"
-
-#define THROW_IF( condition, exception ) ( condition ? throw exception : false)
-#define THROW_IF_NOT( condition, exception ) ( THROW_IF(!condition, exception) )
\ No newline at end of file
diff --git a/SDLU/exceptions/ObjectCreationException.hpp b/SDLU/exceptions/ObjectCreationException.hpp
deleted file mode 100644
index 0aae211..0000000
--- a/SDLU/exceptions/ObjectCreationException.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * @file ObjectCreationException.hpp
- * @brief An exception object to handle failed object creations
- * @author Lauchmelder23
- * @date 16.05.2020
- */
-#pragma once
-#include <exception>
-#include <string>
-
-namespace sdlu
-{
-    class ObjectCreationException : 
-        virtual public std::exception
-    {
-    public:
-        ObjectCreationException(std::string description) :
-            m_pDescription(description)
-        {
-            // Empty
-        }
-
-        virtual const char* what() const throw()
-        {
-            return m_pDescription.c_str();
-        }
-
-    private:
-        std::string m_pDescription;
-    };
-}
\ No newline at end of file
diff --git a/SDLU/graphics/CMakeLists.txt b/SDLU/graphics/CMakeLists.txt
deleted file mode 100644
index f67eed2..0000000
--- a/SDLU/graphics/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-target_sources(${PNAME} PRIVATE
-	${CMAKE_CURRENT_SOURCE_DIR}/Graphics.hpp
-	${CMAKE_CURRENT_SOURCE_DIR}/RenderWindow.hpp
-	${CMAKE_CURRENT_SOURCE_DIR}/RenderWindow.cpp
-	${CMAKE_CURRENT_SOURCE_DIR}/RenderTarget.hpp
-	${CMAKE_CURRENT_SOURCE_DIR}/RenderTarget.cpp
-)
-
-add_subdirectory(drawable)
\ No newline at end of file
diff --git a/SDLU/graphics/RenderTarget.cpp b/SDLU/graphics/RenderTarget.cpp
deleted file mode 100644
index fe475c6..0000000
--- a/SDLU/graphics/RenderTarget.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-#include "RenderTarget.hpp"
-
-#include <Util.hpp>
-#include <exceptions/Exceptions.hpp>
-
-namespace sdlu
-{
-    Uint32 RenderTarget::m_uRenderers = -1;
-
-    RenderTarget::~RenderTarget()
-    {
-        RETURN_IF_NULLPTR(renderer);
-
-        SDL_DestroyRenderer(renderer);
-    }
-
-    void RenderTarget::Clear(const Color& color)
-    {
-        RETURN_IF_NULLPTR(renderer);
-
-        SDL_SetRenderDrawColor(renderer, color.r, color.g, color.b, color.a);
-        SDL_RenderClear(renderer);
-    }
-
-    void RenderTarget::Draw(const Drawable& drawable)
-    {
-        RETURN_IF_NULLPTR(renderer);
-
-        drawable.Draw(renderer);
-    }
-
-    void RenderTarget::Display()
-    {
-        RETURN_IF_NULLPTR(renderer);
-
-        SDL_RenderPresent(renderer);
-
-        if (m_oFramerate != 0)
-        {
-            Uint64 diff = std::chrono::duration_cast<std::chrono::milliseconds>(
-                std::chrono::steady_clock::now() - m_oTimeSinceLastDisplay).count();
-
-            if (diff < 1000 / m_oFramerate)
-            {
-                SDL_Delay(static_cast<Uint32>(1000 / m_oFramerate - diff));
-            }
-        }
-
-        m_oTimeSinceLastDisplay = std::chrono::steady_clock::now();
-    }
-
-    void RenderTarget::SetMaxFramerate(Uint32 max)
-    {
-        m_oFramerate = max;
-    }
-
-    RenderTarget::RenderTarget(SDL_Window* target) :
-        renderer(nullptr), m_oFramerate(0)
-    {
-        RETURN_IF_NOT_NULLPTR(renderer);
-        renderer = SDL_CreateRenderer(target, m_uRenderers, SDL_RENDERER_ACCELERATED);
-        
-        THROW_IF(IS_NULLPTR(renderer),
-            sdlu::ObjectCreationException("Failed to create SDL_Renderer* from SDL_Window*: "
-                + std::string(SDL_GetError())));
-        m_uRenderers++;
-
-        m_oTimeSinceLastDisplay = std::chrono::steady_clock::now();
-    }
-
-    RenderTarget::RenderTarget(SDL_Surface* target) : 
-        renderer(nullptr), m_oFramerate(0)
-    {
-        m_oFramerate = 0;
-
-        RETURN_IF_NOT_NULLPTR(renderer);
-        renderer = SDL_CreateSoftwareRenderer(target);
-
-        THROW_IF(IS_NULLPTR(renderer),
-            sdlu::ObjectCreationException("Failed to create SDL_Renderer* from SDL_Surface*: "
-                + std::string(SDL_GetError())));
-        m_uRenderers++;
-
-        m_oTimeSinceLastDisplay = std::chrono::steady_clock::now();
-    }
-}
\ No newline at end of file
diff --git a/SDLU/graphics/RenderTarget.hpp b/SDLU/graphics/RenderTarget.hpp
deleted file mode 100644
index d4531f4..0000000
--- a/SDLU/graphics/RenderTarget.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * @file RenderTarget
- * @brief Contains rendering related objects
- * @author Lauchmelder23
- * @date 20.05.2020
- */
-#pragma once
-#include <chrono>
-
-#include <SDL.h>
-
-#include <structures/Color.hpp>
-#include <graphics/drawable/Drawable.hpp>
-
-namespace sdlu
-{
-    /**
-     * @brief Acts as a wrapper for SDL_Renderer*. You can't (and shouldn't)
-     *        instantiate this, but rather derive from it.
-     */ 
-    class RenderTarget
-    {
-    public:
-        virtual ~RenderTarget();
-
-        /**
-         * @brief Clears the display
-         *
-         * @param[in] color The color to clear the display with
-         */
-        void Clear(const Color& color = Color::Black);
-
-        /**
-         * @brief Draws a sdlu::Drawable to the SDL_Renderer
-         *
-         * @param[in] drawable A reference to a derived class of Drawable
-         */
-        void Draw(const Drawable& drawable);
-
-        /**
-         * @brief Display the current state of the renderer to the screen
-         */
-        void Display();
-
-        /**
-         * @brief Sets a maximum framerate on the display function
-         *
-         * If the maximum framerate is not 0, SDL_Delay() will be called
-         * after each Display() to ensure that the time between displays
-         * is not shorter than the framerate limit.
-         *
-         * @param[in] max The new maximum framerate
-         */
-        void SetMaxFramerate(Uint32 max);
-
-    protected:
-        /**
-         * @brief Create Renderer and bind it to a window
-         * 
-         * @param[in] target The SDL_Window to bind to
-         */
-        RenderTarget(SDL_Window* target);
-
-        /**
-         * @brief Create Renderer and bind it to a texture
-         *
-         * @param[in] target The SDL_Surface to bind to
-         */
-        RenderTarget(SDL_Surface* target);
-
-    protected:
-        SDL_Renderer* renderer; ///< The renderer object
-
-    private:
-        Uint32 m_oFramerate; ///< The current maximum framerate of the window (0 = unlimited)
-
-        std::chrono::steady_clock::time_point m_oTimeSinceLastDisplay; ///< The timepoint at which Display() was last called
-
-        static Uint32 m_uRenderers; ///< The number of renderers instantiated so far
-    };
-}
\ No newline at end of file
diff --git a/SDLU/graphics/RenderWindow.cpp b/SDLU/graphics/RenderWindow.cpp
deleted file mode 100644
index 4bdcbe7..0000000
--- a/SDLU/graphics/RenderWindow.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-#include "RenderWindow.hpp"
-
-#include <cstring>
-
-#include <exceptions/Exceptions.hpp>
-#include <Util.hpp>
-
-namespace sdlu
-{
-    RenderWindow::RenderWindow() :
-        Window(), RenderTarget(window)
-    {
-        // Empty
-    }
-
-    RenderWindow::RenderWindow(Vector2u dimension, const std::string& title,
-        Uint32 windowFlags) :
-        Window(dimension, title, windowFlags), RenderTarget(window)
-    {
-        // Empty
-    }
-
-    RenderWindow::~RenderWindow()
-    {
-        // Empty
-    }
-
-    void RenderWindow::OnCreate()
-    {
-    }
-
-    bool RenderWindow::OnResize()
-    {
-        return false;
-    }
-
-    void RenderWindow::OnClose()
-    {
-    }
-}
\ No newline at end of file
diff --git a/SDLU/graphics/RenderWindow.hpp b/SDLU/graphics/RenderWindow.hpp
deleted file mode 100644
index 2de90f5..0000000
--- a/SDLU/graphics/RenderWindow.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/**
- * @file RenderWindow.hpp
- * @brief A wrapper around SDL_Window and SDL_Renderer
- * @author Lauchmelder23
- * @date 16.05.2020
- */
-#pragma once
-#include <string>
-#include <chrono>
-
-#include <SDL.h>
-
-#include <structures/Vector2.hpp>
-#include <structures/Color.hpp>
-#include <structures/Window.hpp>
-#include <graphics/RenderTarget.hpp>
-
-namespace sdlu
-{
-    // TODO: Probably break up into sdlu::Window and sdlu::Renderer
-    // to avoid passing around the Renderer when only the Window is
-    // needed. (See Mouse::GetPosition for example)
-
-    /**
-     * @brief A class that handles window related functionality
-     *
-     * A class that combines the SDL_Window and SDL_Renderer and
-     * behaves similar to the sf::RenderWindow from SFML. It provides
-     * utility and wrappers for common operations on those objects.
-     */
-    class RenderWindow : public Window, public RenderTarget
-    {
-    public:
-        /**
-         * @brief Default Constructor. No window or renderer is created.
-         */
-        RenderWindow();
-
-        /**
-         * @brief Creates a window and renderer with the given parameters
-         *
-         * @param[in] dimension A vector containing the width and height
-         * @param[in] title     The title of the create window
-         */
-        RenderWindow(Vector2u dimension, const std::string& title, 
-            Uint32 windowFlags = SDL_WINDOW_SHOWN);
-
-        RenderWindow(const RenderWindow& other) = delete;
-        RenderWindow(const RenderWindow&& other) = delete;
-
-        virtual ~RenderWindow();
-
-    protected:
-        /**
-         * @brief Function called after Window creation
-         */
-        virtual void OnCreate();
-
-        /**
-         * @brief Function called after resize event
-         *
-         * @return True if the resize event should not be returned via
-         *         PollEvent()
-         */
-        virtual bool OnResize();
-
-        /**
-         * @brief Function called after closing the window
-         */
-        virtual void OnClose();
-    };
-}
\ No newline at end of file
diff --git a/SDLU/graphics/drawable/CMakeLists.txt b/SDLU/graphics/drawable/CMakeLists.txt
deleted file mode 100644
index de69ad1..0000000
--- a/SDLU/graphics/drawable/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-target_sources(${PNAME} PRIVATE
-	${CMAKE_CURRENT_SOURCE_DIR}/Drawable.hpp
-	${CMAKE_CURRENT_SOURCE_DIR}/Transformable.hpp
-	${CMAKE_CURRENT_SOURCE_DIR}/Transformable.cpp
-)
-
-add_subdirectory( shapes )
\ No newline at end of file
diff --git a/SDLU/graphics/drawable/Drawable.hpp b/SDLU/graphics/drawable/Drawable.hpp
deleted file mode 100644
index 1cbe97e..0000000
--- a/SDLU/graphics/drawable/Drawable.hpp
+++ /dev/null
@@ -1,29 +0,0 @@
-/**
- * @file Drawable.hpp
- * @brief The base class of everything renderable by RenderTarget
- * @author Lauchmelder23
- * @date 20.05.2020
- */
-#pragma once
-
-#include <SDL.h>
-
-namespace sdlu
-{
-    /** 
-     * @brief Everything that can be rendered derives from this class.
-     */
-    class Drawable
-    {
-    public:
-        Drawable(const Drawable& other) = delete;
-        Drawable(Drawable&& other) = delete;
-        Drawable& operator=(const Drawable& other) = delete;
-
-        friend class RenderTarget;
-
-    protected:
-        Drawable() { }
-        virtual void Draw(SDL_Renderer* const target) const = 0;
-    };
-}
\ No newline at end of file
diff --git a/SDLU/graphics/drawable/Transformable.cpp b/SDLU/graphics/drawable/Transformable.cpp
deleted file mode 100644
index 965c61a..0000000
--- a/SDLU/graphics/drawable/Transformable.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-#include "Transformable.hpp"
-
-namespace sdlu
-{
-    Transformable::Transformable() :
-        position(0, 0), origin(0, 0),
-        scale(1.f, 1.f), rotation(0.f)
-    {
-        // Empty
-    }
-
-    Transformable::~Transformable()
-    {
-        // Empty
-    }
-
-    Vector2f Transformable::GetPosition()
-    {
-        return position;
-    }
-
-    void Transformable::SetPosition(const Vector2f& position)
-    {
-        this->position = position;
-    }
-
-    void Transformable::SetPosition(float x, float y)
-    {
-        position = Vector2f(x, y);
-    }
-
-    void Transformable::Move(const Vector2f& position)
-    {
-        this->position += position;
-    }
-
-    void Transformable::Move(float x, float y)
-    {
-        position += Vector2f(x, y);
-    }
-    Vector2f Transformable::GetOrigin()
-    {
-        return origin;
-    }
-
-    void Transformable::SetOrigin(const Vector2f& origin)
-    {
-        this->origin = origin;
-    }
-
-    void Transformable::SetOrigin(float x, float y)
-    {
-        origin = Vector2f(x, y);
-    }
-
-    Vector2f Transformable::GetScale()
-    {
-        return scale;
-    }
-
-    void Transformable::SetScale(const Vector2f& scale)
-    {
-        this->scale = scale;
-    }
-
-    void Transformable::SetScale(float x, float y)
-    {
-        scale = Vector2f(x, y);
-    }
-
-    void Transformable::Scale(const Vector2f& scale)
-    {
-        this->scale += scale;
-    }
-
-    void Transformable::Scale(float x, float y)
-    {
-        scale += Vector2f(x, y);
-    }
-
-    float Transformable::GetRotation()
-    {
-        return rotation;
-    }
-
-    void Transformable::SetRotation(float angle)
-    {
-        rotation = angle;
-    }
-
-    void Transformable::Rotate(float angle)
-    {
-        rotation += angle;
-    }
-}
\ No newline at end of file
diff --git a/SDLU/graphics/drawable/Transformable.hpp b/SDLU/graphics/drawable/Transformable.hpp
deleted file mode 100644
index cd9cd4b..0000000
--- a/SDLU/graphics/drawable/Transformable.hpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/**
- * @file Transformable.hpp
- * @brief Contains information for transformable objects
- * @author Lauchmelder23
- * @date 23.05.2020
- */
-#pragma once
-
-#include <structures/Vector2.hpp>
-
-namespace sdlu
-{
-    /**
-     * @brief A class that stores locational information
-     *
-     * Stores position, rotation, scale and the origin of an
-     * object, and provides functions to get/set those values.
-     * "Origin" is the offset between the position and the top-left
-     * corner of the object.
-     */
-    class Transformable
-    {
-    public:
-        /**
-         * @brief Default constructur
-         */
-        Transformable();
-
-        /**
-         * @brief Deconstructor
-         */
-        virtual ~Transformable();
-
-        /**
-         * @brief Returns the position of the object
-         *
-         * @return A 2D vector of the position
-         */
-        Vector2f GetPosition();
-
-        /**
-         * @brief Sets a new position
-         * 
-         * @param[in] position A 2D vector with the new position
-         */
-        void SetPosition(const Vector2f& position);
-
-        /**
-         * @brief Sets a new position
-         * 
-         * @param[in] x The new x position
-         * @param[in] y The new y position
-         */
-        void SetPosition(float x, float y);
-
-        /**
-         * @brief Adds to the current position
-         *
-         * @param[in] position A 2D movement vector
-         */
-        void Move(const Vector2f& position);
-
-        /**
-         * @brief Adds to the current position
-         *
-         * @param[in] x The offset in x direction
-         * @param[in] y The offset in y direction
-         */
-        void Move(float x, float y);
-
-
-        /**
-         * @brief Gets the current local origin
-         *
-         * @return A 2D vector with the offset
-         */
-        Vector2f GetOrigin();
-
-        /**
-         * @brief Sets a new local origin
-         *
-         * @param[in] origin A 2D vector with the new origin
-         */
-        void SetOrigin(const Vector2f& origin);
-
-        /**
-         * @brief Sets a new local origin
-         *
-         * @param[in] x The new x component of the origin
-         * @param[in] y The new y component of the origin
-         */
-        void SetOrigin(float x, float y);
-
-        /**
-         * @brief Gets the current scale of the object
-         * 
-         * @return A 2D vector with the scale in x- and y-direction
-         */
-        Vector2f GetScale();
-
-        /**
-         * @brief Sets a new scale
-         *
-         * @param[in] scale A 2D vector with the new scale
-         */
-        void SetScale(const Vector2f& scale);
-
-        /**
-         * @brief Sets a new scale
-         * 
-         * @param[in] x The new scale in x direction
-         * @param[in] y The new scale in y direction
-         */
-        void SetScale(float x, float y);
-
-        /**
-         * @brief Scales the object by some amount
-         *
-         * @param[in] scale The amount to scale by in x- and y-direction
-         */
-        void Scale(const Vector2f& scale);
-
-        /**
-         * @brief Scales the object by some amount
-         * 
-         * @param[in] x The amount to scale by in x direction
-         * @param[in] y The amount to scale by in y direction
-         */
-        void Scale(float x, float y);
-
-        /**
-         * @brief Gets the current rotation
-         *
-         * @return The rotation in degrees
-         */
-        float GetRotation();
-
-        /**
-         * @brief Sets a new rotation
-         *
-         * @param[in] angle The new rotation in degrees
-         */
-        void SetRotation(float angle);
-
-        /**
-         * @brief Rotates by some amount
-         *
-         * @param[in] angle The angle to rotate by in degrees
-         */
-        void Rotate(float angle);
-
-    protected:
-        Vector2f position; ///< Position of the object
-        Vector2f origin;   ///< Offset of the top-left corner from the position
-        Vector2f scale;    ///< Scale of the object
-        float rotation;    ///< Rotation of the object (in degrees)
-    };
-}
\ No newline at end of file
diff --git a/SDLU/graphics/drawable/shapes/CMakeLists.txt b/SDLU/graphics/drawable/shapes/CMakeLists.txt
deleted file mode 100644
index bb0acb3..0000000
--- a/SDLU/graphics/drawable/shapes/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-target_sources(${PNAME} PRIVATE
-	${CMAKE_CURRENT_SOURCE_DIR}/Shape.hpp
-	${CMAKE_CURRENT_SOURCE_DIR}/Shape.cpp
-	${CMAKE_CURRENT_SOURCE_DIR}/Rectangle.hpp
-	${CMAKE_CURRENT_SOURCE_DIR}/Rectangle.cpp
-)
\ No newline at end of file
diff --git a/SDLU/graphics/drawable/shapes/Shape.hpp b/SDLU/graphics/drawable/shapes/Shape.hpp
deleted file mode 100644
index f8e03e6..0000000
--- a/SDLU/graphics/drawable/shapes/Shape.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * @file Shape.hpp
- * @brief The base class for all native SDLU shapes
- * @author Lauchmelder23
- * @date 23.05.2020
- */
-#pragma once
-
-#include <graphics/drawable/Drawable.hpp>
-#include <graphics/drawable/Transformable.hpp>
-#include <structures/Color.hpp>
-
-namespace sdlu
-{
-    /**
-     * @brief The non-instantiable base class for all SDLU shapes
-     */
-    class Shape :
-        public Drawable, public Transformable
-    {
-    public:
-        /**
-         * @brief Deconstructor
-         */
-        virtual ~Shape();
-
-        /**
-         * @brief Sets the color of the shape
-         */
-        void SetColor(const Color& color);
-
-        /**
-         * @brief Gets the color of the shape
-         */
-        Color GetColor();
-
-    protected:
-        /**
-         * @brief Default constructor
-         */
-        Shape();
-
-    protected:
-        Color color;
-    };
-}
\ No newline at end of file
diff --git a/SDLU/structures/CMakeLists.txt b/SDLU/structures/CMakeLists.txt
deleted file mode 100644
index 3a7198c..0000000
--- a/SDLU/structures/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-target_sources(${PNAME} PRIVATE
-	${CMAKE_CURRENT_SOURCE_DIR}/Vector2.hpp
-	${CMAKE_CURRENT_SOURCE_DIR}/Color.hpp
-	${CMAKE_CURRENT_SOURCE_DIR}/Color.cpp
-	${CMAKE_CURRENT_SOURCE_DIR}/Mouse.hpp
-	${CMAKE_CURRENT_SOURCE_DIR}/Mouse.cpp
-	${CMAKE_CURRENT_SOURCE_DIR}/Window.hpp
-	${CMAKE_CURRENT_SOURCE_DIR}/Window.cpp
-)
\ No newline at end of file
diff --git a/SDLU/structures/Color.cpp b/SDLU/structures/Color.cpp
deleted file mode 100644
index 0861a30..0000000
--- a/SDLU/structures/Color.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-#include "Color.hpp"
-
-#include <math.h>
-#include <cmath>
-
-namespace sdlu
-{
-    const Color Color::Black = Color(0, 0, 0);
-    const Color Color::Red = Color(255, 0, 0);
-    const Color Color::Green = Color(0, 255, 0);
-    const Color Color::Blue = Color(0, 0, 255);
-    const Color Color::Yellow = Color(255, 255, 0);
-    const Color Color::Magenta = Color(255, 0, 255);
-    const Color Color::Cyan = Color(0, 255, 255);
-    const Color Color::White = Color(255, 255, 255);
-
-    const Color Color::Transparent = Color(0, 0, 0, 0);
-
-    Color::Color() :
-        r(0), g(0), b(0), a(0)
-    {
-        // Empty
-    }
-
-    Color::Color(Uint8 r, Uint8 g, Uint8 b, Uint8 a) :
-        r(r), g(g), b(b), a(a)
-    {
-        // Empty
-    }
-
-    Color::Color(Uint32 color) : 
-        r((color & 0xFF000000) >> 24),
-        g((color & 0x00FF0000) >> 16),
-        b((color & 0x0000FF00) >> 8),
-        a((color & 0x000000FF))
-    {
-        // Empty
-    }
-
-    Uint32 Color::ToInt()
-    {
-        Uint32 color = 0;
-        color |= r << 24;
-        color |= g << 16;
-        color |= b << 8;
-        color |= a;
-        return color;
-    }
-
-    Color Color::FromHSV(Uint16 h, Uint8 s, Uint8 v)
-    {
-        // Normalize parameters
-        // H : [0, 360)
-        // S : [0, 1]
-        // V : [0, 1]
-        h -= std::floor(h / 360) * 360;
-        s = (s > 1) ? 1 : s;
-        v = (v > 1) ? 1 : v;
-
-        // Convert to RGBA
-        Uint16 H = std::floor(h / 60.f);
-        float f = (h / 60.f) - H;
-
-        Uint8 p = static_cast<Uint8>((v * (1 - s)) * 255);
-        Uint8 q = static_cast<Uint8>((v * (1 - s * f)) * 255);
-        Uint8 t = static_cast<Uint8>((v * (1 - s * (1 - f))) * 255);
-        v *= 255;
-
-        Color output;
-        switch (H)
-        {
-        case 0:
-        case 6:
-            output = Color(v, t, p);
-            break;
-        case 1:
-            output = Color(q, v, p);
-            break;
-        case 2:
-            output = Color(p, v, t);
-            break;
-        case 3:
-            output = Color(p, q, v);
-            break;
-        case 4:
-            output = Color(t, p, v);
-            break;
-        case 5:
-            output = Color(v, p, q);
-            break;
-        default:
-            break;
-        }
-
-        return output;
-    }
-
-    Color operator+(const Color& left, const Color& right)
-    {
-        return Color((UINT8_MAX - left.r) < right.r ? 255 : left.r + right.r,
-            (UINT8_MAX - left.g) < right.g ? 255 : left.g + right.g,
-            (UINT8_MAX - left.b) < right.b ? 255 : left.b + right.b,
-            (UINT8_MAX - left.a) < right.a ? 255 : left.a + right.a);
-    }
-
-    Color operator-(const Color& left, const Color& right)
-    {
-        return Color(left.r < right.r ? 0 : left.r - right.r,
-            left.g < right.g ? 0 : left.g - right.g,
-            left.b < right.b ? 0 : left.b - right.b,
-            left.a < right.a ? 0 : left.a - right.a);
-    }
-
-    Color operator*(const Color& left, const Color& right)
-    {
-        return Color((UINT8_MAX / left.r) < right.r ? 255 : left.r * right.r,
-            (UINT8_MAX / left.g) < right.g ? 255 : left.g * right.g,
-            (UINT8_MAX / left.b) < right.b ? 255 : left.b * right.b,
-            (UINT8_MAX / left.a) < right.a ? 255 : left.a * right.a);
-    }
-
-    Color operator/(const Color& left, const Color& right)
-    {
-        return Color(left.r / right.r,
-            left.g / right.g,
-            left.b / right.b,
-            left.a / right.a);
-    }
-
-    Color& operator+=(Color& left, const Color& right)
-    {
-        left = left + right;
-        return left;
-    }
-
-    Color& operator-=(Color& left, const Color& right)
-    {
-        left = left - right;
-        return left;
-    }
-
-    Color& operator*=(Color& left, const Color& right)
-    {
-        left = left * right;
-        return left;
-    }
-
-    Color& operator/=(Color& left, const Color& right)
-    {
-        left = left / right;
-        return left;
-    }
-
-    bool operator==(const Color& left, const Color& right)
-    {
-        return ((left.r == right.r) && (left.g == right.g) && (left.b == right.b) && (left.a == right.a));
-    }
-
-    bool operator!=(const Color& left, const Color& right)
-    {
-        return !(left == right);
-    }
-}
\ No newline at end of file
diff --git a/SDLU/structures/Mouse.cpp b/SDLU/structures/Mouse.cpp
deleted file mode 100644
index 1443d94..0000000
--- a/SDLU/structures/Mouse.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-#include "Mouse.hpp"
-
-#include <SDL_mouse.h>
-
-namespace sdlu
-{
-    Uint32 Mouse::GetButtonState()
-    {
-        return SDL_GetMouseState(NULL, NULL);
-    }
-
-    bool Mouse::IsButtonDown(Button button)
-    {
-        return (GetButtonState() & SDL_BUTTON(button));
-    }
-
-    Vector2i Mouse::GetPosition()
-    {
-        int x = 0, y = 0;
-        SDL_GetGlobalMouseState(&x, &y);
-        return Vector2i(x, y);
-    }
-
-    Vector2i Mouse::GetPosition(const RenderWindow& relativeTo)
-    {
-        return GetPosition() - relativeTo.GetPosition();
-    }
-
-    void Mouse::SetPosition(const Vector2i& position)
-    {
-        SDL_WarpMouseGlobal(position.x, position.y);
-    }
-
-    void Mouse::SetPosition(const Vector2i& position, const RenderWindow& relativeTo)
-    {
-        SDL_WarpMouseInWindow(relativeTo.GetWindow(), position.x, position.y);
-    }
-}
\ No newline at end of file
diff --git a/SDLU/structures/Mouse.hpp b/SDLU/structures/Mouse.hpp
deleted file mode 100644
index adc2290..0000000
--- a/SDLU/structures/Mouse.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * @file Mouse.hpp
- * @brief A static class to provide easy handling of the mouse
- * @author Lauchmelder23
- * @date 19.05.2020
- */
-#pragma once
-#include <SDL_mouse.h>
-#include <structures/Vector2.hpp>
-#include <graphics/RenderWindow.hpp>
-
-namespace sdlu
-{
-    /**
-     * @brief A static class that contains/handles data about
-     *        mouse position and button states
-     */
-    class Mouse
-    {
-    public:
-        /**
-         * @brief Mouse buttons
-         */
-        enum Button {
-            Left = SDL_BUTTON_LEFT,
-            Right = SDL_BUTTON_RIGHT,
-            Middle = SDL_BUTTON_MIDDLE,
-            XButton1 = SDL_BUTTON_X1,
-            XButton2 = SDL_BUTTON_X2
-        };
-        
-        /**
-         * @brief Returns the current mouse button state
-         *
-         * @return A 32-bit mask of the current button state
-         */
-        static Uint32 GetButtonState();
-
-        /**
-         * @brief Checks if a specific button is pressed
-         *
-         * @param[in] button The button to check
-         * @return True if the button is pressed
-         */
-        static bool IsButtonDown(Button button);
-
-        /**
-         * @brief Gets the absolute position of the mouse
-         *
-         * @return Current mouse position relative to screen
-         */
-        static Vector2i GetPosition();
-
-        /**
-         * @brief Gets current relative position of the mouse
-         *
-         * @param[in] relativeTo The window the mouse position should be relative to
-         * @return The position of the mouse relative to the top left of the passed window object
-         */
-        static Vector2i GetPosition(const RenderWindow& relativeTo);
-
-        /**
-         * @brief Sets the absolute position of the mouse
-         *
-         * @param[in] position A 2D vector of the new position
-         */
-        static void SetPosition(const Vector2i& position);
-
-        /**
-         * @brief Sets current relative position of the mouse
-         *
-         * @param[in] position A 2D vector of the new position
-         * @param[in] relativeTo The window the mouse position should be relative to
-         */
-        static void SetPosition(const Vector2i& position, const RenderWindow& relativeTo);
-    };
-}
\ No newline at end of file
diff --git a/SDLU/structures/Vector2.hpp b/SDLU/structures/Vector2.hpp
deleted file mode 100644
index 8e6d792..0000000
--- a/SDLU/structures/Vector2.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/**
- * @file Vector2.hpp
- * @brief Provides a structure for simple vector calculations
- * @author Lauchmelder23
- * @date 16.05.2020
- */
-#pragma once
-#include <type_traits>
-
-namespace sdlu
-{
-    /**
-     * @brief A struct to handle basic 2D vector operations.
-     *
-     * @tparam T The (arithmetical) type of the vector components
-     */
-    template<
-        typename T,
-        typename = typename std::enable_if<std::is_arithmetic<T>::value, T>::type
-    > struct Vector2
-    {
-        T x; ///< x component
-        T y; ///< y component
-
-        //################## CONSTRUCTORS ##################//
-
-        /// Initializes a zero vector
-        Vector2() :
-            x(0), y(0)
-        { 
-            // Empty
-        }
-
-        /// Initializes a vector with default values
-        Vector2(T x, T y) :
-            x(x), y(y)
-        {
-            // Empty
-        }
-
-        /// Copies the components of a vector
-        Vector2(const Vector2<T>& other) :
-            x(other.x), y(other.y)
-        {
-            // Empty
-        }
-
-
-        //################## OPERATORS ##################//
-
-        friend Vector2<T> operator-(const Vector2<T>& right)
-        {
-            return Vector2<T>(-right.x, -right.y);
-        }
-
-        friend Vector2<T> operator+(const Vector2<T>& left, const Vector2<T>& right)
-        {
-            return Vector2<T>(left.x + right.x, left.y + right.y);
-        }
-
-        friend Vector2<T> operator-(const Vector2<T>& left, const Vector2<T>& right)
-        {
-            return left + (-right);
-        }
-
-        friend Vector2<T> operator*(const Vector2<T>& left, const Vector2<T>& right)
-        {
-            return Vector2<T>(left.x * right.x, left.y * right.y);
-        }
-
-        friend Vector2<T> operator/(const Vector2<T>& left, const Vector2<T>& right)
-        {
-            return Vector2<T>(left.x / right.x, left.y / right.y);
-        }
-
-        friend Vector2<T> operator*(T left, const Vector2<T>& right)
-        {
-            return Vector2<T>(left * right.x, left * right.y);
-        }
-
-        friend Vector2<T> operator*(const Vector2<T>& left, T right)
-        {
-            return right * left;
-        }
-
-        friend Vector2<T> operator/(const Vector2<T>& left, T right)
-        {
-            return Vector2<T>(left.x / right, left.y / right);
-        }
-
-        friend Vector2<T>& operator+=(Vector2<T>& left, const Vector2<T>& right)
-        {
-            left.x += right.x;
-            left.y += right.y;
-            return left;
-        }
-
-        friend Vector2<T>& operator-=(Vector2<T>& left, const Vector2<T>& right)
-        {
-            left += (-right);
-            return left;
-        }
-
-        friend Vector2<T>& operator*=(Vector2<T>& left, const Vector2<T>& right)
-        {
-            left.x *= right.x;
-            left.y *= right.y;
-            return left;
-        }
-
-        friend Vector2<T>& operator/(Vector2<T>& left, const Vector2<T>& right)
-        {
-            left.x /= right.x;
-            left.y /= right.y;
-            return left;
-        }
-
-        friend Vector2<T>& operator*=(Vector2<T>& left, T right)
-        {
-            left.x *= right;
-            left.y *= right;
-            return left;
-        }
-
-        friend Vector2<T>& operator/=(Vector2<T>& left, T right)
-        {
-            left.x /= right;
-            left.y /= right;
-            return left;
-        }
-
-        friend bool operator==(const Vector2<T>& left, const Vector2<T>& right)
-        {
-            return ((left.x == right.x) && (left.y == right.y));
-        }
-
-        friend bool operator!=(const Vector2<T>& left, const Vector2<T>& right)
-        {
-            return !(left == right);
-        }
-    };
-
-
-    
-
-    //################## TYPEDEFS ##################//
-
-    typedef Vector2<unsigned int>   Vector2u, Vec2u;
-    typedef Vector2<int>            Vector2i, Vec2i;
-    typedef Vector2<float>          Vector2f, Vec2f;
-    typedef Vector2<double>         Vector2d, Vec2d;
-}
\ No newline at end of file
diff --git a/SDLU/structures/Window.cpp b/SDLU/structures/Window.cpp
deleted file mode 100644
index 15c2e8b..0000000
--- a/SDLU/structures/Window.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-#include "Window.hpp"
-
-#include <cstring>
-
-#include <exceptions/Exceptions.hpp>
-
-namespace sdlu
-{
-    Window::Window() :
-        window(nullptr)
-    {
-        // Empty
-    }
-
-    Window::Window(Vector2u dimension, const std::string& title, Uint32 windowFlags) :
-        Window()
-    {
-        Create(dimension, title, windowFlags);
-    }
-
-    Window::~Window()
-    {
-        Close();
-    }
-
-    void Window::Create(Vector2u dimension, const std::string& title, Uint32 windowFlags)
-    {
-        // Don't create a window when it already exists
-        RETURN_IF_NOT_NULLPTR(window);
-
-        window = SDL_CreateWindow(title.c_str(),
-            SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED,
-            dimension.x, dimension.y,
-            windowFlags);
-
-        THROW_IF(IS_NULLPTR(window),
-            ObjectCreationException("Failed to create SDL_Window. \nSDL_GetError(): " + std::string(SDL_GetError())));
-
-        OnCreate();
-    }
-
-    void Window::Close()
-    {
-        // Don't destroy a window that doesn't exist
-        RETURN_IF_NULLPTR(window);
-
-        SDL_DestroyWindow(window);
-        window = nullptr;
-
-        OnClose();
-    }
-
-    bool Window::IsOpen() const
-    {
-        RETURN_IF_NULLPTR(window, false);
-        return (!SDL_GetWindowID(window) ? false : true);
-    }
-
-    bool Window::PollEvent(SDL_Event* event)
-    {
-        RETURN_IF_NULLPTR(window, false);
-        // Handle events before the user in case a derived
-        // class decides to block the event.
-        while (SDL_PollEvent(event))
-        {
-            switch (event->window.event)
-            {
-            case SDL_WINDOWEVENT_RESIZED: if (!OnResize()) return true; break;
-            default: return true;
-            }
-        }
-
-        event = NULL;
-        return false;
-    }
-
-    bool Window::WaitEvent(SDL_Event* event)
-    {
-        while (!PollEvent(event)) continue;
-        return true;
-    }
-
-    Vector2i Window::GetPosition() const
-    {
-        RETURN_IF_NULLPTR(window, Vector2i());
-
-        int x = 0, y = 0;
-        SDL_GetWindowPosition(window, &x, &y);
-        return Vector2i(x, y);
-    }
-
-    void Window::SetPosition(Vector2i position)
-    {
-        RETURN_IF_NULLPTR(window);
-
-        SDL_SetWindowPosition(window, position.x, position.y);
-    }
-
-    void Window::SetPosition(int x, int y)
-    {
-        RETURN_IF_NULLPTR(window);
-
-        SDL_SetWindowPosition(window, x, y);
-    }
-
-    Vector2u Window::GetSize() const
-    {
-        RETURN_IF_NULLPTR(window, Vector2u());
-
-        int x = 0, y = 0;
-        SDL_GetWindowSize(window, &x, &y);
-        return Vector2u(x, y);
-    }
-
-    void Window::SetSize(Vector2u size)
-    {
-        RETURN_IF_NULLPTR(window);
-
-        SDL_SetWindowSize(window, size.x, size.y);
-    }
-
-    void Window::SetSize(unsigned int width, unsigned int height)
-    {
-        RETURN_IF_NULLPTR(window);
-
-        SDL_SetWindowSize(window, width, height);
-    }
-
-    std::string Window::GetTitle() const
-    {
-        RETURN_IF_NULLPTR(window, "");
-
-        return SDL_GetWindowTitle(window);
-    }
-
-    void Window::SetTitle(std::string title)
-    {
-        RETURN_IF_NULLPTR(window);
-
-        SDL_SetWindowTitle(window, title.c_str());
-    }
-
-    SDL_Window* const Window::GetWindow() const
-    {
-        return window;
-    }
-
-    void Window::SetVisible(bool visible)
-    {
-        RETURN_IF_NULLPTR(window);
-        if (visible)
-            SDL_ShowWindow(window);
-        else
-            SDL_HideWindow(window);
-    }
-
-    void Window::SetVsync(bool vsync)
-    {
-        // SDL actually doesn't allow you to change the VSync
-        // flag of a Renderer after it's been created. This
-        // Changes it globally for all other windows
-        SDL_GL_SetSwapInterval(vsync);
-    }
-
-    void Window::SetMouseCursorVisible(bool visible)
-    {
-        SDL_ShowCursor(visible);
-    }
-
-    void Window::SetMouseCursorGrabbed(bool grabbed)
-    {
-        SDL_SetWindowGrab(window, grabbed ? SDL_TRUE : SDL_FALSE);
-    }
-
-    void Window::SetIcon(Uint32 width, Uint32 height, const Uint8* pixels)
-    {
-        size_t size = static_cast<size_t>(width) * static_cast<size_t>(height) * 4;
-        void* _pixels = malloc(size);
-        memcpy(_pixels, pixels, size);
-        SDL_Surface* surface = SDL_CreateRGBSurfaceWithFormatFrom(_pixels,
-            width, height, 32, 32 * width,
-            SDL_PIXELFORMAT_RGBA8888);
-
-        SDL_SetWindowIcon(window, surface);
-    }
-
-    void Window::SetIcon(Uint32 width, Uint32 height, const Uint32* pixels)
-    {
-        size_t size = static_cast<size_t>(width) * static_cast<size_t>(height) * 4;
-        void* _pixels = malloc(size);
-        memcpy(_pixels, pixels, size);
-        SDL_Surface* surface = SDL_CreateRGBSurfaceWithFormatFrom(_pixels,
-            width, height, 32, 4 * width,
-            SDL_PIXELFORMAT_RGBA8888);
-
-        SDL_SetWindowIcon(window, surface);
-    }
-
-    void Window::SetIcon(SDL_Surface* icon)
-    {
-        SDL_SetWindowIcon(window, icon);
-    }
-
-    void Window::SetMouseCursor(SDL_Cursor* cursor)
-    {
-        SDL_SetCursor(cursor);
-    }
-
-    void Window::SetMouseCursor(SDL_SystemCursor cursor)
-    {
-        SDL_Cursor* _cursor = SDL_CreateSystemCursor(cursor);
-        SDL_SetCursor(_cursor);
-    }
-
-    void Window::SetMouseCursor(SDL_Surface* surface, Vector2u clickspot)
-    {
-        SDL_Cursor* _cursor = SDL_CreateColorCursor(surface, clickspot.x, clickspot.y);
-        SDL_SetCursor(_cursor);
-    }
-
-    void Window::SetMouseCursor(const Uint8* pixels, Vector2u size, Vector2u clickspot)
-    {
-        size_t _size = static_cast<size_t>(size.x) * static_cast<size_t>(size.y) * 4;
-        void* _pixels = malloc(_size);
-        memcpy(_pixels, pixels, _size);
-        SDL_Surface* surface = SDL_CreateRGBSurfaceWithFormatFrom(_pixels,
-            size.x, size.y, 32, 8 * size.x, SDL_PIXELFORMAT_RGBA8888);
-        this->SetMouseCursor(surface, clickspot);
-    }
-
-    void Window::SetMouseCursor(const Uint32* pixels, Vector2u size, Vector2u clickspot)
-    {
-        size_t _size = static_cast<size_t>(size.x) * static_cast<size_t>(size.y) * 4;
-        void* _pixels = malloc(_size);
-        memcpy(_pixels, pixels, _size);
-        SDL_Surface* surface = SDL_CreateRGBSurfaceWithFormatFrom(_pixels,
-            size.x, size.y, 32, 8 * size.x, SDL_PIXELFORMAT_RGBA32);
-        this->SetMouseCursor(surface, clickspot);
-    }
-
-    void Window::OnCreate()
-    {
-    }
-
-    bool Window::OnResize()
-    {
-        return false;
-    }
-
-    void Window::OnClose()
-    {
-    }
-}
\ No newline at end of file
diff --git a/SDLU/structures/Window.hpp b/SDLU/structures/Window.hpp
deleted file mode 100644
index 1d27661..0000000
--- a/SDLU/structures/Window.hpp
+++ /dev/null
@@ -1,269 +0,0 @@
-/**
- * @file Window.hpp
- * @brief Contains window related objects
- * @author Lauchmelder23
- * @date 20.05.20
- */
-#pragma once
-#include <string>
-#include <chrono>
-
-#include <SDL.h>
-
-#include <Util.hpp>
-#include <structures/Vector2.hpp>
-
-
-namespace sdlu
-{
-    /**
-     * @brief Stores information about a window. You probably want RenderWindow.
-     */
-    class Window
-    {
-    public:
-        /**
-         * @brief Default Constructor. No window is created.
-         */
-        Window();
-
-        /**
-         * @brief Creates a window with the given parameters
-         *
-         * @param[in] dimension A vector containing the width and height
-         * @param[in] title     The title of the create window
-         */
-        Window(Vector2u dimension, const std::string& title,
-            Uint32 windowFlags);
-
-        Window(const Window& other) = delete;
-        Window(Window&& other) = delete;
-
-        virtual ~Window();
-
-        /**
-         * @brief Creates the window.
-         *
-         * This function creates the SDL_Window object. If
-         * they were already created the function does nothing and returns.
-         * If it fails to create either, an ObjectCreationException is thrown.
-         *
-         * @param[in] dimension A vector containing the width and height
-         * @param[in] title     The title of the create window
-         */
-        void Create(Vector2u dimension, const std::string& title,
-            Uint32 windowFlags);
-
-        /**
-         * @brief Destroys the window.
-         */
-        void Close();
-
-        /**
-         * @brief Wether or not the window object is created
-         *
-         * @return True if the window is open, False if not
-         */
-        bool IsOpen() const;
-
-        /**
-         * @brief A non-blocking event polling function
-         *
-         * @param[out] event An object to write the latest event to
-         * @return True if there was an event, False if there wasn't
-         */
-        bool PollEvent(SDL_Event* event);
-
-        /**
-         * @brief A blocking event polling function
-         *
-         * @param[out] event An object to write the latest event to
-         * @return True if an event was polled
-         */
-        bool WaitEvent(SDL_Event* event);
-
-
-        /**
-         * @brief Returns the current position of the window
-         *
-         * @return A vector with the current position relative to the top left corner of the display
-         */
-        Vector2i GetPosition() const;
-
-        /**
-         * @brief Sets a new window position
-         *
-         * @param[in] position A vector with the new position
-         */
-        void SetPosition(Vector2i position);
-
-        /**
-         * @brief Sets a new window position
-         *
-         * @param[in] x The new x position
-         * @param[in] y The new y position
-         */
-        void SetPosition(int x, int y);
-
-
-        /**
-         * @brief Gets the current window size
-         *
-         * @return A vector with the windows size
-         */
-        Vector2u GetSize() const;
-
-        /**
-         * @brief Sets a new window size
-         *
-         * @param[in] size A vector with the new size
-         */
-        void SetSize(Vector2u size);
-
-        /**
-         * @brief Sets a new window size
-         *
-         * @param[in] width  The new width of the window
-         * @param[in] height The new height of the window
-         */
-        void SetSize(unsigned int width, unsigned int height);
-
-
-        /**
-         * @brief Gets the current window title
-         *
-         * @return The title of the widnow
-         */
-        std::string GetTitle() const;
-
-        /**
-         * @brief Sets a new window title
-         *
-         * @param[in] title The new window title
-         */
-        void SetTitle(std::string title);
-
-        /**
-         * @brief Returns a constant pointer to the SDL_Window
-         *
-         * @return A constant pointer to SDL_Window
-         */
-        SDL_Window* const GetWindow() const;
-
-        /**
-         * @brief Set the windows visibility
-         *
-         * @param[in] visible The new visibility setting
-         */
-        void SetVisible(bool visible);
-
-        /**
-         * @brief (De)activates VSync !globally!
-         *
-         * @param[in] vsync Wether to enable or disable vsync
-         */
-        void SetVsync(bool vsync);
-
-        /**
-         * @brief Hides/Shows the mouse cursor inside the windos
-         *
-         * @param[in] visible The new visibility of the cursor
-         */
-        void SetMouseCursorVisible(bool visible);
-
-        /**
-         * @brief Traps the mouse cursor inside the window
-         *
-         * @param[in] grabbed Wether to (un)trap the cursor
-         */
-        void SetMouseCursorGrabbed(bool grabbed);
-
-        /**
-         * @brief Sets the window icon to an array of RGBA values
-         *
-         * @param[in] width  Width of the icon (in px)
-         * @param[in] height Height of the icon (in px)
-         * @param[in] pixels Array of color data (RGBA as seperate 8-Bit integer values)
-         */
-        void SetIcon(Uint32 width, Uint32 height, const Uint8* pixels);
-
-        /**
-         * @brief Sets the window icon to an array of RGBA values
-         *
-         * @param[in] width  Width of the icon (in px)
-         * @param[in] height Height of the icon (in px)
-         * @param[in] pixels Array of color data (RGBA as one 32-Bit integer value)
-         */
-        void SetIcon(Uint32 width, Uint32 height, const Uint32* pixels);
-
-        /**
-         * @brief Sets the window icon to a SDL_Surface
-         *
-         * @param[in] icon A SDL_Surface* holding the icon data
-         */
-        void SetIcon(SDL_Surface* icon);
-
-        /**
-         * @brief Changes the mouse cursor
-         *
-         * @param[in] cursor A pointer to a SDL_Cursor containing cursor data
-         */
-        void SetMouseCursor(SDL_Cursor* cursor);
-
-        /**
-         * @brief Changes the mouse cursor
-         *
-         * @param[in] cursor An enum for a system cursor
-         */
-        void SetMouseCursor(SDL_SystemCursor cursor);
-
-        /**
-         * @brief Changes the mouse cursor
-         *
-         * @param[in] surface   A pointer to a SDL_Surface containing sprite data
-         * @param[in] clickspot The effective position of the cursor relative to the top left of the sprite
-         */
-        void SetMouseCursor(SDL_Surface* surface, Vector2u clickspot);
-
-        /**
-         * @brief Changes the mouse cursor
-         *
-         * @param[in] pixels    An array of color data (RGBA as seperate 8-bit values)
-         * @param[in] size      Size of the cursor
-         * @param[in] clickspot The effective position of the cursor relative to the top left of the sprite
-         */
-        void SetMouseCursor(const Uint8* pixels, Vector2u size, Vector2u clickspot);
-
-        /**
-         * @brief Changes the mouse cursor
-         *
-         * @param[in] pixels    An array of color data (RGBA as one 32-bit value)
-         * @param[in] size      Size of the cursor
-         * @param[in] clickspot The effective position of the cursor relative to the top left of the sprite
-         */
-        void SetMouseCursor(const Uint32* pixels, Vector2u size, Vector2u clickspot);
-
-    protected:
-        SDL_Window* window;
-
-    protected:
-        /**
-        * @brief This function is called after Create() finishes
-        */
-        virtual void OnCreate();
-
-        /**
-            * @brief This function is called after a SDL_WINDOWEVENT_RESIZED is polled.
-            *        (PollEvent() must be called for this to work)
-            *
-            * @return True if the resize event should be popped from the event queue before
-                    returning the polled event to the user
-            */
-        virtual bool OnResize();
-
-        /**
-            * @brief This function is called after Close() finishes.
-            */
-        virtual void OnClose();
-    };
-}
\ No newline at end of file
diff --git a/SDLU_Example/CMakeLists.txt b/SDLU_Example/CMakeLists.txt
deleted file mode 100644
index a27877a..0000000
--- a/SDLU_Example/CMakeLists.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-set(PNAME sdlu_example)
-
-add_executable(${PNAME}
-	main.cpp header.hpp
-)
-
-target_include_directories(${PNAME} PRIVATE
-	${PROJECT_SOURCE_DIR}/SDLU
-	${PROJECT_SOURCE_DIR}/3rdparty/SDL/include
-)
-
-target_link_libraries(${PNAME}
-	SDLU
-	SDL2
-	SDL2main
-)
-
-add_custom_command(TARGET ${PNAME}
-	POST_BUILD
-	COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:SDL2> $<TARGET_FILE_DIR:sdlu_example>
-)
\ No newline at end of file
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 0000000..65de7aa
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_executable(sdlu_example
+	main.cpp header.hpp
+)
+
+target_include_directories(sdlu_example PUBLIC
+	sdlu
+)
+
+target_link_libraries(sdlu_example
+	sdlu
+)
+
+# add_custom_command(TARGET sdlu_example POST_BUILD
+	# COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:SDL2> $<TARGET_FILE_DIR:sdlu_example>
+# )
\ No newline at end of file
diff --git a/SDLU_Example/header.hpp b/examples/header.hpp
similarity index 91%
rename from SDLU_Example/header.hpp
rename to examples/header.hpp
index e9fa40b..6d7247b 100644
--- a/SDLU_Example/header.hpp
+++ b/examples/header.hpp
@@ -1,7 +1,6 @@
-#include <SDLU.hpp>
 #pragma once
-#include "SDLU.hpp"
 #include <iostream>
+#include <SDLU.hpp>
 
 class MyWindow :
     public sdlu::RenderWindow
@@ -9,7 +8,7 @@ class MyWindow :
 public:
     MyWindow(Uint32 width, Uint32 height, const char* title) :
         RenderWindow(sdlu::Vector2u(width, height), title, 
-            SDL_WINDOW_RESIZABLE)
+            32)
     {
         // Empty
     }
diff --git a/SDLU_Example/main.cpp b/examples/main.cpp
similarity index 93%
rename from SDLU_Example/main.cpp
rename to examples/main.cpp
index 2844e0d..896b693 100644
--- a/SDLU_Example/main.cpp
+++ b/examples/main.cpp
@@ -1,11 +1,13 @@
 #include "header.hpp"
 
+#include <SDL_events.h>
+
 #include <math.h>
 #include <cmath>
 
 int main(int argc, char** argv)
 {
-    SDL_Init(SDL_INIT_VIDEO);
+    sdlu::Initialize();
 
     Uint32* icon_data = new Uint32[64 * 64];
     for (int y = 0; y < 64; y++)
@@ -22,10 +24,10 @@ int main(int argc, char** argv)
     Uint64 diff = 1;
 
     MyWindow window(800, 800, "Test");
-    SDL_SetWindowTitle(window.GetWindow(), "New Title");
+    window.SetTitle("New Title");
 
     window.SetIcon(64, 64, icon_data);
-    window.SetMouseCursor(SDL_SYSTEM_CURSOR_CROSSHAIR);
+    // window.SetMouseCursor(SDL_SYSTEM_CURSOR_CROSSHAIR);
     window.SetMaxFramerate(144);
 
     SDL_Event event;
@@ -73,7 +75,6 @@ int main(int argc, char** argv)
         start = std::chrono::steady_clock::now();
     }
 
-    SDL_Quit();
-
+    sdlu::Quit();
     return 0;
 }
\ No newline at end of file
diff --git a/include/SDLU.hpp b/include/SDLU.hpp
new file mode 100644
index 0000000..cb049e6
--- /dev/null
+++ b/include/SDLU.hpp
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <graphics/Graphics.hpp>
+#include <structures/Mouse.hpp>
+
+namespace sdlu {
+	// TODO: Eventually we should initialize things once the object gets created
+	extern int Initialize();
+	extern void Quit();
+}
\ No newline at end of file
diff --git a/SDLU/Util.hpp b/include/Util.hpp
similarity index 69%
rename from SDLU/Util.hpp
rename to include/Util.hpp
index bff7acf..1d4a087 100644
--- a/SDLU/Util.hpp
+++ b/include/Util.hpp
@@ -24,4 +24,10 @@ typedef uint32_t Uint32;
 typedef int32_t  Int32;
 
 typedef uint64_t Uint64;
-typedef int64_t  Int64;
\ No newline at end of file
+typedef int64_t  Int64;
+
+#define THROW_IF( condition, exception ) ( (condition) ? throw exception : false ) // throws `exception` when `condition` is true; argument parenthesized so compound expressions evaluate as a unit
+#define THROW_IF_NOT( condition, exception ) ( THROW_IF(!(condition), exception) ) // throws `exception` when `condition` is false; negation applies to the whole condition, not just its first operand
+
+#define SDLU_BEGIN	namespace sdlu {
+#define SDLU_END	}
\ No newline at end of file
diff --git a/SDLU/graphics/Graphics.hpp b/include/graphics/Graphics.hpp
similarity index 100%
rename from SDLU/graphics/Graphics.hpp
rename to include/graphics/Graphics.hpp
diff --git a/include/graphics/RenderTarget.hpp b/include/graphics/RenderTarget.hpp
new file mode 100644
index 0000000..a394322
--- /dev/null
+++ b/include/graphics/RenderTarget.hpp
@@ -0,0 +1,79 @@
+/**
+ * @file RenderTarget.hpp
+ * @brief Contains rendering related objects
+ * @author Lauchmelder23
+ * @date 20.05.2020
+ */
+#pragma once
+#include <chrono>
+
+#include "structures/Color.hpp"
+#include "graphics/drawable/Drawable.hpp"
+
+struct SDL_Window;
+struct SDL_Surface;
+
+SDLU_BEGIN
+/**
+    * @brief Acts as a wrapper for SDL_Renderer*. You can't (and shouldn't)
+    *        instantiate this, but rather derive from it.
+    */ 
+class RenderTarget
+{
+public:
+    virtual ~RenderTarget();
+
+    /**
+        * @brief Clears the display
+        *
+        * @param[in] color The color to clear the display with
+        */
+    void Clear(const Color& color = Color::Black);
+
+    /**
+        * @brief Draws a sdlu::Drawable to the SDL_Renderer
+        *
+        * @param[in] drawable A reference to a derived class of Drawable
+        */
+    void Draw(const Drawable& drawable);
+
+    /**
+        * @brief Display the current state of the renderer to the screen
+        */
+    void Display();
+
+    /**
+        * @brief Sets a maximum framerate on the display function
+        *
+        * If the maximum framerate is not 0, SDL_Delay() will be called
+        * after each Display() to ensure that the time between displays
+        * is not shorter than the framerate limit.
+        *
+        * @param[in] max The new maximum framerate
+        */
+    void SetMaxFramerate(Uint32 max);
+
+protected:
+    /**
+        * @brief Create Renderer and bind it to a window
+        * 
+        * @param[in] target The SDL_Window to bind to
+        */
+    RenderTarget(SDL_Window* target);
+
+    /**
+        * @brief Create Renderer and bind it to a surface
+        *
+        * @param[in] target The SDL_Surface to bind to
+        */
+    RenderTarget(SDL_Surface* target);
+
+protected:
+    SDL_Renderer* renderer; ///< The renderer object
+
+private:
+    Uint32 m_oFramerate; ///< The current maximum framerate of the window (0 = unlimited)
+
+    std::chrono::steady_clock::time_point m_oTimeSinceLastDisplay; ///< The timepoint at which Display() was last called
+};
+SDLU_END
\ No newline at end of file
diff --git a/include/graphics/RenderWindow.hpp b/include/graphics/RenderWindow.hpp
new file mode 100644
index 0000000..ef20162
--- /dev/null
+++ b/include/graphics/RenderWindow.hpp
@@ -0,0 +1,69 @@
+/**
+ * @file RenderWindow.hpp
+ * @brief A wrapper around SDL_Window and SDL_Renderer
+ * @author Lauchmelder23
+ * @date 16.05.2020
+ */
+#pragma once
+#include <string>
+#include <chrono>
+
+#include "structures/Vector2.hpp"
+#include "structures/Color.hpp"
+#include "structures/Window.hpp"
+#include "graphics/RenderTarget.hpp"
+
+SDLU_BEGIN
+// TODO: Probably break up into sdlu::Window and sdlu::Renderer
+// to avoid passing around the Renderer when only the Window is
+// needed. (See Mouse::GetPosition for example)
+
+/**
+    * @brief A class that handles window related functionality
+    *
+    * A class that combines the SDL_Window and SDL_Renderer and
+    * behaves similar to the sf::RenderWindow from SFML. It provides
+    * utility and wrappers for common operations on those objects.
+    */
+class RenderWindow : public Window, public RenderTarget
+{
+public:
+    /**
+        * @brief Default Constructor. No window or renderer is created.
+        */
+    RenderWindow();
+
+    /**
+        * @brief Creates a window and renderer with the given parameters
+        *
+        * @param[in] dimension A vector containing the width and height
+        * @param[in] title     The title of the created window
+        */
+    RenderWindow(Vector2u dimension, const std::string& title, 
+        Uint32 windowFlags = Window::Flags::Shown);
+
+    RenderWindow(const RenderWindow& other) = delete; // A RenderWindow cannot be copied
+    RenderWindow(RenderWindow&& other) = delete;      // ...nor moved (same signature form as Window and Drawable)
+
+    virtual ~RenderWindow();
+
+protected:
+    /**
+        * @brief Function called after Window creation
+        */
+    virtual void OnCreate();
+
+    /**
+        * @brief Function called after resize event
+        *
+        * @return True if the resize event should not be returned via
+        *         PollEvent()
+        */
+    virtual bool OnResize();
+
+    /**
+        * @brief Function called after closing the window
+        */
+    virtual void OnClose();
+};
+SDLU_END
\ No newline at end of file
diff --git a/include/graphics/drawable/Drawable.hpp b/include/graphics/drawable/Drawable.hpp
new file mode 100644
index 0000000..d442dee
--- /dev/null
+++ b/include/graphics/drawable/Drawable.hpp
@@ -0,0 +1,30 @@
+/**
+ * @file Drawable.hpp
+ * @brief The base class of everything renderable by RenderTarget
+ * @author Lauchmelder23
+ * @date 20.05.2020
+ */
+#pragma once
+
+#include "Util.hpp"
+
+struct SDL_Renderer;
+
+SDLU_BEGIN
+/** 
+    * @brief Everything that can be rendered derives from this class.
+    */
+class Drawable
+{
+public:
+    Drawable(const Drawable& other) = delete; // Drawables cannot be copy-constructed
+    Drawable(Drawable&& other) = delete; // ...nor move-constructed
+    Drawable& operator=(const Drawable& other) = delete; // ...nor copy-assigned
+
+    friend class RenderTarget; // Gives RenderTarget access to the protected Draw() below
+
+protected:
+    Drawable() { } // Protected: only derived classes can construct a Drawable
+    virtual void Draw(SDL_Renderer* const target) const = 0; // Implemented by each derived class. NOTE(review): no virtual destructor is declared, so deleting a derived object through a Drawable* would be undefined behaviour - consider adding one
+};
+SDLU_END
\ No newline at end of file
diff --git a/include/graphics/drawable/Transformable.hpp b/include/graphics/drawable/Transformable.hpp
new file mode 100644
index 0000000..1381c8f
--- /dev/null
+++ b/include/graphics/drawable/Transformable.hpp
@@ -0,0 +1,157 @@
+/**
+ * @file Transformable.hpp
+ * @brief Contains information for transformable objects
+ * @author Lauchmelder23
+ * @date 23.05.2020
+ */
+#pragma once
+
+#include "structures/Vector2.hpp"
+
+SDLU_BEGIN
+/**
+    * @brief A class that stores locational information
+    *
+    * Stores position, rotation, scale and the origin of an
+    * object, and provides functions to get/set those values.
+    * "Origin" is the offset between the position and the top-left
+    * corner of the object.
+    */
+class Transformable
+{
+public:
+    /**
+        * @brief Default constructor
+        */
+    Transformable();
+
+    /**
+        * @brief Destructor
+        */
+    virtual ~Transformable();
+
+    /**
+        * @brief Returns the position of the object
+        *
+        * @return A 2D vector of the position
+        */
+    Vector2f GetPosition();
+
+    /**
+        * @brief Sets a new position
+        * 
+        * @param[in] position A 2D vector with the new position
+        */
+    void SetPosition(const Vector2f& position);
+
+    /**
+        * @brief Sets a new position
+        * 
+        * @param[in] x The new x position
+        * @param[in] y The new y position
+        */
+    void SetPosition(float x, float y);
+
+    /**
+        * @brief Adds to the current position
+        *
+        * @param[in] position A 2D movement vector
+        */
+    void Move(const Vector2f& position);
+
+    /**
+        * @brief Adds to the current position
+        *
+        * @param[in] x The offset in x direction
+        * @param[in] y The offset in y direction
+        */
+    void Move(float x, float y);
+
+
+    /**
+        * @brief Gets the current local origin
+        *
+        * @return A 2D vector with the offset
+        */
+    Vector2f GetOrigin();
+
+    /**
+        * @brief Sets a new local origin
+        *
+        * @param[in] origin A 2D vector with the new origin
+        */
+    void SetOrigin(const Vector2f& origin);
+
+    /**
+        * @brief Sets a new local origin
+        *
+        * @param[in] x The new x component of the origin
+        * @param[in] y The new y component of the origin
+        */
+    void SetOrigin(float x, float y);
+
+    /**
+        * @brief Gets the current scale of the object
+        * 
+        * @return A 2D vector with the scale in x- and y-direction
+        */
+    Vector2f GetScale();
+
+    /**
+        * @brief Sets a new scale
+        *
+        * @param[in] scale A 2D vector with the new scale
+        */
+    void SetScale(const Vector2f& scale);
+
+    /**
+        * @brief Sets a new scale
+        * 
+        * @param[in] x The new scale in x direction
+        * @param[in] y The new scale in y direction
+        */
+    void SetScale(float x, float y);
+
+    /**
+        * @brief Scales the object by some amount
+        *
+        * @param[in] scale The amount to scale by in x- and y-direction
+        */
+    void Scale(const Vector2f& scale);
+
+    /**
+        * @brief Scales the object by some amount
+        * 
+        * @param[in] x The amount to scale by in x direction
+        * @param[in] y The amount to scale by in y direction
+        */
+    void Scale(float x, float y);
+
+    /**
+        * @brief Gets the current rotation
+        *
+        * @return The rotation in degrees
+        */
+    float GetRotation();
+
+    /**
+        * @brief Sets a new rotation
+        *
+        * @param[in] angle The new rotation in degrees
+        */
+    void SetRotation(float angle);
+
+    /**
+        * @brief Rotates by some amount
+        *
+        * @param[in] angle The angle to rotate by in degrees
+        */
+    void Rotate(float angle);
+
+protected:
+    Vector2f position; ///< Position of the object
+    Vector2f origin;   ///< Offset of the top-left corner from the position
+    Vector2f scale;    ///< Scale of the object
+    float rotation;    ///< Rotation of the object (in degrees)
+};
+SDLU_END
\ No newline at end of file
diff --git a/SDLU/graphics/drawable/shapes/Rectangle.hpp b/include/graphics/drawable/shapes/Rectangle.hpp
similarity index 98%
rename from SDLU/graphics/drawable/shapes/Rectangle.hpp
rename to include/graphics/drawable/shapes/Rectangle.hpp
index fea0612..8115fae 100644
--- a/SDLU/graphics/drawable/shapes/Rectangle.hpp
+++ b/include/graphics/drawable/shapes/Rectangle.hpp
@@ -8,8 +8,6 @@
 
 #include "Shape.hpp"
 
-#include <SDL.h>
-
 namespace sdlu
 {
     class Rectangle :
diff --git a/include/graphics/drawable/shapes/Shape.hpp b/include/graphics/drawable/shapes/Shape.hpp
new file mode 100644
index 0000000..b394bd1
--- /dev/null
+++ b/include/graphics/drawable/shapes/Shape.hpp
@@ -0,0 +1,45 @@
+/**
+ * @file Shape.hpp
+ * @brief The base class for all native SDLU shapes
+ * @author Lauchmelder23
+ * @date 23.05.2020
+ */
+#pragma once
+
+#include "graphics/drawable/Drawable.hpp"
+#include "graphics/drawable/Transformable.hpp"
+#include "structures/Color.hpp"
+
+SDLU_BEGIN
+/**
+    * @brief The non-instantiable base class for all SDLU shapes
+    */
+class Shape :
+    public Drawable, public Transformable
+{
+public:
+    /**
+        * @brief Destructor
+        */
+    virtual ~Shape();
+
+    /**
+        * @brief Sets the color of the shape
+        */
+    void SetColor(const Color& color);
+
+    /**
+        * @brief Gets the color of the shape
+        */
+    Color GetColor();
+
+protected:
+    /**
+        * @brief Default constructor
+        */
+    Shape();
+
+protected:
+    Color color;
+};
+SDLU_END
\ No newline at end of file
diff --git a/SDLU/structures/Color.hpp b/include/structures/Color.hpp
similarity index 99%
rename from SDLU/structures/Color.hpp
rename to include/structures/Color.hpp
index b1f8387..3456da6 100644
--- a/SDLU/structures/Color.hpp
+++ b/include/structures/Color.hpp
@@ -6,10 +6,9 @@
  */
 #pragma once
 
-#include <Util.hpp>
+#include "Util.hpp"
 
-namespace sdlu
-{
+SDLU_BEGIN
     /**
      * @brief A structure holding color data
      *
@@ -167,4 +166,4 @@ namespace sdlu
          */
         friend bool operator!=(const Color& left, const Color& right);
     };
-}
\ No newline at end of file
+SDLU_END
\ No newline at end of file
diff --git a/include/structures/Mouse.hpp b/include/structures/Mouse.hpp
new file mode 100644
index 0000000..7fa76ed
--- /dev/null
+++ b/include/structures/Mouse.hpp
@@ -0,0 +1,75 @@
+/**
+ * @file Mouse.hpp
+ * @brief A static class to provide easy handling of the mouse
+ * @author Lauchmelder23
+ * @date 19.05.2020
+ */
+#pragma once
+#include "structures/Vector2.hpp"
+#include "graphics/RenderWindow.hpp"
+
+SDLU_BEGIN
+/**
+    * @brief A static class that contains/handles data about
+    *        mouse position and button states
+    */
+class Mouse
+{
+public:
+    /**
+        * @brief Mouse buttons
+        */
+    enum class Button {
+        Left = 1,
+        Right = 2,
+        Middle = 3,
+        XButton1 = 4,
+        XButton2 = 5
+    };
+        
+    /**
+        * @brief Returns the current mouse button state
+        *
+        * @return A 32-bit mask of the current button state
+        */
+    static Uint32 GetButtonState();
+
+    /**
+        * @brief Checks if a specific button is pressed
+        *
+        * @param[in] button The button to check
+        * @return True if the button is pressed
+        */
+    static bool IsButtonDown(Button button);
+
+    /**
+        * @brief Gets the absolute position of the mouse
+        *
+        * @return Current mouse position relative to screen
+        */
+    static Vector2i GetPosition();
+
+    /**
+        * @brief Gets current relative position of the mouse
+        *
+        * @param[in] relativeTo The window the mouse position should be relative to
+        * @return The position of the mouse relative to the top left of the passed window object
+        */
+    static Vector2i GetPosition(const RenderWindow& relativeTo);
+
+    /**
+        * @brief Sets the absolute position of the mouse
+        *
+        * @param[in] position A 2D vector of the new position
+        */
+    static void SetPosition(const Vector2i& position);
+
+    /**
+        * @brief Sets current relative position of the mouse
+        *
+        * @param[in] position A 2D vector of the new position
+        * @param[in] relativeTo The window the mouse position should be relative to
+        */
+    static void SetPosition(const Vector2i& position, const RenderWindow& relativeTo);
+};
+SDLU_END
\ No newline at end of file
diff --git a/include/structures/Vector2.hpp b/include/structures/Vector2.hpp
new file mode 100644
index 0000000..64f2221
--- /dev/null
+++ b/include/structures/Vector2.hpp
@@ -0,0 +1,154 @@
+/**
+ * @file Vector2.hpp
+ * @brief Provides a structure for simple vector calculations
+ * @author Lauchmelder23
+ * @date 16.05.2020
+ */
+#pragma once
+#include <type_traits>
+
+#include "Util.hpp"
+
+SDLU_BEGIN
+/**
+    * @brief A struct to handle basic 2D vector operations.
+    *
+    * @tparam T The (arithmetical) type of the vector components
+    */
+template<
+    typename T,
+    typename = typename std::enable_if<std::is_arithmetic<T>::value, T>::type
+> struct Vector2
+{
+    T x; ///< x component
+    T y; ///< y component
+
+    //################## CONSTRUCTORS ##################//
+
+    /// Initializes a zero vector
+    Vector2() :
+        x(0), y(0)
+    { 
+        // Empty
+    }
+
+    /// Initializes a vector with default values
+    Vector2(T x, T y) :
+        x(x), y(y)
+    {
+        // Empty
+    }
+
+    /// Copies the components of a vector
+    Vector2(const Vector2<T>& other) :
+        x(other.x), y(other.y)
+    {
+        // Empty
+    }
+
+
+    //################## OPERATORS ##################//
+
+    friend Vector2<T> operator-(const Vector2<T>& right)
+    {
+        return Vector2<T>(-right.x, -right.y);
+    }
+
+    friend Vector2<T> operator+(const Vector2<T>& left, const Vector2<T>& right)
+    {
+        return Vector2<T>(left.x + right.x, left.y + right.y);
+    }
+
+    friend Vector2<T> operator-(const Vector2<T>& left, const Vector2<T>& right)
+    {
+        return left + (-right);
+    }
+
+    friend Vector2<T> operator*(const Vector2<T>& left, const Vector2<T>& right)
+    {
+        return Vector2<T>(left.x * right.x, left.y * right.y);
+    }
+
+    friend Vector2<T> operator/(const Vector2<T>& left, const Vector2<T>& right)
+    {
+        return Vector2<T>(left.x / right.x, left.y / right.y);
+    }
+
+    friend Vector2<T> operator*(T left, const Vector2<T>& right)
+    {
+        return Vector2<T>(left * right.x, left * right.y);
+    }
+
+    friend Vector2<T> operator*(const Vector2<T>& left, T right)
+    {
+        return right * left;
+    }
+
+    friend Vector2<T> operator/(const Vector2<T>& left, T right)
+    {
+        return Vector2<T>(left.x / right, left.y / right);
+    }
+
+    friend Vector2<T>& operator+=(Vector2<T>& left, const Vector2<T>& right)
+    {
+        left.x += right.x;
+        left.y += right.y;
+        return left;
+    }
+
+    friend Vector2<T>& operator-=(Vector2<T>& left, const Vector2<T>& right)
+    {
+        left += (-right);
+        return left;
+    }
+
+    friend Vector2<T>& operator*=(Vector2<T>& left, const Vector2<T>& right)
+    {
+        left.x *= right.x;
+        left.y *= right.y;
+        return left;
+    }
+
+    friend Vector2<T>& operator/=(Vector2<T>& left, const Vector2<T>& right) // Component-wise in-place division (was misnamed operator/, which made `a / b` modify `a`)
+    {
+        left.x /= right.x;
+        left.y /= right.y;
+        return left;
+    }
+
+    friend Vector2<T>& operator*=(Vector2<T>& left, T right)
+    {
+        left.x *= right;
+        left.y *= right;
+        return left;
+    }
+
+    friend Vector2<T>& operator/=(Vector2<T>& left, T right)
+    {
+        left.x /= right;
+        left.y /= right;
+        return left;
+    }
+
+    friend bool operator==(const Vector2<T>& left, const Vector2<T>& right)
+    {
+        return ((left.x == right.x) && (left.y == right.y));
+    }
+
+    friend bool operator!=(const Vector2<T>& left, const Vector2<T>& right)
+    {
+        return !(left == right);
+    }
+};
+
+
+    
+
+//################## TYPEDEFS ##################//
+
+typedef Vector2<unsigned int>   Vector2u, Vec2u;
+typedef Vector2<int>            Vector2i, Vec2i;
+typedef Vector2<float>          Vector2f, Vec2f;
+typedef Vector2<double>         Vector2d, Vec2d;
+
+SDLU_END
diff --git a/include/structures/Window.hpp b/include/structures/Window.hpp
new file mode 100644
index 0000000..9925cc3
--- /dev/null
+++ b/include/structures/Window.hpp
@@ -0,0 +1,279 @@
+/**
+ * @file Window.hpp
+ * @brief Contains window related objects
+ * @author Lauchmelder23
+ * @date 20.05.20
+ */
+#pragma once
+#include <string>
+#include <chrono>
+
+#include "Util.hpp"
+#include "structures/Vector2.hpp"
+
+struct SDL_Window;
+union SDL_Event;
+struct SDL_Surface;
+struct SDL_Cursor;
+
+SDLU_BEGIN
+/**
+    * @brief Stores information about a window. You probably want RenderWindow.
+    */
+class Window
+{
+public:
+    enum Flags { // Window creation bit flags; each value is a distinct bit of a Uint32 (see the windowFlags parameters below)
+        Fullscreen          = ((Uint32)1 << 0),
+        OpenGL              = ((Uint32)1 << 1),
+        Shown               = ((Uint32)1 << 2),
+        Hidden              = ((Uint32)1 << 3),
+        Borderless          = ((Uint32)1 << 4),
+        Resizable           = ((Uint32)1 << 5),
+        Minimized           = ((Uint32)1 << 6),
+        Maximized           = ((Uint32)1 << 7),
+        InputGrabbed        = ((Uint32)1 << 8),
+        InputFocus          = ((Uint32)1 << 9),
+        MouseFocus          = ((Uint32)1 << 10),
+        Foregin             = ((Uint32)1 << 11), // NOTE(review): presumably meant "Foreign"; spelling kept so existing callers still compile
+        FullscreenDesktop   = (Fullscreen | ((Uint32)1 << 12)), // Includes the Fullscreen bit plus bit 12
+    }; // NOTE(review): bit positions appear to mirror SDL's SDL_WINDOW_* flags - confirm against SDL_video.h
+public:
+    /**
+        * @brief Default Constructor. No window is created.
+        */
+    Window();
+
+    /**
+        * @brief Creates a window with the given parameters
+        *
+        * @param[in] dimension A vector containing the width and height
+        * @param[in] title     The title of the created window
+        */
+    Window(Vector2u dimension, const std::string& title,
+        Uint32 windowFlags);
+
+    Window(const Window& other) = delete;
+    Window(Window&& other) = delete;
+
+    virtual ~Window();
+
+    /**
+        * @brief Creates the window.
+        *
+        * This function creates the SDL_Window object. If
+        * it was already created the function does nothing and returns.
+        * If creating the window fails, an ObjectCreationException is thrown.
+        *
+        * @param[in] dimension A vector containing the width and height
+        * @param[in] title     The title of the created window
+        */
+    void Create(Vector2u dimension, const std::string& title,
+        Uint32 windowFlags);
+
+    /**
+        * @brief Destroys the window.
+        */
+    void Close();
+
+    /**
+        * @brief Whether or not the window object is created
+        *
+        * @return True if the window is open, False if not
+        */
+    bool IsOpen() const;
+
+    /**
+        * @brief A non-blocking event polling function
+        *
+        * @param[out] event An object to write the latest event to
+        * @return True if there was an event, False if there wasn't
+        */
+    bool PollEvent(SDL_Event* event);
+
+    /**
+        * @brief A blocking event polling function
+        *
+        * @param[out] event An object to write the latest event to
+        * @return True if an event was polled
+        */
+    bool WaitEvent(SDL_Event* event);
+
+
+    /**
+        * @brief Returns the current position of the window
+        *
+        * @return A vector with the current position relative to the top left corner of the display
+        */
+    Vector2i GetPosition() const;
+
+    /**
+        * @brief Sets a new window position
+        *
+        * @param[in] position A vector with the new position
+        */
+    void SetPosition(Vector2i position);
+
+    /**
+        * @brief Sets a new window position
+        *
+        * @param[in] x The new x position
+        * @param[in] y The new y position
+        */
+    void SetPosition(int x, int y);
+
+
+    /**
+        * @brief Gets the current window size
+        *
+        * @return A vector with the windows size
+        */
+    Vector2u GetSize() const;
+
+    /**
+        * @brief Sets a new window size
+        *
+        * @param[in] size A vector with the new size
+        */
+    void SetSize(Vector2u size);
+
+    /**
+        * @brief Sets a new window size
+        *
+        * @param[in] width  The new width of the window
+        * @param[in] height The new height of the window
+        */
+    void SetSize(unsigned int width, unsigned int height);
+
+
+    /**
+        * @brief Gets the current window title
+        *
+        * @return The title of the window
+        */
+    std::string GetTitle() const;
+
+    /**
+        * @brief Sets a new window title
+        *
+        * @param[in] title The new window title
+        */
+    void SetTitle(std::string title);
+
+    /**
+        * @brief Returns a constant pointer to the SDL_Window
+        *
+        * @return A constant pointer to SDL_Window
+        */
+    SDL_Window* const GetWindow() const;
+
+    /**
+        * @brief Set the windows visibility
+        *
+        * @param[in] visible The new visibility setting
+        */
+    void SetVisible(bool visible);
+
+    /**
+        * @brief (De)activates VSync !globally!
+        *
+        * @param[in] vsync Whether to enable or disable vsync
+        */
+    void SetVsync(bool vsync);
+
+    /**
+        * @brief Hides/Shows the mouse cursor inside the window
+        *
+        * @param[in] visible The new visibility of the cursor
+        */
+    void SetMouseCursorVisible(bool visible);
+
+    /**
+        * @brief Traps the mouse cursor inside the window
+        *
+        * @param[in] grabbed Whether to (un)trap the cursor
+        */
+    void SetMouseCursorGrabbed(bool grabbed);
+
+    /**
+        * @brief Sets the window icon to an array of RGBA values
+        *
+        * @param[in] width  Width of the icon (in px)
+        * @param[in] height Height of the icon (in px)
+        * @param[in] pixels Array of color data (RGBA as separate 8-Bit integer values)
+        */
+    void SetIcon(Uint32 width, Uint32 height, const Uint8* pixels);
+
+    /**
+        * @brief Sets the window icon to an array of RGBA values
+        *
+        * @param[in] width  Width of the icon (in px)
+        * @param[in] height Height of the icon (in px)
+        * @param[in] pixels Array of color data (RGBA as one 32-Bit integer value)
+        */
+    void SetIcon(Uint32 width, Uint32 height, const Uint32* pixels);
+
+    /**
+        * @brief Sets the window icon to a SDL_Surface
+        *
+        * @param[in] icon A SDL_Surface* holding the icon data
+        */
+    void SetIcon(SDL_Surface* icon);
+
+    /**
+        * @brief Changes the mouse cursor
+        *
+        * @param[in] cursor A pointer to a SDL_Cursor containing cursor data
+        */
+    void SetMouseCursor(SDL_Cursor* cursor);
+
+    /**
+        * @brief Changes the mouse cursor
+        *
+        * @param[in] surface   A pointer to a SDL_Surface containing sprite data
+        * @param[in] clickspot The effective position of the cursor relative to the top left of the sprite
+        */
+    void SetMouseCursor(SDL_Surface* surface, Vector2u clickspot);
+
+    /**
+        * @brief Changes the mouse cursor
+        *
+        * @param[in] pixels    An array of color data (RGBA as separate 8-bit values)
+        * @param[in] size      Size of the cursor
+        * @param[in] clickspot The effective position of the cursor relative to the top left of the sprite
+        */
+    void SetMouseCursor(const Uint8* pixels, Vector2u size, Vector2u clickspot);
+
+    /**
+        * @brief Changes the mouse cursor
+        *
+        * @param[in] pixels    An array of color data (RGBA as one 32-bit value)
+        * @param[in] size      Size of the cursor
+        * @param[in] clickspot The effective position of the cursor relative to the top left of the sprite
+        */
+    void SetMouseCursor(const Uint32* pixels, Vector2u size, Vector2u clickspot);
+
+protected:
+    SDL_Window* window;
+
+protected:
+    /**
+    * @brief This function is called after Create() finishes
+    */
+    virtual void OnCreate();
+
+    /**
+        * @brief This function is called after a SDL_WINDOWEVENT_RESIZED is polled.
+        *        (PollEvent() must be called for this to work)
+        *
+        * @return True if the resize event should be popped from the event queue before
+                returning the polled event to the user
+        */
+    virtual bool OnResize();
+
+    /**
+        * @brief This function is called after Close() finishes.
+        */
+    virtual void OnClose();
+};
+SDLU_END
\ No newline at end of file
diff --git a/lib/sdl2_gfx/CMakeLists.txt b/lib/sdl2_gfx/CMakeLists.txt
new file mode 100644
index 0000000..12d6948
--- /dev/null
+++ b/lib/sdl2_gfx/CMakeLists.txt
@@ -0,0 +1,20 @@
+file(GLOB_RECURSE sdl2gfx_includes
+	"include/*.h"
+)
+
+file(GLOB_RECURSE sdl2gfx_sources
+	"src/*.c"
+)
+
+add_library(sdl2_gfx 
+	${sdl2gfx_includes} ${sdl2gfx_sources}
+)
+
+target_include_directories(sdl2_gfx PUBLIC 
+	SDL2
+	"include"
+)
+
+target_link_libraries(sdl2_gfx PUBLIC 
+	SDL2::SDL2
+)
\ No newline at end of file
diff --git a/lib/sdl2_gfx/include/SDL2_framerate.h b/lib/sdl2_gfx/include/SDL2_framerate.h
new file mode 100644
index 0000000..cd5f9b3
--- /dev/null
+++ b/lib/sdl2_gfx/include/SDL2_framerate.h
@@ -0,0 +1,100 @@
+/*
+
+SDL2_framerate.h: framerate manager
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifndef _SDL2_framerate_h
+#define _SDL2_framerate_h
+
+/* Set up for C function definitions, even when using C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/* --- */
+
+#include "SDL.h"
+
+	/* --------- Definitions */
+
+	/*!
+	\brief Highest possible rate supported by framerate controller in Hz (1/s).
+	*/
+#define FPS_UPPER_LIMIT		200
+
+	/*!
+	\brief Lowest possible rate supported by framerate controller in Hz (1/s).
+	*/
+#define FPS_LOWER_LIMIT		1
+
+	/*!
+	\brief Default rate of framerate controller in Hz (1/s).
+	*/
+#define FPS_DEFAULT		30
+
+	/*! 
+	\brief Structure holding the state and timing information of the framerate controller. 
+	*/
+	typedef struct {
+		Uint32 framecount;
+		float rateticks;
+		Uint32 baseticks;
+		Uint32 lastticks;
+		Uint32 rate;
+	} FPSmanager;
+
+	/* ---- Function Prototypes */
+
+#ifdef _MSC_VER
+#  if defined(DLL_EXPORT) && !defined(LIBSDL2_GFX_DLL_IMPORT)
+#    define SDL2_FRAMERATE_SCOPE __declspec(dllexport)
+#  else
+#    ifdef LIBSDL2_GFX_DLL_IMPORT
+#      define SDL2_FRAMERATE_SCOPE __declspec(dllimport)
+#    endif
+#  endif
+#endif
+#ifndef SDL2_FRAMERATE_SCOPE
+#  define SDL2_FRAMERATE_SCOPE extern
+#endif
+
+	/* Functions return 0 or value for sucess and -1 for error */
+
+	SDL2_FRAMERATE_SCOPE void SDL_initFramerate(FPSmanager * manager);
+	SDL2_FRAMERATE_SCOPE int SDL_setFramerate(FPSmanager * manager, Uint32 rate);
+	SDL2_FRAMERATE_SCOPE int SDL_getFramerate(FPSmanager * manager);
+	SDL2_FRAMERATE_SCOPE int SDL_getFramecount(FPSmanager * manager);
+	SDL2_FRAMERATE_SCOPE Uint32 SDL_framerateDelay(FPSmanager * manager);
+
+	/* --- */
+
+	/* Ends C function definitions when using C++ */
+#ifdef __cplusplus
+}
+#endif
+
+#endif				/* _SDL2_framerate_h */
diff --git a/lib/sdl2_gfx/include/SDL2_gfxPrimitives.h b/lib/sdl2_gfx/include/SDL2_gfxPrimitives.h
new file mode 100644
index 0000000..57a9830
--- /dev/null
+++ b/lib/sdl2_gfx/include/SDL2_gfxPrimitives.h
@@ -0,0 +1,241 @@
+/* 
+
+SDL2_gfxPrimitives.h: graphics primitives for SDL
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifndef _SDL2_gfxPrimitives_h
+#define _SDL2_gfxPrimitives_h
+
+#include <math.h>
+#ifndef M_PI
+#define M_PI	3.1415926535897932384626433832795
+#endif
+
+#include "SDL.h"
+
+/* Set up for C function definitions, even when using C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/* ----- Versioning */
+
+#define SDL2_GFXPRIMITIVES_MAJOR	1
+#define SDL2_GFXPRIMITIVES_MINOR	0
+#define SDL2_GFXPRIMITIVES_MICRO	3
+
+
+	/* ---- Function Prototypes */
+
+#ifdef _MSC_VER
+#  if defined(DLL_EXPORT) && !defined(LIBSDL2_GFX_DLL_IMPORT)
+#    define SDL2_GFXPRIMITIVES_SCOPE __declspec(dllexport)
+#  else
+#    ifdef LIBSDL2_GFX_DLL_IMPORT
+#      define SDL2_GFXPRIMITIVES_SCOPE __declspec(dllimport)
+#    endif
+#  endif
+#endif
+#ifndef SDL2_GFXPRIMITIVES_SCOPE
+#  define SDL2_GFXPRIMITIVES_SCOPE extern
+#endif
+
+	/* Note: all ___Color routines expect the color to be in format 0xRRGGBBAA */
+
+	/* Pixel */
+
+	SDL2_GFXPRIMITIVES_SCOPE int pixelColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int pixelRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Horizontal line */
+
+	SDL2_GFXPRIMITIVES_SCOPE int hlineColor(SDL_Renderer * renderer, Sint16 x1, Sint16 x2, Sint16 y, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int hlineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 x2, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Vertical line */
+
+	SDL2_GFXPRIMITIVES_SCOPE int vlineColor(SDL_Renderer * renderer, Sint16 x, Sint16 y1, Sint16 y2, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int vlineRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y1, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Rectangle */
+
+	SDL2_GFXPRIMITIVES_SCOPE int rectangleColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int rectangleRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1,
+		Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Rounded-Corner Rectangle */
+
+	SDL2_GFXPRIMITIVES_SCOPE int roundedRectangleColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int roundedRectangleRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1,
+		Sint16 x2, Sint16 y2, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled rectangle (Box) */
+
+	SDL2_GFXPRIMITIVES_SCOPE int boxColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int boxRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2,
+		Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Rounded-Corner Filled rectangle (Box) */
+
+	SDL2_GFXPRIMITIVES_SCOPE int roundedBoxColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int roundedBoxRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2,
+		Sint16 y2, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Line */
+
+	SDL2_GFXPRIMITIVES_SCOPE int lineColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int lineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1,
+		Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA Line */
+
+	SDL2_GFXPRIMITIVES_SCOPE int aalineColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int aalineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1,
+		Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Thick Line */
+	SDL2_GFXPRIMITIVES_SCOPE int thickLineColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, 
+		Uint8 width, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int thickLineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, 
+		Uint8 width, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Circle */
+
+	SDL2_GFXPRIMITIVES_SCOPE int circleColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int circleRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Arc */
+
+	SDL2_GFXPRIMITIVES_SCOPE int arcColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int arcRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, 
+		Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA Circle */
+
+	SDL2_GFXPRIMITIVES_SCOPE int aacircleColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int aacircleRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y,
+		Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Circle -- the radius is named rad in both forms; r is the red channel in the RGBA form */
+
+	SDL2_GFXPRIMITIVES_SCOPE int filledCircleColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int filledCircleRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y,
+		Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Ellipse */
+
+	SDL2_GFXPRIMITIVES_SCOPE int ellipseColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int ellipseRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y,
+		Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA Ellipse */
+
+	SDL2_GFXPRIMITIVES_SCOPE int aaellipseColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int aaellipseRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y,
+		Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Ellipse */
+
+	SDL2_GFXPRIMITIVES_SCOPE int filledEllipseColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int filledEllipseRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y,
+		Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Pie */
+
+	SDL2_GFXPRIMITIVES_SCOPE int pieColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad,
+		Sint16 start, Sint16 end, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int pieRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad,
+		Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Pie */
+
+	SDL2_GFXPRIMITIVES_SCOPE int filledPieColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad,
+		Sint16 start, Sint16 end, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int filledPieRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad,
+		Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Trigon */
+
+	SDL2_GFXPRIMITIVES_SCOPE int trigonColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int trigonRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+		Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA-Trigon */
+
+	SDL2_GFXPRIMITIVES_SCOPE int aatrigonColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int aatrigonRGBA(SDL_Renderer * renderer,  Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+		Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Trigon */
+
+	SDL2_GFXPRIMITIVES_SCOPE int filledTrigonColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int filledTrigonRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+		Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Polygon */
+
+	SDL2_GFXPRIMITIVES_SCOPE int polygonColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int polygonRGBA(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy,
+		int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA-Polygon */
+
+	SDL2_GFXPRIMITIVES_SCOPE int aapolygonColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int aapolygonRGBA(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy,
+		int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Polygon */
+
+	SDL2_GFXPRIMITIVES_SCOPE int filledPolygonColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int filledPolygonRGBA(SDL_Renderer * renderer, const Sint16 * vx,
+		const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Textured Polygon */
+
+	SDL2_GFXPRIMITIVES_SCOPE int texturedPolygon(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, SDL_Surface * texture,int texture_dx,int texture_dy);
+
+	/* Bezier */
+
+	SDL2_GFXPRIMITIVES_SCOPE int bezierColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, int s, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int bezierRGBA(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy,
+		int n, int s, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Characters/Strings */
+
+	SDL2_GFXPRIMITIVES_SCOPE void gfxPrimitivesSetFont(const void *fontdata, Uint32 cw, Uint32 ch);
+	SDL2_GFXPRIMITIVES_SCOPE void gfxPrimitivesSetFontRotation(Uint32 rotation);
+	SDL2_GFXPRIMITIVES_SCOPE int characterColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, char c, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int characterRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, char c, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+	SDL2_GFXPRIMITIVES_SCOPE int stringColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, const char *s, Uint32 color);
+	SDL2_GFXPRIMITIVES_SCOPE int stringRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, const char *s, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Ends C function definitions when using C++ */
+#ifdef __cplusplus
+}
+#endif
+
+#endif				/* _SDL2_gfxPrimitives_h */
diff --git a/lib/sdl2_gfx/include/SDL2_gfxPrimitives_font.h b/lib/sdl2_gfx/include/SDL2_gfxPrimitives_font.h
new file mode 100644
index 0000000..e23af65
--- /dev/null
+++ b/lib/sdl2_gfx/include/SDL2_gfxPrimitives_font.h
@@ -0,0 +1,3106 @@
+/*
+
+SDL2_gfxPrimitives_font.h: 8x8 font definition
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#define GFX_FONTDATAMAX (8*256)
+
+static unsigned char gfxPrimitivesFontdata[GFX_FONTDATAMAX] = {
+
+	/*
+	* 0 0x00 '^@' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 1 0x01 '^A' 
+	*/
+	0x7e,			/* 01111110 */
+	0x81,			/* 10000001 */
+	0xa5,			/* 10100101 */
+	0x81,			/* 10000001 */
+	0xbd,			/* 10111101 */
+	0x99,			/* 10011001 */
+	0x81,			/* 10000001 */
+	0x7e,			/* 01111110 */
+
+	/*
+	* 2 0x02 '^B' 
+	*/
+	0x7e,			/* 01111110 */
+	0xff,			/* 11111111 */
+	0xdb,			/* 11011011 */
+	0xff,			/* 11111111 */
+	0xc3,			/* 11000011 */
+	0xe7,			/* 11100111 */
+	0xff,			/* 11111111 */
+	0x7e,			/* 01111110 */
+
+	/*
+	* 3 0x03 '^C' 
+	*/
+	0x6c,			/* 01101100 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0x7c,			/* 01111100 */
+	0x38,			/* 00111000 */
+	0x10,			/* 00010000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 4 0x04 '^D' 
+	*/
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+	0x7c,			/* 01111100 */
+	0xfe,			/* 11111110 */
+	0x7c,			/* 01111100 */
+	0x38,			/* 00111000 */
+	0x10,			/* 00010000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 5 0x05 '^E' 
+	*/
+	0x38,			/* 00111000 */
+	0x7c,			/* 01111100 */
+	0x38,			/* 00111000 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0xd6,			/* 11010110 */
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+
+	/*
+	* 6 0x06 '^F' 
+	*/
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+	0x7c,			/* 01111100 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0x7c,			/* 01111100 */
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+
+	/*
+	* 7 0x07 '^G' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 8 0x08 '^H' 
+	*/
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xe7,			/* 11100111 */
+	0xc3,			/* 11000011 */
+	0xc3,			/* 11000011 */
+	0xe7,			/* 11100111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 9 0x09 '^I' 
+	*/
+	0x00,			/* 00000000 */
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x42,			/* 01000010 */
+	0x42,			/* 01000010 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 10 0x0a '^J' 
+	*/
+	0xff,			/* 11111111 */
+	0xc3,			/* 11000011 */
+	0x99,			/* 10011001 */
+	0xbd,			/* 10111101 */
+	0xbd,			/* 10111101 */
+	0x99,			/* 10011001 */
+	0xc3,			/* 11000011 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 11 0x0b '^K' 
+	*/
+	0x0f,			/* 00001111 */
+	0x07,			/* 00000111 */
+	0x0f,			/* 00001111 */
+	0x7d,			/* 01111101 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x78,			/* 01111000 */
+
+	/*
+	* 12 0x0c '^L' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 13 0x0d '^M' 
+	*/
+	0x3f,			/* 00111111 */
+	0x33,			/* 00110011 */
+	0x3f,			/* 00111111 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x70,			/* 01110000 */
+	0xf0,			/* 11110000 */
+	0xe0,			/* 11100000 */
+
+	/*
+	* 14 0x0e '^N' 
+	*/
+	0x7f,			/* 01111111 */
+	0x63,			/* 01100011 */
+	0x7f,			/* 01111111 */
+	0x63,			/* 01100011 */
+	0x63,			/* 01100011 */
+	0x67,			/* 01100111 */
+	0xe6,			/* 11100110 */
+	0xc0,			/* 11000000 */
+
+	/*
+	* 15 0x0f '^O' 
+	*/
+	0x18,			/* 00011000 */
+	0xdb,			/* 11011011 */
+	0x3c,			/* 00111100 */
+	0xe7,			/* 11100111 */
+	0xe7,			/* 11100111 */
+	0x3c,			/* 00111100 */
+	0xdb,			/* 11011011 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 16 0x10 '^P' 
+	*/
+	0x80,			/* 10000000 */
+	0xe0,			/* 11100000 */
+	0xf8,			/* 11111000 */
+	0xfe,			/* 11111110 */
+	0xf8,			/* 11111000 */
+	0xe0,			/* 11100000 */
+	0x80,			/* 10000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 17 0x11 '^Q' 
+	*/
+	0x02,			/* 00000010 */
+	0x0e,			/* 00001110 */
+	0x3e,			/* 00111110 */
+	0xfe,			/* 11111110 */
+	0x3e,			/* 00111110 */
+	0x0e,			/* 00001110 */
+	0x02,			/* 00000010 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 18 0x12 '^R' 
+	*/
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 19 0x13 '^S' 
+	*/
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 20 0x14 '^T' 
+	*/
+	0x7f,			/* 01111111 */
+	0xdb,			/* 11011011 */
+	0xdb,			/* 11011011 */
+	0x7b,			/* 01111011 */
+	0x1b,			/* 00011011 */
+	0x1b,			/* 00011011 */
+	0x1b,			/* 00011011 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 21 0x15 '^U' 
+	*/
+	0x3e,			/* 00111110 */
+	0x61,			/* 01100001 */
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x86,			/* 10000110 */
+	0x7c,			/* 01111100 */
+
+	/*
+	* 22 0x16 '^V' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x7e,			/* 01111110 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 23 0x17 '^W' 
+	*/
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 24 0x18 '^X' 
+	*/
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 25 0x19 '^Y' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 26 0x1a '^Z' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0xfe,			/* 11111110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 27 0x1b '^[' 
+	*/
+	0x00,			/* 00000000 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0xfe,			/* 11111110 */
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 28 0x1c '^\' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 29 0x1d '^]' 
+	*/
+	0x00,			/* 00000000 */
+	0x24,			/* 00100100 */
+	0x66,			/* 01100110 */
+	0xff,			/* 11111111 */
+	0x66,			/* 01100110 */
+	0x24,			/* 00100100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 30 0x1e '^^' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 31 0x1f '^_' 
+	*/
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0x7e,			/* 01111110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 32 0x20 ' ' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 33 0x21 '!' 
+	*/
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 34 0x22 '"' 
+	*/
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x24,			/* 00100100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 35 0x23 '#' 
+	*/
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 36 0x24 '$' 
+	*/
+	0x18,			/* 00011000 */
+	0x3e,			/* 00111110 */
+	0x60,			/* 01100000 */
+	0x3c,			/* 00111100 */
+	0x06,			/* 00000110 */
+	0x7c,			/* 01111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 37 0x25 '%' 
+	*/
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xcc,			/* 11001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x66,			/* 01100110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 38 0x26 '&' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 39 0x27 ''' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 40 0x28 '(' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 41 0x29 ')' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 42 0x2a '*' 
+	*/
+	0x00,			/* 00000000 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0xff,			/* 11111111 */
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 43 0x2b '+' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 44 0x2c ',' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+
+	/*
+	* 45 0x2d '-' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 46 0x2e '.' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 47 0x2f '/' 
+	*/
+	0x06,			/* 00000110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0xc0,			/* 11000000 */
+	0x80,			/* 10000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 48 0x30 '0' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xd6,			/* 11010110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 49 0x31 '1' 
+	*/
+	0x18,			/* 00011000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 50 0x32 '2' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0x06,			/* 00000110 */
+	0x1c,			/* 00011100 */
+	0x30,			/* 00110000 */
+	0x66,			/* 01100110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 51 0x33 '3' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0x06,			/* 00000110 */
+	0x3c,			/* 00111100 */
+	0x06,			/* 00000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 52 0x34 '4' 
+	*/
+	0x1c,			/* 00011100 */
+	0x3c,			/* 00111100 */
+	0x6c,			/* 01101100 */
+	0xcc,			/* 11001100 */
+	0xfe,			/* 11111110 */
+	0x0c,			/* 00001100 */
+	0x1e,			/* 00011110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 53 0x35 '5' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xfc,			/* 11111100 */
+	0x06,			/* 00000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 54 0x36 '6' 
+	*/
+	0x38,			/* 00111000 */
+	0x60,			/* 01100000 */
+	0xc0,			/* 11000000 */
+	0xfc,			/* 11111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 55 0x37 '7' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 56 0x38 '8' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 57 0x39 '9' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7e,			/* 01111110 */
+	0x06,			/* 00000110 */
+	0x0c,			/* 00001100 */
+	0x78,			/* 01111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 58 0x3a ':' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 59 0x3b ';' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+
+	/*
+	* 60 0x3c '<' 
+	*/
+	0x06,			/* 00000110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x06,			/* 00000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 61 0x3d '=' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 62 0x3e '>' 
+	*/
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 63 0x3f '?' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 64 0x40 '@' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xde,			/* 11011110 */
+	0xde,			/* 11011110 */
+	0xde,			/* 11011110 */
+	0xc0,			/* 11000000 */
+	0x78,			/* 01111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 65 0x41 'A' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 66 0x42 'B' 
+	*/
+	0xfc,			/* 11111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0xfc,			/* 11111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 67 0x43 'C' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 68 0x44 'D' 
+	*/
+	0xf8,			/* 11111000 */
+	0x6c,			/* 01101100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x6c,			/* 01101100 */
+	0xf8,			/* 11111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 69 0x45 'E' 
+	*/
+	0xfe,			/* 11111110 */
+	0x62,			/* 01100010 */
+	0x68,			/* 01101000 */
+	0x78,			/* 01111000 */
+	0x68,			/* 01101000 */
+	0x62,			/* 01100010 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 70 0x46 'F' 
+	*/
+	0xfe,			/* 11111110 */
+	0x62,			/* 01100010 */
+	0x68,			/* 01101000 */
+	0x78,			/* 01111000 */
+	0x68,			/* 01101000 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 71 0x47 'G' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xce,			/* 11001110 */
+	0x66,			/* 01100110 */
+	0x3a,			/* 00111010 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 72 0x48 'H' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 73 0x49 'I' 
+	*/
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 74 0x4a 'J' 
+	*/
+	0x1e,			/* 00011110 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x78,			/* 01111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 75 0x4b 'K' 
+	*/
+	0xe6,			/* 11100110 */
+	0x66,			/* 01100110 */
+	0x6c,			/* 01101100 */
+	0x78,			/* 01111000 */
+	0x6c,			/* 01101100 */
+	0x66,			/* 01100110 */
+	0xe6,			/* 11100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 76 0x4c 'L' 
+	*/
+	0xf0,			/* 11110000 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0x62,			/* 01100010 */
+	0x66,			/* 01100110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 77 0x4d 'M' 
+	*/
+	0xc6,			/* 11000110 */
+	0xee,			/* 11101110 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0xd6,			/* 11010110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 78 0x4e 'N' 
+	*/
+	0xc6,			/* 11000110 */
+	0xe6,			/* 11100110 */
+	0xf6,			/* 11110110 */
+	0xde,			/* 11011110 */
+	0xce,			/* 11001110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 79 0x4f 'O' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 80 0x50 'P' 
+	*/
+	0xfc,			/* 11111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 81 0x51 'Q' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xce,			/* 11001110 */
+	0x7c,			/* 01111100 */
+	0x0e,			/* 00001110 */
+
+	/*
+	* 82 0x52 'R' 
+	*/
+	0xfc,			/* 11111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0x6c,			/* 01101100 */
+	0x66,			/* 01100110 */
+	0xe6,			/* 11100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 83 0x53 'S' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 84 0x54 'T' 
+	*/
+	0x7e,			/* 01111110 */
+	0x7e,			/* 01111110 */
+	0x5a,			/* 01011010 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 85 0x55 'U' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 86 0x56 'V' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 87 0x57 'W' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xd6,			/* 11010110 */
+	0xd6,			/* 11010110 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 88 0x58 'X' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 89 0x59 'Y' 
+	*/
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 90 0x5a 'Z' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0x8c,			/* 10001100 */
+	0x18,			/* 00011000 */
+	0x32,			/* 00110010 */
+	0x66,			/* 01100110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 91 0x5b '[' 
+	*/
+	0x3c,			/* 00111100 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 92 0x5c '\' 
+	*/
+	0xc0,			/* 11000000 */
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x06,			/* 00000110 */
+	0x02,			/* 00000010 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 93 0x5d ']' 
+	*/
+	0x3c,			/* 00111100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 94 0x5e '^' 
+	*/
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 95 0x5f '_' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 96 0x60 '`' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 97 0x61 'a' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 98 0x62 'b' 
+	*/
+	0xe0,			/* 11100000 */
+	0x60,			/* 01100000 */
+	0x7c,			/* 01111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 99 0x63 'c' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc0,			/* 11000000 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 100 0x64 'd' 
+	*/
+	0x1c,			/* 00011100 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 101 0x65 'e' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 102 0x66 'f' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x60,			/* 01100000 */
+	0xf8,			/* 11111000 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 103 0x67 'g' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x7c,			/* 01111100 */
+	0x0c,			/* 00001100 */
+	0xf8,			/* 11111000 */
+
+	/*
+	* 104 0x68 'h' 
+	*/
+	0xe0,			/* 11100000 */
+	0x60,			/* 01100000 */
+	0x6c,			/* 01101100 */
+	0x76,			/* 01110110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0xe6,			/* 11100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 105 0x69 'i' 
+	*/
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 106 0x6a 'j' 
+	*/
+	0x06,			/* 00000110 */
+	0x00,			/* 00000000 */
+	0x06,			/* 00000110 */
+	0x06,			/* 00000110 */
+	0x06,			/* 00000110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+
+	/*
+	* 107 0x6b 'k' 
+	*/
+	0xe0,			/* 11100000 */
+	0x60,			/* 01100000 */
+	0x66,			/* 01100110 */
+	0x6c,			/* 01101100 */
+	0x78,			/* 01111000 */
+	0x6c,			/* 01101100 */
+	0xe6,			/* 11100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 108 0x6c 'l' 
+	*/
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 109 0x6d 'm' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xec,			/* 11101100 */
+	0xfe,			/* 11111110 */
+	0xd6,			/* 11010110 */
+	0xd6,			/* 11010110 */
+	0xd6,			/* 11010110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 110 0x6e 'n' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xdc,			/* 11011100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 111 0x6f 'o' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 112 0x70 'p' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xdc,			/* 11011100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+
+	/*
+	* 113 0x71 'q' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x7c,			/* 01111100 */
+	0x0c,			/* 00001100 */
+	0x1e,			/* 00011110 */
+
+	/*
+	* 114 0x72 'r' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xdc,			/* 11011100 */
+	0x76,			/* 01110110 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 115 0x73 's' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x06,			/* 00000110 */
+	0xfc,			/* 11111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 116 0x74 't' 
+	*/
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0xfc,			/* 11111100 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x36,			/* 00110110 */
+	0x1c,			/* 00011100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 117 0x75 'u' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 118 0x76 'v' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 119 0x77 'w' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xd6,			/* 11010110 */
+	0xd6,			/* 11010110 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 120 0x78 'x' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 121 0x79 'y' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7e,			/* 01111110 */
+	0x06,			/* 00000110 */
+	0xfc,			/* 11111100 */
+
+	/*
+	* 122 0x7a 'z' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x4c,			/* 01001100 */
+	0x18,			/* 00011000 */
+	0x32,			/* 00110010 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 123 0x7b '{' 
+	*/
+	0x0e,			/* 00001110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x70,			/* 01110000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x0e,			/* 00001110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 124 0x7c '|' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 125 0x7d '}' 
+	*/
+	0x70,			/* 01110000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x0e,			/* 00001110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x70,			/* 01110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 126 0x7e '~' 
+	*/
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 127 0x7f '' 
+	*/
+	0x00,			/* 00000000 */
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 128 0x80 '�' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x0c,			/* 00001100 */
+	0x78,			/* 01111000 */
+
+	/*
+	* 129 0x81 '�' 
+	*/
+	0xcc,			/* 11001100 */
+	0x00,			/* 00000000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 130 0x82 '�' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 131 0x83 '�' 
+	*/
+	0x7c,			/* 01111100 */
+	0x82,			/* 10000010 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 132 0x84 '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 133 0x85 '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 134 0x86 '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 135 0x87 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x7e,			/* 01111110 */
+	0x0c,			/* 00001100 */
+	0x38,			/* 00111000 */
+
+	/*
+	* 136 0x88 '�' 
+	*/
+	0x7c,			/* 01111100 */
+	0x82,			/* 10000010 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 137 0x89 '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 138 0x8a '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 139 0x8b '�' 
+	*/
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 140 0x8c '�' 
+	*/
+	0x7c,			/* 01111100 */
+	0x82,			/* 10000010 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 141 0x8d '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 142 0x8e '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 143 0x8f '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 144 0x90 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0xf8,			/* 11111000 */
+	0xc0,			/* 11000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 145 0x91 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0xd8,			/* 11011000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 146 0x92 '�' 
+	*/
+	0x3e,			/* 00111110 */
+	0x6c,			/* 01101100 */
+	0xcc,			/* 11001100 */
+	0xfe,			/* 11111110 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xce,			/* 11001110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 147 0x93 '�' 
+	*/
+	0x7c,			/* 01111100 */
+	0x82,			/* 10000010 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 148 0x94 '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 149 0x95 '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 150 0x96 '�' 
+	*/
+	0x78,			/* 01111000 */
+	0x84,			/* 10000100 */
+	0x00,			/* 00000000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 151 0x97 '�' 
+	*/
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 152 0x98 '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7e,			/* 01111110 */
+	0x06,			/* 00000110 */
+	0xfc,			/* 11111100 */
+
+	/*
+	* 153 0x99 '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 154 0x9a '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 155 0x9b '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 156 0x9c '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x64,			/* 01100100 */
+	0xf0,			/* 11110000 */
+	0x60,			/* 01100000 */
+	0x66,			/* 01100110 */
+	0xfc,			/* 11111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 157 0x9d '�' 
+	*/
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 158 0x9e '�' 
+	*/
+	0xf8,			/* 11111000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xfa,			/* 11111010 */
+	0xc6,			/* 11000110 */
+	0xcf,			/* 11001111 */
+	0xc6,			/* 11000110 */
+	0xc7,			/* 11000111 */
+
+	/*
+	* 159 0x9f '�' 
+	*/
+	0x0e,			/* 00001110 */
+	0x1b,			/* 00011011 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0xd8,			/* 11011000 */
+	0x70,			/* 01110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 160 0xa0 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 161 0xa1 '�' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 162 0xa2 '�' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 163 0xa3 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 164 0xa4 '�' 
+	*/
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0xdc,			/* 11011100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 165 0xa5 '�' 
+	*/
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0xe6,			/* 11100110 */
+	0xf6,			/* 11110110 */
+	0xde,			/* 11011110 */
+	0xce,			/* 11001110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 166 0xa6 '�' 
+	*/
+	0x3c,			/* 00111100 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x3e,			/* 00111110 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 167 0xa7 '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 168 0xa8 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x63,			/* 01100011 */
+	0x3e,			/* 00111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 169 0xa9 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 170 0xaa '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x06,			/* 00000110 */
+	0x06,			/* 00000110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 171 0xab '�' 
+	*/
+	0x63,			/* 01100011 */
+	0xe6,			/* 11100110 */
+	0x6c,			/* 01101100 */
+	0x7e,			/* 01111110 */
+	0x33,			/* 00110011 */
+	0x66,			/* 01100110 */
+	0xcc,			/* 11001100 */
+	0x0f,			/* 00001111 */
+
+	/*
+	* 172 0xac '�' 
+	*/
+	0x63,			/* 01100011 */
+	0xe6,			/* 11100110 */
+	0x6c,			/* 01101100 */
+	0x7a,			/* 01111010 */
+	0x36,			/* 00110110 */
+	0x6a,			/* 01101010 */
+	0xdf,			/* 11011111 */
+	0x06,			/* 00000110 */
+
+	/*
+	* 173 0xad '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 174 0xae '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x33,			/* 00110011 */
+	0x66,			/* 01100110 */
+	0xcc,			/* 11001100 */
+	0x66,			/* 01100110 */
+	0x33,			/* 00110011 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 175 0xaf '�' 
+	*/
+	0x00,			/* 00000000 */
+	0xcc,			/* 11001100 */
+	0x66,			/* 01100110 */
+	0x33,			/* 00110011 */
+	0x66,			/* 01100110 */
+	0xcc,			/* 11001100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 176 0xb0 '�' 
+	*/
+	0x22,			/* 00100010 */
+	0x88,			/* 10001000 */
+	0x22,			/* 00100010 */
+	0x88,			/* 10001000 */
+	0x22,			/* 00100010 */
+	0x88,			/* 10001000 */
+	0x22,			/* 00100010 */
+	0x88,			/* 10001000 */
+
+	/*
+	* 177 0xb1 '�' 
+	*/
+	0x55,			/* 01010101 */
+	0xaa,			/* 10101010 */
+	0x55,			/* 01010101 */
+	0xaa,			/* 10101010 */
+	0x55,			/* 01010101 */
+	0xaa,			/* 10101010 */
+	0x55,			/* 01010101 */
+	0xaa,			/* 10101010 */
+
+	/*
+	* 178 0xb2 '�' 
+	*/
+	0x77,			/* 01110111 */
+	0xdd,			/* 11011101 */
+	0x77,			/* 01110111 */
+	0xdd,			/* 11011101 */
+	0x77,			/* 01110111 */
+	0xdd,			/* 11011101 */
+	0x77,			/* 01110111 */
+	0xdd,			/* 11011101 */
+
+	/*
+	* 179 0xb3 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 180 0xb4 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 181 0xb5 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 182 0xb6 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf6,			/* 11110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 183 0xb7 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 184 0xb8 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 185 0xb9 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf6,			/* 11110110 */
+	0x06,			/* 00000110 */
+	0xf6,			/* 11110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 186 0xba '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 187 0xbb '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x06,			/* 00000110 */
+	0xf6,			/* 11110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 188 0xbc '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf6,			/* 11110110 */
+	0x06,			/* 00000110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 189 0xbd '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 190 0xbe '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 191 0xbf '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 192 0xc0 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 193 0xc1 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 194 0xc2 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 195 0xc3 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 196 0xc4 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 197 0xc5 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 198 0xc6 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 199 0xc7 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x37,			/* 00110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 200 0xc8 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x37,			/* 00110111 */
+	0x30,			/* 00110000 */
+	0x3f,			/* 00111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 201 0xc9 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x3f,			/* 00111111 */
+	0x30,			/* 00110000 */
+	0x37,			/* 00110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 202 0xca '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf7,			/* 11110111 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 203 0xcb '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0xf7,			/* 11110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 204 0xcc '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x37,			/* 00110111 */
+	0x30,			/* 00110000 */
+	0x37,			/* 00110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 205 0xcd '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 206 0xce '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf7,			/* 11110111 */
+	0x00,			/* 00000000 */
+	0xf7,			/* 11110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 207 0xcf '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 208 0xd0 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 209 0xd1 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 210 0xd2 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 211 0xd3 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x3f,			/* 00111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 212 0xd4 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 213 0xd5 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 214 0xd6 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x3f,			/* 00111111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 215 0xd7 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xff,			/* 11111111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 216 0xd8 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 217 0xd9 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 218 0xda '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 219 0xdb '�' 
+	*/
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 220 0xdc '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 221 0xdd '�' 
+	*/
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+
+	/*
+	* 222 0xde '�' 
+	*/
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+
+	/*
+	* 223 0xdf '�' 
+	*/
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 224 0xe0 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0xc8,			/* 11001000 */
+	0xdc,			/* 11011100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 225 0xe1 '�' 
+	*/
+	0x78,			/* 01111000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xd8,			/* 11011000 */
+	0xcc,			/* 11001100 */
+	0xc6,			/* 11000110 */
+	0xcc,			/* 11001100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 226 0xe2 '�' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 227 0xe3 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 228 0xe4 '�' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 229 0xe5 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0xd8,			/* 11011000 */
+	0xd8,			/* 11011000 */
+	0xd8,			/* 11011000 */
+	0x70,			/* 01110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 230 0xe6 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0xc0,			/* 11000000 */
+
+	/*
+	* 231 0xe7 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 232 0xe8 '�' 
+	*/
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+
+	/*
+	* 233 0xe9 '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 234 0xea '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0xee,			/* 11101110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 235 0xeb '�' 
+	*/
+	0x0e,			/* 00001110 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x3e,			/* 00111110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 236 0xec '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0xdb,			/* 11011011 */
+	0xdb,			/* 11011011 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 237 0xed '�' 
+	*/
+	0x06,			/* 00000110 */
+	0x0c,			/* 00001100 */
+	0x7e,			/* 01111110 */
+	0xdb,			/* 11011011 */
+	0xdb,			/* 11011011 */
+	0x7e,			/* 01111110 */
+	0x60,			/* 01100000 */
+	0xc0,			/* 11000000 */
+
+	/*
+	* 238 0xee '�' 
+	*/
+	0x1e,			/* 00011110 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0x7e,			/* 01111110 */
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x1e,			/* 00011110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 239 0xef '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 240 0xf0 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 241 0xf1 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 242 0xf2 '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 243 0xf3 '�' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 244 0xf4 '�' 
+	*/
+	0x0e,			/* 00001110 */
+	0x1b,			/* 00011011 */
+	0x1b,			/* 00011011 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 245 0xf5 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xd8,			/* 11011000 */
+	0xd8,			/* 11011000 */
+	0x70,			/* 01110000 */
+
+	/*
+	* 246 0xf6 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 247 0xf7 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 248 0xf8 '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 249 0xf9 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 250 0xfa '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 251 0xfb '�' 
+	*/
+	0x0f,			/* 00001111 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0xec,			/* 11101100 */
+	0x6c,			/* 01101100 */
+	0x3c,			/* 00111100 */
+	0x1c,			/* 00011100 */
+
+	/*
+	* 252 0xfc '�' 
+	*/
+	0x6c,			/* 01101100 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 253 0xfd '�' 
+	*/
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 254 0xfe '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 255 0xff ' ' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+};
diff --git a/lib/sdl2_gfx/include/SDL2_imageFilter.h b/lib/sdl2_gfx/include/SDL2_imageFilter.h
new file mode 100644
index 0000000..8ea6404
--- /dev/null
+++ b/lib/sdl2_gfx/include/SDL2_imageFilter.h
@@ -0,0 +1,166 @@
+/*
+
+SDL2_imageFilter.h: byte-image "filter" routines 
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifndef _SDL2_imageFilter_h
+#define _SDL2_imageFilter_h
+
+/* Set up for C function definitions, even when using C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/* ---- Function Prototypes */
+
+#ifdef _MSC_VER
+#  if defined(DLL_EXPORT) && !defined(LIBSDL2_GFX_DLL_IMPORT)
+#    define SDL2_IMAGEFILTER_SCOPE __declspec(dllexport)
+#  else
+#    ifdef LIBSDL2_GFX_DLL_IMPORT
+#      define SDL2_IMAGEFILTER_SCOPE __declspec(dllimport)
+#    endif
+#  endif
+#endif
+#ifndef SDL2_IMAGEFILTER_SCOPE
+#  define SDL2_IMAGEFILTER_SCOPE extern
+#endif
+
+	/* Comments:                                                                           */
+	/*  1.) MMX functions work best if all data blocks are aligned on a 32 bytes boundary. */
+	/*  2.) Data that is not within an 8 byte boundary is processed using the C routine.   */
+	/*  3.) Convolution routines do not have C routines at this time.                      */
+
+	// Detect MMX capability in CPU
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterMMXdetect(void);
+
+	// Force use of MMX off (or turn possible use back on)
+	SDL2_IMAGEFILTER_SCOPE void SDL_imageFilterMMXoff(void);
+	SDL2_IMAGEFILTER_SCOPE void SDL_imageFilterMMXon(void);
+
+	//
+	// All routines return:
+	//   0   OK
+	//  -1   Error (internal error, parameter error)
+	//
+
+	//  SDL_imageFilterAdd: D = saturation255(S1 + S2)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterMean: D = S1/2 + S2/2
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterSub: D = saturation0(S1 - S2)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterAbsDiff: D = | S1 - S2 |
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterMult: D = saturation(S1 * S2)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterMultNor: D = S1 * S2   (non-MMX)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterMultDivby2: D = saturation255(S1/2 * S2)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest,
+		unsigned int length);
+
+	//  SDL_imageFilterMultDivby4: D = saturation255(S1/2 * S2/2)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest,
+		unsigned int length);
+
+	//  SDL_imageFilterBitAnd: D = S1 & S2
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterBitOr: D = S1 | S2
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterDiv: D = S1 / S2   (non-MMX)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterBitNegation: D = ~S (bitwise negation)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterAddByte: D = saturation255(S + C)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C);
+
+	//  SDL_imageFilterAddUint: D = saturation255(S + (uint)C)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C);
+
+	//  SDL_imageFilterAddByteToHalf: D = saturation255(S/2 + C)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char C);
+
+	//  SDL_imageFilterSubByte: D = saturation0(S - C)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C);
+
+	//  SDL_imageFilterSubUint: D = saturation0(S - (uint)C)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C);
+
+	//  SDL_imageFilterShiftRight: D = saturation0(S >> N)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N);
+
+	//  SDL_imageFilterShiftRightUint: D = saturation0((uint)S >> N)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N);
+
+	//  SDL_imageFilterMultByByte: D = saturation255(S * C)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C);
+
+	//  SDL_imageFilterShiftRightAndMultByByte: D = saturation255((S >> N) * C)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char N, unsigned char C);
+
+	//  SDL_imageFilterShiftLeftByte: D = (S << N)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char N);
+
+	//  SDL_imageFilterShiftLeftUint: D = ((uint)S << N)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char N);
+
+	//  SDL_imageFilterShiftLeft: D = saturation255(S << N)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N);
+
+	//  SDL_imageFilterBinarizeUsingThreshold: D = S >= T ? 255:0
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char T);
+
+	//  SDL_imageFilterClipToRange: D = (S >= Tmin) & (S <= Tmax) ? S : (S < Tmin ? Tmin : Tmax)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char Tmin, unsigned char Tmax);
+
+	//  SDL_imageFilterNormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)
+	SDL2_IMAGEFILTER_SCOPE int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin,
+		int Cmax, int Nmin, int Nmax);
+
+	/* Ends C function definitions when using C++ */
+#ifdef __cplusplus
+}
+#endif
+
+#endif				/* _SDL2_imageFilter_h */
diff --git a/lib/sdl2_gfx/include/SDL2_rotozoom.h b/lib/sdl2_gfx/include/SDL2_rotozoom.h
new file mode 100644
index 0000000..d9fa577
--- /dev/null
+++ b/lib/sdl2_gfx/include/SDL2_rotozoom.h
@@ -0,0 +1,123 @@
+/*  
+
+SDL2_rotozoom.h: rotozoomer, zoomer and shrinker for 32bit or 8bit surfaces
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifndef _SDL2_rotozoom_h
+#define _SDL2_rotozoom_h
+
+#include <math.h>
+
+/* Set up for C function definitions, even when using C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef M_PI
+#define M_PI	3.1415926535897932384626433832795
+#endif
+
+#include "SDL.h"
+
+	/* ---- Defines */
+
+	/*!
+	\brief Disable anti-aliasing (no smoothing).
+	*/
+#define SMOOTHING_OFF		0
+
+	/*!
+	\brief Enable anti-aliasing (smoothing).
+	*/
+#define SMOOTHING_ON		1
+
+	/* ---- Function Prototypes */
+
+#ifdef _MSC_VER
+#  if defined(DLL_EXPORT) && !defined(LIBSDL2_GFX_DLL_IMPORT)
+#    define SDL2_ROTOZOOM_SCOPE __declspec(dllexport)
+#  else
+#    ifdef LIBSDL2_GFX_DLL_IMPORT
+#      define SDL2_ROTOZOOM_SCOPE __declspec(dllimport)
+#    endif
+#  endif
+#endif
+#ifndef SDL2_ROTOZOOM_SCOPE
+#  define SDL2_ROTOZOOM_SCOPE extern
+#endif
+
+	/* 
+
+	Rotozoom functions
+
+	*/
+
+	SDL2_ROTOZOOM_SCOPE SDL_Surface *rotozoomSurface(SDL_Surface * src, double angle, double zoom, int smooth);
+
+	SDL2_ROTOZOOM_SCOPE SDL_Surface *rotozoomSurfaceXY
+		(SDL_Surface * src, double angle, double zoomx, double zoomy, int smooth);
+
+
+	SDL2_ROTOZOOM_SCOPE void rotozoomSurfaceSize(int width, int height, double angle, double zoom, int *dstwidth,
+		int *dstheight);
+
+	SDL2_ROTOZOOM_SCOPE void rotozoomSurfaceSizeXY
+		(int width, int height, double angle, double zoomx, double zoomy, 
+		int *dstwidth, int *dstheight);
+
+	/* 
+
+	Zooming functions
+
+	*/
+
+	SDL2_ROTOZOOM_SCOPE SDL_Surface *zoomSurface(SDL_Surface * src, double zoomx, double zoomy, int smooth);
+
+	SDL2_ROTOZOOM_SCOPE void zoomSurfaceSize(int width, int height, double zoomx, double zoomy, int *dstwidth, int *dstheight);
+
+	/* 
+
+	Shrinking functions
+
+	*/     
+
+	SDL2_ROTOZOOM_SCOPE SDL_Surface *shrinkSurface(SDL_Surface * src, int factorx, int factory);
+
+	/* 
+
+	Specialized rotation functions
+
+	*/
+
+	SDL2_ROTOZOOM_SCOPE SDL_Surface* rotateSurface90Degrees(SDL_Surface* src, int numClockwiseTurns);
+
+	/* Ends C function definitions when using C++ */
+#ifdef __cplusplus
+}
+#endif
+
+#endif				/* _SDL2_rotozoom_h */
diff --git a/lib/sdl2_gfx/src/SDL2_framerate.c b/lib/sdl2_gfx/src/SDL2_framerate.c
new file mode 100644
index 0000000..ff912e3
--- /dev/null
+++ b/lib/sdl2_gfx/src/SDL2_framerate.c
@@ -0,0 +1,189 @@
+/*
+
+SDL2_framerate.c: framerate manager
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#include "SDL2_framerate.h"
+
+/*!
+\brief Internal wrapper around SDL_GetTicks() that never returns zero.
+
+\return The value of SDL_GetTicks(), or 1 when SDL_GetTicks() returned 0.
+*/
+Uint32 _getTicks()
+{
+	Uint32 ticks = SDL_GetTicks();
+
+	/*
+	* SDL_framerateDelay() treats baseticks == 0 as "manager not yet
+	* initialized", so a tick value stored into baseticks must be > 0.
+	* SDL_GetTicks() may still return 0 if it has not incremented yet,
+	* in which case 1 is substituted.
+	*/
+	if (ticks == 0) {
+		return 1;
+	} else {
+		return ticks;
+	}
+}
+
+/*!
+\brief Initialize the framerate manager.
+
+Resets the frame counter, sets the rate to FPS_DEFAULT (documented as 30Hz) together with the matching
+per-frame interval, and restarts delay interpolation from the current tick count.
+
+\param manager Pointer to the framerate manager. Must not be NULL: it is dereferenced without a check.
+*/
+void SDL_initFramerate(FPSmanager * manager)
+{
+	/*
+	* Defaults: no frames counted yet, FPS_DEFAULT rate, 1000/FPS_DEFAULT ticks per frame, timing base and last-call stamp = now (never 0)
+	*/
+	manager->framecount = 0;
+	manager->rate = FPS_DEFAULT;
+	manager->rateticks = (1000.0f / (float) FPS_DEFAULT);
+	manager->baseticks = _getTicks();
+	manager->lastticks = manager->baseticks;
+
+}
+
+/*!
+\brief Set the framerate in Hz.
+
+Sets a new framerate for the manager and resets the frame counter used for delay interpolation (baseticks is left unchanged here).
+Rate values must be between FPS_LOWER_LIMIT and FPS_UPPER_LIMIT inclusive to be accepted; otherwise the manager is not modified.
+
+\param manager Pointer to the framerate manager. Must not be NULL when rate is in range: it is dereferenced without a check.
+\param rate The new framerate in Hz (frames per second).
+
+\return 0 for success and -1 if rate is outside the accepted range.
+*/
+int SDL_setFramerate(FPSmanager * manager, Uint32 rate)
+{
+	if ((rate >= FPS_LOWER_LIMIT) && (rate <= FPS_UPPER_LIMIT)) {
+		manager->framecount = 0;
+		manager->rate = rate;
+		manager->rateticks = (1000.0f / (float) rate);
+		return (0);
+	} else {
+		return (-1);
+	}
+}
+
+/*!
+\brief Return the current target framerate in Hz.
+
+Get the currently set framerate of the manager (its rate field, converted to int).
+
+\param manager Pointer to the framerate manager. May be NULL.
+
+\return Current framerate in Hz, or -1 if manager is NULL.
+*/
+int SDL_getFramerate(FPSmanager * manager)
+{
+	if (manager == NULL) {
+		return (-1);
+	} else {
+		return ((int)manager->rate);
+	}
+}
+
+/*!
+\brief Return the current framecount.
+
+Get the current framecount from the framerate manager. A frame is counted each time SDL_framerateDelay is called.
+The counter restarts at 0 in SDL_initFramerate, on a successful SDL_setFramerate, and in SDL_framerateDelay when drawing falls behind.
+
+\param manager Pointer to the framerate manager. May be NULL.
+
+\return Current frame count (converted to int), or -1 if manager is NULL.
+*/
+int SDL_getFramecount(FPSmanager * manager)
+{
+	if (manager == NULL) {
+		return (-1);
+	} else {
+		return ((int)manager->framecount);
+	}
+}
+
+/*!
+\brief Delay execution to maintain a constant framerate and report the time since the previous call.
+
+Generate a delay to accommodate the currently set framerate. Call once in the
+graphics/rendering loop. If the computer cannot keep up with the rate (i.e.
+drawing too slow), no delay is made and the delay interpolation is reset.
+
+\param manager Pointer to the framerate manager. If NULL, the function returns 0 without delaying.
+
+\return The time that passed since the last call to the function in ms. May return 0.
+*/
+Uint32 SDL_framerateDelay(FPSmanager * manager)
+{
+	Uint32 current_ticks;
+	Uint32 target_ticks;
+	Uint32 the_delay;
+	Uint32 time_passed = 0;
+
+	/*
+	* No manager, no delay
+	*/
+	if (manager == NULL) {
+		return 0;
+	}
+
+	/*
+	* baseticks == 0 marks a manager that was never initialized: set it up with the defaults
+	*/
+	if (manager->baseticks == 0) {
+		SDL_initFramerate(manager);
+	}
+
+	/*
+	* Count this call as the next frame
+	*/
+	manager->framecount++;
+
+	/*
+	* time_passed = ticks since the previous call; target_ticks = tick at which frame number framecount is due, counted from baseticks
+	*/
+	current_ticks = _getTicks();
+	time_passed = current_ticks - manager->lastticks;
+	manager->lastticks = current_ticks;
+	target_ticks = manager->baseticks + (Uint32) ((float) manager->framecount * manager->rateticks);
+
+	if (current_ticks <= target_ticks) {	/* on time or early: wait until the frame is due */
+		the_delay = target_ticks - current_ticks;
+		SDL_Delay(the_delay);
+	} else {	/* late: skip the delay and restart interpolation from the current tick */
+		manager->framecount = 0;
+		manager->baseticks = _getTicks();
+	}
+
+	return time_passed;
+}
diff --git a/lib/sdl2_gfx/src/SDL2_gfxPrimitives.c b/lib/sdl2_gfx/src/SDL2_gfxPrimitives.c
new file mode 100644
index 0000000..4b57c2c
--- /dev/null
+++ b/lib/sdl2_gfx/src/SDL2_gfxPrimitives.c
@@ -0,0 +1,3790 @@
+/* 
+
+SDL2_gfxPrimitives.c: graphics primitives for SDL2 renderers
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+
+#include "SDL2_gfxPrimitives.h"
+#include "SDL2_rotozoom.h"
+#include "SDL2_gfxPrimitives_font.h"
+
+/* ---- Structures */
+
+/*!
+\brief State structure passed to the internal Bresenham iterator. NOTE(review): the field notes below are inferred from the names only; confirm against the iterator functions.
+*/
+typedef struct {
+	Sint16 x, y;	/* presumably the current point */
+	int dx, dy, s1, s2, swapdir, error;	/* presumably deltas, per-axis step signs, axis-swap flag and error term */
+	Uint32 count;	/* presumably the number of steps left */
+} SDL2_gfxBresenhamIterator;
+
+/*!
+\brief State structure passed to the internal Murphy iterator. NOTE(review): uncommented field roles are inferred from the names only; confirm against the iterator functions.
+*/
+typedef struct {
+	SDL_Renderer *renderer;	/* presumably the renderer the iterator draws on */
+	int u, v;		/* delta x , delta y */
+	int ku, kt, kv, kd;	/* loop constants */
+	int oct2;	/* presumably an octant selector -- TODO confirm */
+	int quad4;	/* presumably a quadrant selector -- TODO confirm */
+	Sint16 last1x, last1y, last2x, last2y, first1x, first1y, first2x, first2y, tempx, tempy;	/* saved point coordinates; exact use not visible here */
+} SDL2_gfxMurphyIterator;
+
+/* ---- Pixel */
+
+/*!
+\brief Draw a pixel in the renderer's currently set draw color; color and blend mode are not modified.
+
+\param renderer The renderer to draw on.
+\param x X (horizontal) coordinate of the pixel.
+\param y Y (vertical) coordinate of the pixel.
+
+\returns The result of SDL_RenderDrawPoint(): 0 on success, a negative error code on failure.
+*/
+int pixel(SDL_Renderer *renderer, Sint16 x, Sint16 y)
+{
+	return SDL_RenderDrawPoint(renderer, x, y);
+}
+
+/*!
+\brief Draw pixel with blending enabled if a<255; the color is passed as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x X (horizontal) coordinate of the pixel.
+\param y Y (vertical) coordinate of the pixel.
+\param color The packed color, read byte-wise from memory (byte 0 = red, 1 = green, 2 = blue, 3 = alpha): as a literal this is 0xRRGGBBAA only on big-endian hosts and 0xAABBGGRR on little-endian hosts.
+
+\returns The result of pixelRGBA(): 0 on success, non-zero on failure.
+*/
+int pixelColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Uint32 color)
+{
+	Uint8 *c = (Uint8 *)&color; 
+	return pixelRGBA(renderer, x, y, c[0], c[1], c[2], c[3]);
+}
+
+/*!
+\brief Draw pixel with blending enabled if a<255. Changes the renderer's draw blend mode and draw color as a side effect.
+
+\param renderer The renderer to draw on.
+\param x X (horizontal) coordinate of the pixel.
+\param y Y (vertical) coordinate of the pixel.
+\param r The red color value of the pixel to draw. 
+\param g The green color value of the pixel to draw.
+\param b The blue color value of the pixel to draw.
+\param a The alpha value of the pixel to draw; 255 selects SDL_BLENDMODE_NONE, anything else SDL_BLENDMODE_BLEND.
+
+\returns Returns 0 on success; on failure the non-zero bitwise OR of the three SDL return codes.
+*/
+int pixelRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	int result = 0;
+	result |= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result |= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+	result |= SDL_RenderDrawPoint(renderer, x, y);
+	return result;
+}
+
+/*!
+\brief Draw pixel with blending enabled and using alpha weight on color.
+
+\param renderer The renderer to draw on.
+\param x The horizontal coordinate of the pixel.
+\param y The vertical position of the pixel.
+\param r The red color value of the pixel to draw. 
+\param g The green color value of the pixel to draw.
+\param b The blue color value of the pixel to draw.
+\param a The alpha value of the pixel to draw, before weighting.
+\param weight Alpha scale factor in 1/256 units: the effective alpha is (a * weight) >> 8, clamped to 255 (256 leaves a unchanged).
+
+\returns The result of pixelRGBA(): 0 on success, non-zero on failure.
+*/
+int pixelRGBAWeight(SDL_Renderer * renderer, Sint16 x, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a, Uint32 weight)
+{
+	/*
+	* Scale alpha by weight/256 and clamp to 255; the product is computed in Uint32, so it wraps for very large weights
+	*/
+	Uint32 ax = a;
+	ax = ((ax * weight) >> 8);
+	if (ax > 255) {
+		a = 255;
+	} else {
+		a = (Uint8)(ax & 0x000000ff);
+	}
+
+	return pixelRGBA(renderer, x, y, r, g, b, a);
+}
+
+/* ---- Hline */
+
+/*!
+\brief Draw horizontal line in the renderer's currently set draw color; color and blend mode are not modified.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. left) of the line.
+\param x2 X coordinate of the second point (i.e. right) of the line.
+\param y Y coordinate of the points of the line.
+
+\returns The result of SDL_RenderDrawLine(): 0 on success, a negative error code on failure.
+*/
+int hline(SDL_Renderer * renderer, Sint16 x1, Sint16 x2, Sint16 y)
+{
+	return SDL_RenderDrawLine(renderer, x1, y, x2, y);;
+}
+
+
+/*!
+\brief Draw horizontal line with blending; the color is passed as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. left) of the line.
+\param x2 X coordinate of the second point (i.e. right) of the line.
+\param y Y coordinate of the points of the line.
+\param color The packed color, read byte-wise from memory (byte 0 = red, 1 = green, 2 = blue, 3 = alpha): as a literal this is 0xRRGGBBAA only on big-endian hosts and 0xAABBGGRR on little-endian hosts.
+
+\returns The result of hlineRGBA(): 0 on success, non-zero on failure.
+*/
+int hlineColor(SDL_Renderer * renderer, Sint16 x1, Sint16 x2, Sint16 y, Uint32 color)
+{
+	Uint8 *c = (Uint8 *)&color; 
+	return hlineRGBA(renderer, x1, x2, y, c[0], c[1], c[2], c[3]);
+}
+
+/*!
+\brief Draw horizontal line with blending (enabled if a<255). Changes the renderer's draw blend mode and draw color as a side effect.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. left) of the line.
+\param x2 X coordinate of the second point (i.e. right) of the line.
+\param y Y coordinate of the points of the line.
+\param r The red value of the line to draw. 
+\param g The green value of the line to draw. 
+\param b The blue value of the line to draw. 
+\param a The alpha value of the line to draw; 255 selects SDL_BLENDMODE_NONE, anything else SDL_BLENDMODE_BLEND.
+
+\returns Returns 0 on success; on failure the non-zero bitwise OR of the three SDL return codes.
+*/
+int hlineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 x2, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	int result = 0;
+	result |= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result |= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+	result |= SDL_RenderDrawLine(renderer, x1, y, x2, y);
+	return result;
+}
+
+/* ---- Vline */
+
+/*!
+\brief Draw vertical line in the renderer's currently set draw color; color and blend mode are not modified.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of points of the line.
+\param y1 Y coordinate of the first point (i.e. top) of the line.
+\param y2 Y coordinate of the second point (i.e. bottom) of the line.
+
+\returns The result of SDL_RenderDrawLine(): 0 on success, a negative error code on failure.
+*/
+int vline(SDL_Renderer * renderer, Sint16 x, Sint16 y1, Sint16 y2)
+{
+	return SDL_RenderDrawLine(renderer, x, y1, x, y2);;
+}
+
+/*!
+\brief Draw vertical line with blending; the color is passed as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the points of the line.
+\param y1 Y coordinate of the first point (i.e. top) of the line.
+\param y2 Y coordinate of the second point (i.e. bottom) of the line.
+\param color The packed color, read byte-wise from memory (byte 0 = red, 1 = green, 2 = blue, 3 = alpha): as a literal this is 0xRRGGBBAA only on big-endian hosts and 0xAABBGGRR on little-endian hosts.
+
+\returns The result of vlineRGBA(): 0 on success, non-zero on failure.
+*/
+int vlineColor(SDL_Renderer * renderer, Sint16 x, Sint16 y1, Sint16 y2, Uint32 color)
+{
+	Uint8 *c = (Uint8 *)&color; 
+	return vlineRGBA(renderer, x, y1, y2, c[0], c[1], c[2], c[3]);
+}
+
+/*!
+\brief Draw vertical line with blending (enabled if a<255). Changes the renderer's draw blend mode and draw color as a side effect.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the points of the line.
+\param y1 Y coordinate of the first point (i.e. top) of the line.
+\param y2 Y coordinate of the second point (i.e. bottom) of the line.
+\param r The red value of the line to draw. 
+\param g The green value of the line to draw. 
+\param b The blue value of the line to draw. 
+\param a The alpha value of the line to draw; 255 selects SDL_BLENDMODE_NONE, anything else SDL_BLENDMODE_BLEND.
+
+\returns Returns 0 on success; on failure the non-zero bitwise OR of the three SDL return codes.
+*/
+int vlineRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y1, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	int result = 0;
+	result |= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result |= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+	result |= SDL_RenderDrawLine(renderer, x, y1, x, y2);
+	return result;
+}
+
+/* ---- Rectangle */
+
+/*!
+\brief Draw rectangle with blending; the color is passed as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first corner of the rectangle (rectangleRGBA swaps x1/x2 when x1 > x2).
+\param y1 Y coordinate of the first corner of the rectangle (rectangleRGBA swaps y1/y2 when y1 > y2).
+\param x2 X coordinate of the opposite corner of the rectangle.
+\param y2 Y coordinate of the opposite corner of the rectangle.
+\param color The packed color, read byte-wise from memory (byte 0 = red, 1 = green, 2 = blue, 3 = alpha): as a literal this is 0xRRGGBBAA only on big-endian hosts and 0xAABBGGRR on little-endian hosts.
+
+\returns The result of rectangleRGBA().
+*/
+int rectangleColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color)
+{
+	Uint8 *c = (Uint8 *)&color; 
+	return rectangleRGBA(renderer, x1, y1, x2, y2, c[0], c[1], c[2], c[3]);
+}
+
+/*!
+\brief Draw rectangle (outline only) with blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of one corner of the rectangle.
+\param y1 Y coordinate of that corner.
+\param x2 X coordinate of the diagonally opposite corner (inclusive).
+\param y2 Y coordinate of the diagonally opposite corner (inclusive).
+\param r The red value of the rectangle to draw. 
+\param g The green value of the rectangle to draw. 
+\param b The blue value of the rectangle to draw. 
+\param a The alpha value of the rectangle to draw; 255 disables blending. 
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int rectangleRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	int result;
+	Sint16 tmp;
+	SDL_Rect rect;
+
+	/*
+	* Test for special cases of straight lines or single point 
+	*/
+	if (x1 == x2) {
+		if (y1 == y2) {
+			return (pixelRGBA(renderer, x1, y1, r, g, b, a));
+		} else {
+			return (vlineRGBA(renderer, x1, y1, y2, r, g, b, a));
+		}
+	} else {
+		if (y1 == y2) {
+			return (hlineRGBA(renderer, x1, x2, y1, r, g, b, a));
+		}
+	}
+
+	/*
+	* Swap x1, x2 if required so that x1 <= x2
+	*/
+	if (x1 > x2) {
+		tmp = x1;
+		x1 = x2;
+		x2 = tmp;
+	}
+
+	/*
+	* Swap y1, y2 if required so that y1 <= y2
+	*/
+	if (y1 > y2) {
+		tmp = y1;
+		y1 = y2;
+		y2 = tmp;
+	}
+
+	/* 
+	* Create destination rect; both corners are inclusive (as in boxRGBA), hence the +1
+	*/	
+	rect.x = x1;
+	rect.y = y1;
+	rect.w = x2 - x1 + 1;
+	rect.h = y2 - y1 + 1;
+
+	/*
+	* Draw: opaque colors bypass blending, everything else is alpha-blended
+	*/
+	result = 0;
+	result |= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result |= SDL_SetRenderDrawColor(renderer, r, g, b, a);	
+	result |= SDL_RenderDrawRect(renderer, &rect);
+	return result;
+}
+
+/* ---- Rounded Rectangle */
+
+/*!
+\brief Draw a rounded-corner rectangle outline with blending, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of one corner of the rectangle.
+\param y1 Y coordinate of that corner.
+\param x2 X coordinate of the diagonally opposite corner.
+\param y2 Y coordinate of the diagonally opposite corner.
+\param rad The radius of the corner arcs.
+\param color Packed color whose bytes are read in memory order as R, G, B, A (i.e. 0xRRGGBBAA on big-endian hosts, 0xAABBGGRR on little-endian hosts).
+
+\returns Whatever roundedRectangleRGBA() returns: 0 on success, -1 on failure.
+*/
+int roundedRectangleColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint32 color)
+{
+	const Uint8 *channel = (const Uint8 *)&color;
+	return roundedRectangleRGBA(renderer, x1, y1, x2, y2, rad, channel[0], channel[1], channel[2], channel[3]);
+}
+
+/*!
+\brief Draw the outline of a rounded-corner rectangle with blending.
+
+\param renderer The renderer to draw on; NULL is rejected.
+\param x1 X coordinate of one corner of the rectangle.
+\param y1 Y coordinate of that corner.
+\param x2 X coordinate of the diagonally opposite corner.
+\param y2 Y coordinate of the diagonally opposite corner.
+\param rad Corner arc radius; negative is rejected, 0 or 1 draws a plain rectangle, larger values are reduced to fit.
+\param r Red channel of the outline color.
+\param g Green channel of the outline color.
+\param b Blue channel of the outline color.
+\param a Alpha channel of the outline color.
+
+\returns Returns 0 on success, -1 on failure (including a NULL renderer or a negative radius).
+*/
+int roundedRectangleRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	int status = 0;
+	Sint16 swap;
+	Sint16 width;
+	Sint16 height;
+	Sint16 left;
+	Sint16 right;
+	Sint16 top;
+	Sint16 bottom;
+
+	/* A renderer is required */
+	if (renderer == NULL) {
+		return -1;
+	}
+
+	/* A negative radius is invalid */
+	if (rad < 0) {
+		return -1;
+	}
+
+	/*
+	* A radius of 0 or 1 leaves nothing to round: draw a plain rectangle
+	*/
+	if (rad <= 1) {
+		return rectangleRGBA(renderer, x1, y1, x2, y2, r, g, b, a);
+	}
+
+	/*
+	* Degenerate shapes are delegated before any corner math:
+	* a single point, a vertical line or a horizontal line
+	*/
+	if (x1 == x2 && y1 == y2) {
+		return pixelRGBA(renderer, x1, y1, r, g, b, a);
+	}
+	if (x1 == x2) {
+		return vlineRGBA(renderer, x1, y1, y2, r, g, b, a);
+	}
+	if (y1 == y2) {
+		return hlineRGBA(renderer, x1, x2, y1, r, g, b, a);
+	}
+
+	/*
+	* Make sure x1 <= x2
+	*/
+	if (x2 < x1) {
+		swap = x1;
+		x1 = x2;
+		x2 = swap;
+	}
+
+	/*
+	* Make sure y1 <= y2
+	*/
+	if (y2 < y1) {
+		swap = y1;
+		y1 = y2;
+		y2 = swap;
+	}
+
+	/*
+	* Extent between the two corners
+	*/
+	width = x2 - x1;
+	height = y2 - y1;
+
+	/*
+	* Shrink the radius so that two radii fit into the width and,
+	* after that, into the height
+	*/
+	if (width < (rad * 2)) {
+		rad = width / 2;
+	}
+	if (height < (rad * 2)) {
+		rad = height / 2;
+	}
+
+	/*
+	* Arc centers sit one radius inside each corner; each corner gets
+	* one quarter circle:
+	*   180..270 at (left, top),     270..360 at (right, top),
+	*    90..180 at (left, bottom),    0..90  at (right, bottom)
+	*/
+	left = x1 + rad;
+	right = x2 - rad;
+	top = y1 + rad;
+	bottom = y2 - rad;
+	status |= arcRGBA(renderer, left, top, rad, 180, 270, r, g, b, a);
+	status |= arcRGBA(renderer, right, top, rad, 270, 360, r, g, b, a);
+	status |= arcRGBA(renderer, left, bottom, rad,  90, 180, r, g, b, a);
+	status |= arcRGBA(renderer, right, bottom, rad,   0,  90, r, g, b, a);
+
+	/*
+	* Straight edges between the arcs, drawn only while the arc
+	* centers have not crossed each other
+	*/
+	if (left <= right) {
+		status |= hlineRGBA(renderer, left, right, y1, r, g, b, a);
+		status |= hlineRGBA(renderer, left, right, y2, r, g, b, a);
+	}
+	if (top <= bottom) {
+		status |= vlineRGBA(renderer, x1, top, bottom, r, g, b, a);
+		status |= vlineRGBA(renderer, x2, top, bottom, r, g, b, a);
+	}
+
+	return status;
+}
+
+/* ---- Rounded Box */
+
+/*!
+\brief Draw a rounded-corner box (filled rectangle) with blending, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of one corner of the box.
+\param y1 Y coordinate of that corner.
+\param x2 X coordinate of the diagonally opposite corner.
+\param y2 Y coordinate of the diagonally opposite corner.
+\param rad The radius of the corner arcs of the box.
+\param color Packed color whose bytes are read in memory order as R, G, B, A (i.e. 0xRRGGBBAA on big-endian hosts, 0xAABBGGRR on little-endian hosts).
+
+\returns Whatever roundedBoxRGBA() returns: 0 on success, -1 on failure.
+*/
+int roundedBoxColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint32 color)
+{
+	const Uint8 *channel = (const Uint8 *)&color;
+	return roundedBoxRGBA(renderer, x1, y1, x2, y2, rad, channel[0], channel[1], channel[2], channel[3]);
+}
+
+/*!
+\brief Draw rounded-corner box (filled rectangle) with blending.
+
+\param renderer The renderer to draw on; NULL is rejected.
+\param x1 X coordinate of one corner of the box.
+\param y1 Y coordinate of that corner.
+\param x2 X coordinate of the diagonally opposite corner (inclusive).
+\param y2 Y coordinate of the diagonally opposite corner (inclusive).
+\param rad The radius of the corner arcs; negative is rejected, 0 or 1 draws a plain box, larger values are clamped to half the width/height.
+\param r The red value of the box to draw. 
+\param g The green value of the box to draw. 
+\param b The blue value of the box to draw. 
+\param a The alpha value of the box to draw; 255 disables blending. 
+
+\returns Returns 0 on success, -1 on failure (including a NULL renderer or a negative radius).
+*/
+int roundedBoxRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2,
+	Sint16 y2, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	int result;
+	Sint16 w, h, r2, tmp;
+	Sint16 cx = 0;
+	Sint16 cy;			/* seeded below, once rad has been clamped */
+	Sint16 ocx = (Sint16) 0xffff;
+	Sint16 ocy = (Sint16) 0xffff;
+	Sint16 df;			/* seeded below, once rad has been clamped */
+	Sint16 d_e = 3;
+	Sint16 d_se;		/* seeded below, once rad has been clamped */
+	Sint16 xpcx, xmcx, xpcy, xmcy;
+	Sint16 ypcy, ymcy, ypcx, ymcx;
+	Sint16 x, y, dx, dy;
+
+	/* Check destination renderer */
+	if (renderer == NULL)
+	{
+		return -1;
+	}
+
+	/* Check radius for valid range */
+	if (rad < 0) {
+		return -1;
+	}
+
+	/*
+	* Special case - no rounding
+	*/
+	if (rad <= 1) {
+		return boxRGBA(renderer, x1, y1, x2, y2, r, g, b, a);
+	}
+
+	/*
+	* Test for special cases of straight lines or single point 
+	*/
+	if (x1 == x2) {
+		if (y1 == y2) {
+			return (pixelRGBA(renderer, x1, y1, r, g, b, a));
+		} else {
+			return (vlineRGBA(renderer, x1, y1, y2, r, g, b, a));
+		}
+	} else {
+		if (y1 == y2) {
+			return (hlineRGBA(renderer, x1, x2, y1, r, g, b, a));
+		}
+	}
+
+	/*
+	* Swap x1, x2 if required 
+	*/
+	if (x1 > x2) {
+		tmp = x1;
+		x1 = x2;
+		x2 = tmp;
+	}
+
+	/*
+	* Swap y1, y2 if required 
+	*/
+	if (y1 > y2) {
+		tmp = y1;
+		y1 = y2;
+		y2 = tmp;
+	}
+
+	/*
+	* Calculate width&height 
+	*/
+	w = x2 - x1 + 1;
+	h = y2 - y1 + 1;
+
+	/*
+	* Maybe adjust radius
+	*/
+	r2 = rad + rad;
+	if (r2 > w)  
+	{
+		rad = w / 2;
+		r2 = rad + rad;
+	}
+	if (r2 > h)
+	{
+		rad = h / 2;
+	}
+
+	/* Setup filled circle drawing for corners. The circle state (cy, df, d_se) must be */
+	/* seeded from the clamped radius, otherwise the caps are drawn too large for the box. */
+	x = x1 + rad;
+	y = y1 + rad;
+	dx = x2 - x1 - rad - rad;
+	dy = y2 - y1 - rad - rad;
+	cy = rad;
+	df = 1 - rad;
+	d_se = -2 * rad + 5;
+
+	/*
+	* Set color
+	*/
+	result = 0;
+	result |= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result |= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+
+	/*
+	* Draw corners
+	*/
+	do {
+		xpcx = x + cx;
+		xmcx = x - cx;
+		xpcy = x + cy;
+		xmcy = x - cy;
+		if (ocy != cy) {
+			if (cy > 0) {
+				ypcy = y + cy;
+				ymcy = y - cy;
+				result |= hline(renderer, xmcx, xpcx + dx, ypcy + dy);
+				result |= hline(renderer, xmcx, xpcx + dx, ymcy);
+			} else {
+				result |= hline(renderer, xmcx, xpcx + dx, y);
+			}
+			ocy = cy;
+		}
+		if (ocx != cx) {
+			if (cx != cy) {
+				if (cx > 0) {
+					ypcx = y + cx;
+					ymcx = y - cx;
+					result |= hline(renderer, xmcy, xpcy + dx, ymcx);
+					result |= hline(renderer, xmcy, xpcy + dx, ypcx + dy);
+				} else {
+					result |= hline(renderer, xmcy, xpcy + dx, y);
+				}
+			}
+			ocx = cx;
+		}
+
+		/*
+		* Update 
+		*/
+		if (df < 0) {
+			df += d_e;
+			d_e += 2;
+			d_se += 2;
+		} else {
+			df += d_se;
+			d_e += 2;
+			d_se += 4;
+			cy--;
+		}
+		cx++;
+	} while (cx <= cy);
+
+	/* Inside: the rows between the two caps; needed whenever there is vertical room, whatever the width */
+	if (dy > 0) {
+		result |= boxRGBA(renderer, x1, y1 + rad + 1, x2, y2 - rad, r, g, b, a);
+	}
+
+	return (result);
+}
+
+/* ---- Box */
+
+/*!
+\brief Draw a box (filled rectangle) with blending, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of one corner of the box.
+\param y1 Y coordinate of that corner.
+\param x2 X coordinate of the diagonally opposite corner.
+\param y2 Y coordinate of the diagonally opposite corner.
+\param color Packed color whose bytes are read in memory order as R, G, B, A (i.e. 0xRRGGBBAA on big-endian hosts, 0xAABBGGRR on little-endian hosts).
+
+\returns Whatever boxRGBA() returns: 0 on success, -1 on failure.
+*/
+int boxColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color)
+{
+	const Uint8 *channel = (const Uint8 *)&color;
+	return boxRGBA(renderer, x1, y1, x2, y2, channel[0], channel[1], channel[2], channel[3]);
+}
+
+/*!
+\brief Draw a box (filled rectangle) with blending.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of one corner of the box.
+\param y1 Y coordinate of that corner.
+\param x2 X coordinate of the diagonally opposite corner (inclusive).
+\param y2 Y coordinate of the diagonally opposite corner (inclusive).
+\param r Red channel of the fill color.
+\param g Green channel of the fill color.
+\param b Blue channel of the fill color.
+\param a Alpha channel of the fill color; 255 selects SDL_BLENDMODE_NONE, any other value SDL_BLENDMODE_BLEND.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int boxRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	int status;
+	Sint16 swap;
+	SDL_Rect area;
+
+	/*
+	* Degenerate boxes are delegated: a single point, a vertical
+	* line or a horizontal line
+	*/
+	if (x1 == x2 && y1 == y2) {
+		return pixelRGBA(renderer, x1, y1, r, g, b, a);
+	}
+	if (x1 == x2) {
+		return vlineRGBA(renderer, x1, y1, y2, r, g, b, a);
+	}
+	if (y1 == y2) {
+		return hlineRGBA(renderer, x1, x2, y1, r, g, b, a);
+	}
+
+	/*
+	* Make sure x1 <= x2
+	*/
+	if (x2 < x1) {
+		swap = x1;
+		x1 = x2;
+		x2 = swap;
+	}
+
+	/*
+	* Make sure y1 <= y2
+	*/
+	if (y2 < y1) {
+		swap = y1;
+		y1 = y2;
+		y2 = swap;
+	}
+
+	/*
+	* Both corners are inclusive, so the extent is the distance plus one
+	*/
+	area.x = x1;
+	area.y = y1;
+	area.w = x2 - x1 + 1;
+	area.h = y2 - y1 + 1;
+
+	/*
+	* Opaque colors (a == 255) bypass blending; everything else is
+	* alpha-blended
+	*/
+	status = 0;
+	status |= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	status |= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+	status |= SDL_RenderFillRect(renderer, &area);
+	return status;
+}
+
+/* ----- Line */
+
+/*!
+\brief Draw a line using the color and blend mode currently set on the renderer.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first end point of the line.
+\param y1 Y coordinate of the first end point of the line.
+\param x2 X coordinate of the second end point of the line.
+\param y2 Y coordinate of the second end point of the line.
+
+\returns The result of SDL_RenderDrawLine(): 0 on success, -1 on failure.
+*/
+int line(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2)
+{
+	/* No color or blend-mode setup here: the renderer's current state is used */
+	const int status = SDL_RenderDrawLine(renderer, x1, y1, x2, y2);
+
+	return status;
+}
+
+/*!
+\brief Draw a line with alpha blending, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first end point of the line.
+\param y1 Y coordinate of the first end point of the line.
+\param x2 X coordinate of the second end point of the line.
+\param y2 Y coordinate of the second end point of the line.
+\param color Packed color whose bytes are read in memory order as R, G, B, A (i.e. 0xRRGGBBAA on big-endian hosts, 0xAABBGGRR on little-endian hosts).
+
+\returns Whatever lineRGBA() returns: 0 on success, -1 on failure.
+*/
+int lineColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color)
+{
+	const Uint8 *channel = (const Uint8 *)&color;
+	return lineRGBA(renderer, x1, y1, x2, y2, channel[0], channel[1], channel[2], channel[3]);
+}
+
+/*!
+\brief Draw a line with alpha blending, using explicit color channels.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first end point of the line.
+\param y1 Y coordinate of the first end point of the line.
+\param x2 X coordinate of the second end point of the line.
+\param y2 Y coordinate of the second end point of the line.
+\param r Red channel of the line color.
+\param g Green channel of the line color.
+\param b Blue channel of the line color.
+\param a Alpha channel of the line color; 255 selects SDL_BLENDMODE_NONE, any other value SDL_BLENDMODE_BLEND.
+
+\returns Returns 0 on success, -1 on failure (the results of the three SDL calls are OR-ed together).
+*/
+int lineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/*
+	* Opaque colors bypass blending; everything else is alpha-blended
+	*/
+	const SDL_BlendMode mode = (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND;
+	int status = SDL_SetRenderDrawBlendMode(renderer, mode);
+	status |= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+	status |= SDL_RenderDrawLine(renderer, x1, y1, x2, y2);
+	return status;
+}
+
+/* ---- AA Line */
+
+#define AAlevels 256
+#define AAbits 8
+
+/*!
+\brief Internal function to draw anti-aliased line with alpha blending and endpoint control.
+
+This implementation of the Wu antialiasing code is based on Mike Abrash's
+DDJ article which was reprinted as Chapter 42 of his Graphics Programming
+Black Book, but has been optimized to work with SDL and utilizes 32-bit
+fixed-point arithmetic by A. Schiffler. The endpoint control allows
+suppressing the last pixel, which is useful for rendering continuous
+aa-lines with alpha<255.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the aa-line.
+\param y1 Y coordinate of the first point of the aa-line.
+\param x2 X coordinate of the second point of the aa-line.
+\param y2 Y coordinate of the second point of the aa-line.
+\param r The red value of the aa-line to draw. 
+\param g The green value of the aa-line to draw. 
+\param b The blue value of the aa-line to draw. 
+\param a The alpha value of the aa-line to draw.
+\param draw_endpoint Flag indicating if the endpoint should be drawn; draw if non-zero.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int _aalineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a, int draw_endpoint)
+{
+	Sint32 xx0, yy0, xx1, yy1;
+	int result;
+	Uint32 intshift, erracc, erradj;
+	Uint32 erracctmp, wgt;
+	int dx, dy, tmp, xdir, y0p1, x0pxdir;
+
+	/*
+	* Keep on working with 32bit numbers 
+	*/
+	xx0 = x1;
+	yy0 = y1;
+	xx1 = x2;
+	yy1 = y2;
+
+	/*
+	* Reorder points to make dy positive 
+	*/
+	if (yy0 > yy1) {
+		tmp = yy0;
+		yy0 = yy1;
+		yy1 = tmp;
+		tmp = xx0;
+		xx0 = xx1;
+		xx1 = tmp;
+	}
+
+	/*
+	* Calculate distance 
+	*/
+	dx = xx1 - xx0;
+	dy = yy1 - yy0;
+
+	/*
+	* Adjust for negative dx and set xdir 
+	*/
+	if (dx >= 0) {
+		xdir = 1;
+	} else {
+		xdir = -1;
+		dx = (-dx);
+	}
+
+	/*
+	* Check for special cases 
+	*/
+	if (dx == 0) {
+		/*
+		* Vertical line 
+		*/
+		if (draw_endpoint)
+		{
+			return (vlineRGBA(renderer, x1, y1, y2, r, g, b, a));
+		} else {
+			if (dy > 0) {
+				return (vlineRGBA(renderer, x1, yy0, yy0+dy, r, g, b, a));
+			} else {
+				return (pixelRGBA(renderer, x1, y1, r, g, b, a));
+			}
+		}
+	} else if (dy == 0) {
+		/*
+		* Horizontal line 
+		*/
+		if (draw_endpoint)
+		{
+			return (hlineRGBA(renderer, x1, x2, y1, r, g, b, a));
+		} else {
+			if (dx > 0) {
+				return (hlineRGBA(renderer, xx0, xx0+dx, y1, r, g, b, a));
+			} else {
+				return (pixelRGBA(renderer, x1, y1, r, g, b, a));
+			}
+		}
+	} else if ((dx == dy) && (draw_endpoint)) {
+		/*
+		* Diagonal line (with endpoint)
+		*/
+		return (lineRGBA(renderer, x1, y1, x2, y2,  r, g, b, a));
+	}
+
+
+	/*
+	* Line is not horizontal, vertical or diagonal (with endpoint)
+	*/
+	result = 0;
+
+	/*
+	* Zero accumulator 
+	*/
+	erracc = 0;
+
+	/*
+	* # of bits by which to shift erracc to get intensity level 
+	*/
+	intshift = 32 - AAbits;
+
+	/*
+	* Draw the initial pixel in the foreground color 
+	*/
+	result |= pixelRGBA(renderer, x1, y1, r, g, b, a);
+
+	/*
+	* x-major or y-major? 
+	*/
+	if (dy > dx) {
+
+		/*
+		* y-major.  Calculate 16-bit fixed point fractional part of a pixel that
+		* X advances every time Y advances 1 pixel, truncating the result so that
+		* we won't overrun the endpoint along the X axis 
+		*/
+		/*
+		* Not-so-portable version: erradj = ((Uint64)dx << 32) / (Uint64)dy; 
+		* The shifts are done on Uint32 values: dx can be as large as 65535,
+		* so a signed (dx << 16) could overflow, which is undefined behavior.
+		*/
+		erradj = (((Uint32)dx << 16) / (Uint32)dy) << 16;
+
+		/*
+		* draw all pixels other than the first and last 
+		*/
+		x0pxdir = xx0 + xdir;
+		while (--dy) {
+			erracctmp = erracc;
+			erracc += erradj;
+			if (erracc <= erracctmp) {
+				/*
+				* rollover in error accumulator, x coord advances 
+				*/
+				xx0 = x0pxdir;
+				x0pxdir += xdir;
+			}
+			yy0++;		/* y-major so always advance Y */
+
+			/*
+			* the AAbits most significant bits of erracc give us the intensity
+			* weighting for this pixel, and the complement of the weighting for
+			* the paired pixel. 
+			*/
+			wgt = (erracc >> intshift) & 255;
+			result |= pixelRGBAWeight (renderer, xx0, yy0, r, g, b, a, 255 - wgt);
+			result |= pixelRGBAWeight (renderer, x0pxdir, yy0, r, g, b, a, wgt);
+		}
+
+	} else {
+
+		/*
+		* x-major line.  Calculate 16-bit fixed-point fractional part of a pixel
+		* that Y advances each time X advances 1 pixel, truncating the result so
+		* that we won't overrun the endpoint along the X axis. 
+		*/
+		/*
+		* Not-so-portable version: erradj = ((Uint64)dy << 32) / (Uint64)dx; 
+		* The shifts are done on Uint32 values so that a large dy cannot overflow
+		* a signed int; for dy == dx the result wraps to 0, which makes the
+		* accumulator test below advance y on every step.
+		*/
+		erradj = (((Uint32)dy << 16) / (Uint32)dx) << 16;
+
+		/*
+		* draw all pixels other than the first and last 
+		*/
+		y0p1 = yy0 + 1;
+		while (--dx) {
+
+			erracctmp = erracc;
+			erracc += erradj;
+			if (erracc <= erracctmp) {
+				/*
+				* Accumulator turned over, advance y 
+				*/
+				yy0 = y0p1;
+				y0p1++;
+			}
+			xx0 += xdir;	/* x-major so always advance X */
+			/*
+			* the AAbits most significant bits of erracc give us the intensity
+			* weighting for this pixel, and the complement of the weighting for
+			* the paired pixel. 
+			*/
+			wgt = (erracc >> intshift) & 255;
+			result |= pixelRGBAWeight (renderer, xx0, yy0, r, g, b, a, 255 - wgt);
+			result |= pixelRGBAWeight (renderer, xx0, y0p1, r, g, b, a, wgt);
+		}
+	}
+
+	/*
+	* Do we have to draw the endpoint 
+	*/
+	if (draw_endpoint) {
+		/*
+		* Draw final pixel, always exactly intersected by the line and doesn't
+		* need to be weighted. 
+		*/
+		result |= pixelRGBA (renderer, x2, y2, r, g, b, a);
+	}
+
+	return (result);
+}
+
+/*!
+\brief Draw an anti-aliased line with alpha blending, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first end point of the aa-line.
+\param y1 Y coordinate of the first end point of the aa-line.
+\param x2 X coordinate of the second end point of the aa-line.
+\param y2 Y coordinate of the second end point of the aa-line.
+\param color Packed color whose bytes are read in memory order as R, G, B, A (i.e. 0xRRGGBBAA on big-endian hosts, 0xAABBGGRR on little-endian hosts).
+
+\returns Whatever _aalineRGBA() returns: 0 on success, -1 on failure.
+*/
+int aalineColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color)
+{
+	const Uint8 *channel = (const Uint8 *)&color;
+	return _aalineRGBA(renderer, x1, y1, x2, y2, channel[0], channel[1], channel[2], channel[3], 1);
+}
+
+/*!
+\brief Draw an anti-aliased line with alpha blending, using explicit color channels.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first end point of the aa-line.
+\param y1 Y coordinate of the first end point of the aa-line.
+\param x2 X coordinate of the second end point of the aa-line.
+\param y2 Y coordinate of the second end point of the aa-line.
+\param r Red channel of the aa-line color.
+\param g Green channel of the aa-line color.
+\param b Blue channel of the aa-line color.
+\param a Alpha channel of the aa-line color.
+
+\returns Whatever _aalineRGBA() returns: 0 on success, -1 on failure.
+*/
+int aalineRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	return _aalineRGBA(renderer, x1, y1, x2, y2, r, g, b, a, 1);	/* trailing 1: draw the end point too */
+}
+
+/* ----- Circle */
+
+/*!
+\brief Draw a circle outline with blending, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the circle.
+\param y Y coordinate of the center of the circle.
+\param rad Radius in pixels of the circle; passed to ellipseRGBA() as both radii.
+\param color Packed color whose bytes are read in memory order as R, G, B, A (i.e. 0xRRGGBBAA on big-endian hosts, 0xAABBGGRR on little-endian hosts).
+
+\returns Whatever ellipseRGBA() returns: 0 on success, -1 on failure.
+*/
+int circleColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint32 color)
+{
+	const Uint8 *channel = (const Uint8 *)&color;
+	return ellipseRGBA(renderer, x, y, rad, rad, channel[0], channel[1], channel[2], channel[3]);
+}
+
+/*!
+\brief Draw a circle outline with blending, using explicit color channels.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the circle.
+\param y Y coordinate of the center of the circle.
+\param rad Radius in pixels of the circle; passed to ellipseRGBA() as both radii.
+\param r Red channel of the circle color.
+\param g Green channel of the circle color.
+\param b Blue channel of the circle color.
+\param a Alpha channel of the circle color.
+
+\returns Whatever ellipseRGBA() returns: 0 on success, -1 on failure.
+*/
+int circleRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	return ellipseRGBA(renderer, x, y, rad, rad, r, g, b, a);	/* a circle is the ellipse with rx == ry == rad */
+}
+
+/* ----- Arc */
+
+/*!
+\brief Draw an arc with blending, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the arc.
+\param y Y coordinate of the center of the arc.
+\param rad Radius in pixels of the arc.
+\param start Starting angle in degrees of the arc. 0 degrees points along +x; angles increase towards +y (see the octant diagram in arcRGBA()).
+\param end Ending angle in degrees of the arc, measured the same way as start.
+\param color Packed color whose bytes are read in memory order as R, G, B, A (i.e. 0xRRGGBBAA on big-endian hosts, 0xAABBGGRR on little-endian hosts).
+
+\returns Whatever arcRGBA() returns: 0 on success, -1 on failure.
+*/
+int arcColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, Uint32 color)
+{
+	const Uint8 *channel = (const Uint8 *)&color;
+	return arcRGBA(renderer, x, y, rad, start, end, channel[0], channel[1], channel[2], channel[3]);
+}
+
+/*!
+\brief Draw a circular arc (outline only) with blending.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the arc.
+\param y Y coordinate of the center of the arc.
+\param rad Radius in pixels of the arc; negative is rejected, 0 draws a single pixel.
+\param start Starting angle in degrees of the arc. 0 degrees points along +x; angles increase towards +y (see the octant diagram in the body).
+\param end Ending angle in degrees of the arc, measured the same way as start. If start > end after normalization, the arc runs through 0.
+\param r The red value of the arc to draw. 
+\param g The green value of the arc to draw. 
+\param b The blue value of the arc to draw. 
+\param a The alpha value of the arc to draw; 255 disables blending.
+
+\returns Returns 0 on success, -1 on failure (including a negative radius).
+*/
+/* TODO: rewrite algorithm; arc endpoints are not always drawn */
+int arcRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	int result;
+	Sint16 cx = 0;				/* circle raster state: offset that grows from 0 ... */
+	Sint16 cy = rad;			/* ... and offset that shrinks from rad; loop ends when cx > cy */
+	Sint16 df = 1 - rad;		/* decision variable of the incremental circle algorithm */
+	Sint16 d_e = 3;
+	Sint16 d_se = -2 * rad + 5;
+	Sint16 xpcx, xmcx, xpcy, xmcy;
+	Sint16 ypcy, ymcy, ypcx, ymcx;
+	Uint8 drawoct;				/* one bit per octant, see below */
+	int startoct, endoct, oct, stopval_start = 0, stopval_end = 0;
+	double dstart, dend, temp = 0.;
+
+	/*
+	* Sanity check radius 
+	*/
+	if (rad < 0) {
+		return (-1);
+	}
+
+	/*
+	* Special case for rad=0 - draw a point 
+	*/
+	if (rad == 0) {
+		return (pixelRGBA(renderer, x, y, r, g, b, a));
+	}
+
+	/*
+	 Octant labeling
+	      
+	  \ 5 | 6 /
+	   \  |  /
+	  4 \ | / 7
+	     \|/
+	------+------ +x
+	     /|\
+	  3 / | \ 0
+	   /  |  \
+	  / 2 | 1 \
+	      +y
+
+	 drawoct is a bitmask with one bit per octant (bit n = octant n), initially 0;
+	 a set bit means pixels of that octant are currently being drawn.
+	 For example: binary 00111100 (0x3C) means we're drawing in octants 2-5
+	*/
+	drawoct = 0; 
+
+	/*
+	* Fixup angles
+	*/
+	start %= 360;
+	end %= 360;
+	/* 0 <= start & end < 360; note that sometimes start > end - if so, arc goes back through 0. */
+	while (start < 0) start += 360;
+	while (end < 0) end += 360;
+	start %= 360;
+	end %= 360;
+
+	/* now, we find which octants we're drawing in (each octant spans 45 degrees). */
+	startoct = start / 45;
+	endoct = end / 45;
+	oct = startoct - 1;
+
+	/* stopval_start, stopval_end; what values of cx to stop at. */
+	do {
+		oct = (oct + 1) % 8;
+
+		if (oct == startoct) {
+			/* need to compute stopval_start for this octant.  Look at picture above if this is unclear */
+			dstart = (double)start;
+			switch (oct) 
+			{
+			case 0:
+			case 3:
+				temp = sin(dstart * M_PI / 180.);
+				break;
+			case 1:
+			case 6:
+				temp = cos(dstart * M_PI / 180.);
+				break;
+			case 2:
+			case 5:
+				temp = -cos(dstart * M_PI / 180.);
+				break;
+			case 4:
+			case 7:
+				temp = -sin(dstart * M_PI / 180.);
+				break;
+			}
+			temp *= rad;
+			stopval_start = (int)temp;
+
+			/* 
+			This isn't arbitrary, but requires graph paper to explain well.
+			The basic idea is that we're always changing drawoct after we draw, so we
+			stop immediately after we render the last sensible pixel at x = ((int)temp).
+			and whether to draw in this octant initially
+			*/
+			if (oct % 2) drawoct |= (1 << oct);			/* this is basically like saying drawoct[oct] = true, if drawoct were a bool array */
+			else		 drawoct &= 255 - (1 << oct);	/* this is basically like saying drawoct[oct] = false */
+		}
+		if (oct == endoct) {
+			/* need to compute stopval_end for this octant */
+			dend = (double)end;
+			switch (oct)
+			{
+			case 0:
+			case 3:
+				temp = sin(dend * M_PI / 180);
+				break;
+			case 1:
+			case 6:
+				temp = cos(dend * M_PI / 180);
+				break;
+			case 2:
+			case 5:
+				temp = -cos(dend * M_PI / 180);
+				break;
+			case 4:
+			case 7:
+				temp = -sin(dend * M_PI / 180);
+				break;
+			}
+			temp *= rad;
+			stopval_end = (int)temp;
+
+			/* and whether to draw in this octant initially */
+			if (startoct == endoct)	{
+				/* note:      we start drawing, stop, then start again in this case */
+				/* otherwise: we only draw in this octant, so initialize it to false, it will get set back to true */
+				if (start > end) {
+					/* unfortunately, if we're in the same octant and need to draw over the whole circle, */
+					/* we need to set the rest to true, because the while loop will end at the bottom. */
+					drawoct = 255;
+				} else {
+					drawoct &= 255 - (1 << oct);
+				}
+			} 
+			else if (oct % 2) drawoct &= 255 - (1 << oct);
+			else			  drawoct |= (1 << oct);
+		} else if (oct != startoct) { /* already verified that it's != endoct */
+			drawoct |= (1 << oct); /* draw this entire segment */
+		}
+	} while (oct != endoct);
+
+	/* so now we have what octants to draw and when to draw them. all that's left is the actual raster code. */
+
+	/*
+	* Set color; opaque colors bypass blending, everything else is alpha-blended
+	*/
+	result = 0;
+	result |= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result |= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+
+	/*
+	* Draw arc: one pass over the circle raster, mirrored into the enabled octants
+	*/
+	do {
+		ypcy = y + cy;
+		ymcy = y - cy;
+		if (cx > 0) {
+			xpcx = x + cx;
+			xmcx = x - cx;
+
+			/* always check if we're drawing a certain octant before adding a pixel to that octant. */
+			if (drawoct & 4)  result |= pixel(renderer, xmcx, ypcy);
+			if (drawoct & 2)  result |= pixel(renderer, xpcx, ypcy);
+			if (drawoct & 32) result |= pixel(renderer, xmcx, ymcy);
+			if (drawoct & 64) result |= pixel(renderer, xpcx, ymcy);
+		} else {
+			if (drawoct & 96) result |= pixel(renderer, x, ymcy);
+			if (drawoct & 6)  result |= pixel(renderer, x, ypcy);
+		}
+
+		xpcy = x + cy;
+		xmcy = x - cy;
+		if (cx > 0 && cx != cy) {
+			ypcx = y + cx;
+			ymcx = y - cx;
+			if (drawoct & 8)   result |= pixel(renderer, xmcy, ypcx);
+			if (drawoct & 1)   result |= pixel(renderer, xpcy, ypcx);
+			if (drawoct & 16)  result |= pixel(renderer, xmcy, ymcx);
+			if (drawoct & 128) result |= pixel(renderer, xpcy, ymcx);
+		} else if (cx == 0) {
+			if (drawoct & 24)  result |= pixel(renderer, xmcy, y);
+			if (drawoct & 129) result |= pixel(renderer, xpcy, y);
+		}
+
+		/*
+		* Update whether we're drawing an octant
+		*/
+		if (stopval_start == cx) {
+			/* works like an on-off switch. */  
+			/* This is just in case start & end are in the same octant. */
+			if (drawoct & (1 << startoct)) drawoct &= 255 - (1 << startoct);		
+			else						   drawoct |= (1 << startoct);
+		}
+		if (stopval_end == cx) {
+			if (drawoct & (1 << endoct)) drawoct &= 255 - (1 << endoct);
+			else						 drawoct |= (1 << endoct);
+		}
+
+		/*
+		* Update pixels
+		*/
+		if (df < 0) {
+			df += d_e;
+			d_e += 2;
+			d_se += 2;
+		} else {
+			df += d_se;
+			d_e += 2;
+			d_se += 4;
+			cy--;
+		}
+		cx++;
+	} while (cx <= cy);
+
+	return (result);
+}
+
+/* ----- AA Circle */
+
+/*!
+\brief Draw an anti-aliased circle with blending, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the aa-circle.
+\param y Y coordinate of the center of the aa-circle.
+\param rad Radius in pixels of the aa-circle; passed to aaellipseRGBA() as both radii.
+\param color Packed color whose bytes are read in memory order as R, G, B, A (i.e. 0xRRGGBBAA on big-endian hosts, 0xAABBGGRR on little-endian hosts).
+
+\returns Whatever aaellipseRGBA() returns: 0 on success, -1 on failure.
+*/
+int aacircleColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint32 color)
+{
+	const Uint8 *channel = (const Uint8 *)&color;
+	return aaellipseRGBA(renderer, x, y, rad, rad, channel[0], channel[1], channel[2], channel[3]);
+}
+
+/*!
+\brief Draw an anti-aliased circle with blending, using explicit color channels.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the aa-circle.
+\param y Y coordinate of the center of the aa-circle.
+\param rad Radius in pixels of the aa-circle; passed to aaellipseRGBA() as both radii.
+\param r Red channel of the aa-circle color.
+\param g Green channel of the aa-circle color.
+\param b Blue channel of the aa-circle color.
+\param a Alpha channel of the aa-circle color.
+
+\returns Whatever aaellipseRGBA() returns: 0 on success, -1 on failure.
+*/
+int aacircleRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* A circle is the ellipse with rx == ry == rad */
+	const int status = aaellipseRGBA(renderer, x, y, rad, rad, r, g, b, a);
+
+	return status;
+}
+
+/* ----- Ellipse */
+
+/*!
+\brief Internal helper plotting the mirror images of one offset around a center point.
+
+\param renderer Target renderer.
+\param x Center X coordinate.
+\param y Center Y coordinate.
+\param dx Horizontal offset from the center; 0 folds left and right onto one column.
+\param dy Vertical offset from the center; 0 together with dx == 0 yields a single pixel.
+\param f Non-zero draws vertical spans between the mirrored points, zero draws the points only.
+
+\returns The OR of all pixel()/vline() results: 0 on success, -1 on failure.
+*/
+int _drawQuadrants(SDL_Renderer * renderer,  Sint16 x, Sint16 y, Sint16 dx, Sint16 dy, Sint32 f)
+{
+	int status = 0;
+	Sint16 xPlus, xMinus, yPlus, yMinus;
+	/* Zero offset in both directions: every mirror image is the center itself */
+	if ((dx == 0) && (dy == 0)) {
+		return pixel(renderer, x, y);
+	}
+
+	/* The vertical extent is shared by all remaining cases */
+	yPlus = y + dy;
+	yMinus = y - dy;
+
+	/* On the center column the left and right mirror images coincide */
+	if (dx == 0) {
+		if (f) {
+			return vline(renderer, x, yMinus, yPlus);
+		}
+		status |= pixel(renderer, x, yPlus);
+		status |= pixel(renderer, x, yMinus);
+		return status;
+	}
+
+	/* General case: one column on either side of the center */
+	xPlus = x + dx;
+	xMinus = x - dx;
+	if (f) {
+		status |= vline(renderer, xPlus, yMinus, yPlus);
+		status |= vline(renderer, xMinus, yMinus, yPlus);
+	} else {
+		status |= pixel(renderer, xPlus, yPlus);
+		status |= pixel(renderer, xMinus, yPlus);
+		status |= pixel(renderer, xPlus, yMinus);
+		status |= pixel(renderer, xMinus, yMinus);
+	}
+	return status;
+}
+
+/*!
+\brief Internal function to draw an ellipse outline or a filled ellipse with blending.
+
+\param renderer The renderer to draw on; its blend mode and draw color are set here.
+\param x X coordinate of the center of the ellipse.
+\param y Y coordinate of the center of the ellipse.
+\param rx Horizontal radius in pixels of the ellipse.
+\param ry Vertical radius in pixels of the ellipse.
+\param r The red value of the ellipse to draw.
+\param g The green value of the ellipse to draw.
+\param b The blue value of the ellipse to draw.
+\param a The alpha value of the ellipse to draw; 255 selects SDL_BLENDMODE_NONE.
+\param f Flag indicating if the ellipse should be filled (1) or not (0).
+
+\returns Returns 0 on success, -1 on failure (a negative radius is a failure).
+*/
+#define ELLIPSE_OVERSCAN	4
+int _ellipseRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a, Sint32 f)
+{
+	int result;
+	/* 64-bit terms: with 4x overscan rx22 * curY alone leaves Sint32 once a circle's radius reaches 256 */
+	Sint64 rx2, ry2, rx22, ry22, radX, radY;
+	Sint64 error;
+	Sint32 curX, curY, curXp1, curYm1;
+	Sint32 scrX, scrY, oldX, oldY;
+	Sint64 deltaX, deltaY;
+
+	/* Sanity check radii */
+	if ((rx < 0) || (ry < 0)) {
+		return (-1);
+	}
+
+	/*
+	* Set color
+	*/
+	result = 0;
+	result |= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result |= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+
+	/*
+	* Special cases for rx=0 and/or ry=0: draw a hline/vline/pixel (keeping any setup failure)
+	*/
+	if (rx == 0) {
+		if (ry == 0) {
+			return (result | pixel(renderer, x, y));
+		} else {
+			return (result | vline(renderer, x, y - ry, y + ry));
+		}
+	} else {
+		if (ry == 0) {
+			return (result | hline(renderer, x - rx, x + rx, y));
+		}
+	}
+
+	/*
+	 * Top/bottom center points.
+	 */
+	oldX = scrX = 0;
+	oldY = scrY = ry;
+	result |= _drawQuadrants(renderer, x, y, 0, ry, f);
+
+	/* Midpoint ellipse algorithm with overdraw; scale into wide locals, not the Sint16 params */
+	radX = (Sint64)rx * ELLIPSE_OVERSCAN;
+	radY = (Sint64)ry * ELLIPSE_OVERSCAN;
+	rx2 = radX * radX;
+	rx22 = rx2 + rx2;
+	ry2 = radY * radY;
+	ry22 = ry2 + ry2;
+	curX = 0;
+	curY = (Sint32)radY;
+	deltaX = 0;
+	deltaY = rx22 * curY;
+
+	/* Points in segment 1 */
+	error = ry2 - rx2 * radY + rx2 / 4;
+	while (deltaX <= deltaY)
+	{
+		curX++;
+		deltaX += ry22;
+
+		error += deltaX + ry2;
+		if (error >= 0)
+		{
+			curY--;
+			deltaY -= rx22;
+			error -= deltaY;
+		}
+
+		scrX = curX/ELLIPSE_OVERSCAN;
+		scrY = curY/ELLIPSE_OVERSCAN;
+		/* Plot once per screen column; the overscanned walk visits each column several times */
+		if (scrX != oldX) {
+			result |= _drawQuadrants(renderer, x, y, scrX, scrY, f);
+			oldX = scrX;
+			oldY = scrY;
+		}
+	}
+
+	/* Points in segment 2 (its (4*radius)^4-sized terms fit Sint64 up to a radius of about 13000) */
+	if (curY > 0)
+	{
+		curXp1 = curX + 1;
+		curYm1 = curY - 1;
+		error = ry2 * curX * curXp1 + ((ry2 + 3) / 4) + rx2 * curYm1 * curYm1 - rx2 * ry2;
+		while (curY > 0)
+		{
+			curY--;
+			deltaY -= rx22;
+
+			error += rx2;
+			error -= deltaY;
+
+			if (error <= 0)
+			{
+				curX++;
+				deltaX += ry22;
+				error += deltaX;
+			}
+
+			scrX = curX/ELLIPSE_OVERSCAN;
+			scrY = curY/ELLIPSE_OVERSCAN;
+			if (scrX != oldX) {
+				oldY--;
+				for (;oldY >= scrY; oldY--) {
+					result |= _drawQuadrants(renderer, x, y, scrX, oldY, f);
+					/* prevent overdraw: a filled column needs only one span */
+					if (f) {
+						oldY = scrY - 1;
+					}
+				}
+				oldX = scrX;
+				oldY = scrY;
+			}
+		}
+
+		/* Remaining points in vertical */
+		if (!f) {
+			oldY--;
+			for (;oldY >= 0; oldY--) {
+				result |= _drawQuadrants(renderer, x, y, scrX, oldY, f);
+			}
+		}
+	}
+
+	return (result);
+}
+
+/*!
+\brief Render an ellipse outline from a packed color value.
+
+\param renderer Target renderer.
+\param x Center X coordinate of the ellipse.
+\param y Center Y coordinate of the ellipse.
+\param rx Horizontal radius in pixels.
+\param ry Vertical radius in pixels.
+\param color Packed color; its four bytes are read in memory order as R, G, B, A.
+
+\returns The result of _ellipseRGBA() in outline mode: 0 on success, -1 on failure.
+*/
+int ellipseColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return _ellipseRGBA(renderer, x, y, rx, ry, rgba[0], rgba[1], rgba[2], rgba[3], 0);
+}
+
+/*!
+\brief Render an ellipse outline from separate color channels.
+
+\param renderer Target renderer.
+\param x Center X coordinate of the ellipse.
+\param y Center Y coordinate of the ellipse.
+\param rx Horizontal radius in pixels.
+\param ry Vertical radius in pixels.
+\param r Red channel of the draw color.
+\param g Green channel of the draw color.
+\param b Blue channel of the draw color.
+\param a Alpha channel of the draw color.
+
+\returns The result of _ellipseRGBA() in outline mode: 0 on success, -1 on failure.
+*/
+int ellipseRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	return (_ellipseRGBA(renderer, x, y, rx, ry, r, g, b, a, 0)); /* f = 0: outline only */
+}
+
+/* ----- Filled Circle */
+
+/*!
+\brief Render a filled circle from a packed color value.
+
+\param renderer Target renderer.
+\param x Center X coordinate of the circle.
+\param y Center Y coordinate of the circle.
+\param rad Circle radius in pixels; forwarded as both ellipse radii.
+\param color Packed color; its four bytes are read in memory order as R, G, B, A.
+
+\returns The result of filledEllipseRGBA(): 0 on success, -1 on failure.
+*/
+int filledCircleColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return filledEllipseRGBA(renderer, x, y, rad, rad, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Render a filled circle from separate color channels.
+
+\param renderer Target renderer.
+\param x Center X coordinate of the circle.
+\param y Center Y coordinate of the circle.
+\param rad Circle radius in pixels; forwarded as both ellipse radii.
+\param r Red channel of the draw color.
+\param g Green channel of the draw color.
+\param b Blue channel of the draw color.
+\param a Alpha channel of the draw color.
+
+\returns The result of _ellipseRGBA() in fill mode: 0 on success, -1 on failure.
+*/
+int filledCircleRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	return (_ellipseRGBA(renderer, x, y, rad, rad, r, g, b, a, 1)); /* f = 1: filled */
+}
+
+
+/* ----- AA Ellipse */
+
+/* Windows targets do not have lrint, so provide a local inline version. NOTE(review): newer MSVC runtimes may ship lrint in <math.h> - confirm this shim is still wanted. */
+#if defined(_MSC_VER)
+/* Detect 64bit and use intrinsic version. NOTE(review): this variant takes float while the other two take double, so double arguments are narrowed first. */
+#ifdef _M_X64
+#include <emmintrin.h>
+static __inline long 
+	lrint(float f) 
+{
+	return _mm_cvtss_si32(_mm_load_ss(&f));
+}
+#elif defined(_M_IX86) /* 32-bit x86: load the double and store it back as an integer via inline assembly */
+__inline long int
+	lrint (double flt)
+{	
+	int intgr;
+	_asm
+	{
+		fld flt
+			fistp intgr
+	};
+	return intgr;
+}
+#elif defined(_M_ARM) /* ARM: naked function assembled from raw opcodes; the mnemonics are listed beside each word */
+#include <armintr.h>
+#pragma warning(push)
+#pragma warning(disable: 4716)
+__declspec(naked) long int
+	lrint (double flt)
+{
+	__emit(0xEC410B10); // fmdrr  d0, r0, r1
+	__emit(0xEEBD0B40); // ftosid s0, d0
+	__emit(0xEE100A10); // fmrs   r0, s0
+	__emit(0xE12FFF1E); // bx     lr
+}
+#pragma warning(pop)
+#else
+#error lrint needed for MSVC on non X86/AMD64/ARM targets.
+#endif /* _M_X64 / _M_IX86 / _M_ARM */
+#endif /* _MSC_VER */
+
+/*!
+\brief Render an anti-aliased ellipse outline from a packed color value.
+
+\param renderer Target renderer.
+\param x Center X coordinate of the ellipse.
+\param y Center Y coordinate of the ellipse.
+\param rx Horizontal radius in pixels.
+\param ry Vertical radius in pixels.
+\param color Packed color; its four bytes are read in memory order as R, G, B, A.
+
+\returns The result of aaellipseRGBA(): 0 on success, -1 on failure.
+*/
+int aaellipseColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return aaellipseRGBA(renderer, x, y, rx, ry, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw anti-aliased ellipse with blending, as weighted pixel pairs mirrored about the center.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the aa-ellipse.
+\param y Y coordinate of the center of the aa-ellipse.
+\param rx Horizontal radius in pixels of the aa-ellipse (0 draws a vertical line or a single pixel).
+\param ry Vertical radius in pixels of the aa-ellipse (0 draws a horizontal line or a single pixel).
+\param r The red value of the aa-ellipse to draw.
+\param g The green value of the aa-ellipse to draw.
+\param b The blue value of the aa-ellipse to draw.
+\param a The alpha value of the aa-ellipse to draw.
+
+\returns Returns 0 on success, -1 on failure; a negative radius fails immediately.
+*/
+int aaellipseRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	int result;
+	int i;
+	int a2, b2, ds, dt, dxt, t, s, d;
+	Sint16 xp, yp, xs, ys, dyt, od, xx, yy, xc2, yc2;
+	float cp;
+	double sab;
+	Uint8 weight, iweight;
+
+	/*
+	* Sanity check radii: a negative radius is rejected
+	*/
+	if ((rx < 0) || (ry < 0)) {
+		return (-1);
+	}
+
+	/*
+	* Special cases for rx=0 and/or ry=0: draw a hline/vline/pixel
+	*/
+	if (rx == 0) {
+		if (ry == 0) {
+			return (pixelRGBA(renderer, x, y, r, g, b, a));
+		} else {
+			return (vlineRGBA(renderer, x, y - ry, y + ry, r, g, b, a));
+		}
+	} else {
+		if (ry == 0) {
+			return (hlineRGBA(renderer, x - rx, x + rx, y, r, g, b, a));
+		}
+	}
+
+	/* Variable setup: squared radii, their doubles, and twice the center for mirroring */
+	a2 = rx * rx;
+	b2 = ry * ry;
+
+	ds = 2 * a2;
+	dt = 2 * b2;
+
+	xc2 = 2 * x;
+	yc2 = 2 * y;
+
+	sab = sqrt((double)(a2 + b2));
+	od = (Sint16)lrint(sab*0.01) + 1; /* introduce some overdraw */
+	dxt = (Sint16)lrint((double)a2 / sab) + od;
+
+	t = 0;
+	s = -2 * a2 * ry;
+	d = 0;
+
+	xp = x;
+	yp = y - ry;
+
+	/* Draw */
+	result = 0;
+	result |= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+
+	/* "End points": (x, y - ry) and its mirror (x, y + ry); xc2 - xp equals xp here, so each is plotted twice */
+	result |= pixelRGBA(renderer, xp, yp, r, g, b, a);
+	result |= pixelRGBA(renderer, xc2 - xp, yp, r, g, b, a);
+	result |= pixelRGBA(renderer, xp, yc2 - yp, r, g, b, a);
+	result |= pixelRGBA(renderer, xc2 - xp, yc2 - yp, r, g, b, a);
+
+	for (i = 1; i <= dxt; i++) { /* first pass: dxt iterations, xp moves one column per iteration */
+		xp--;
+		d += t - b2;
+
+		if (d >= 0)
+			ys = yp - 1;
+		else if ((d - s - a2) > 0) {
+			if ((2 * d - s - a2) >= 0)
+				ys = yp + 1;
+			else {
+				ys = yp;
+				yp++;
+				d -= s + a2;
+				s += ds;
+			}
+		} else {
+			yp++;
+			ys = yp + 1;
+			d -= s + a2;
+			s += ds;
+		}
+
+		t -= dt;
+
+		/* Calculate alpha: the ratio |d| / |s|, clamped to at most 1 */
+		if (s != 0) {
+			cp = (float) abs(d) / (float) abs(s);
+			if (cp > 1.0) {
+				cp = 1.0;
+			}
+		} else {
+			cp = 1.0;
+		}
+
+		/* Calculate weights: weight goes to the (xp, ys) pixel, iweight to the (xp, yp) pixel */
+		weight = (Uint8) (cp * 255);
+		iweight = 255 - weight;
+
+		/* Upper half: both pixels of the pair, each also mirrored about the center column */
+		xx = xc2 - xp;
+		result |= pixelRGBAWeight(renderer, xp, yp, r, g, b, a, iweight);
+		result |= pixelRGBAWeight(renderer, xx, yp, r, g, b, a, iweight);
+
+		result |= pixelRGBAWeight(renderer, xp, ys, r, g, b, a, weight);
+		result |= pixelRGBAWeight(renderer, xx, ys, r, g, b, a, weight);
+
+		/* Lower half: the same four pixels mirrored about the center row (yc2 - y) */
+		yy = yc2 - yp;
+		result |= pixelRGBAWeight(renderer, xp, yy, r, g, b, a, iweight);
+		result |= pixelRGBAWeight(renderer, xx, yy, r, g, b, a, iweight);
+
+		yy = yc2 - ys;
+		result |= pixelRGBAWeight(renderer, xp, yy, r, g, b, a, weight);
+		result |= pixelRGBAWeight(renderer, xx, yy, r, g, b, a, weight);
+	}
+
+	/* Replaces original approximation code dyt = abs(yp - yc); dyt is the iteration count of the second pass */
+	dyt = (Sint16)lrint((double)b2 / sab ) + od;    
+
+	for (i = 1; i <= dyt; i++) { /* second pass: yp moves one row per iteration */
+		yp++;
+		d -= s + a2;
+
+		if (d <= 0)
+			xs = xp + 1;
+		else if ((d + t - b2) < 0) {
+			if ((2 * d + t - b2) <= 0)
+				xs = xp - 1;
+			else {
+				xs = xp;
+				xp--;
+				d += t - b2;
+				t -= dt;
+			}
+		} else {
+			xp--;
+			xs = xp - 1;
+			d += t - b2;
+			t -= dt;
+		}
+
+		s += ds;
+
+		/* Calculate alpha: the ratio |d| / |t|, clamped to at most 1 */
+		if (t != 0) {
+			cp = (float) abs(d) / (float) abs(t);
+			if (cp > 1.0) {
+				cp = 1.0;
+			}
+		} else {
+			cp = 1.0;
+		}
+
+		/* Calculate weight: weight goes to the (xs, yp) pixel, iweight to the (xp, yp) pixel */
+		weight = (Uint8) (cp * 255);
+		iweight = 255 - weight;
+
+		/* Primary pixel (xp, yp) and its three mirror images, drawn with iweight */
+		xx = xc2 - xp;
+		yy = yc2 - yp;
+		result |= pixelRGBAWeight(renderer, xp, yp, r, g, b, a, iweight);
+		result |= pixelRGBAWeight(renderer, xx, yp, r, g, b, a, iweight);
+
+		result |= pixelRGBAWeight(renderer, xp, yy, r, g, b, a, iweight);
+		result |= pixelRGBAWeight(renderer, xx, yy, r, g, b, a, iweight);
+
+		/* Secondary pixel (xs, yp) and its three mirror images, drawn with weight */
+		xx = xc2 - xs;
+		result |= pixelRGBAWeight(renderer, xs, yp, r, g, b, a, weight);
+		result |= pixelRGBAWeight(renderer, xx, yp, r, g, b, a, weight);
+
+		result |= pixelRGBAWeight(renderer, xs, yy, r, g, b, a, weight);
+		result |= pixelRGBAWeight(renderer, xx, yy, r, g, b, a, weight);		
+	}
+
+	return (result);
+}
+
+/* ---- Filled Ellipse */
+
+/*!
+\brief Render a filled ellipse from a packed color value.
+
+\param renderer Target renderer.
+\param x Center X coordinate of the ellipse.
+\param y Center Y coordinate of the ellipse.
+\param rx Horizontal radius in pixels.
+\param ry Vertical radius in pixels.
+\param color Packed color; its four bytes are read in memory order as R, G, B, A.
+
+\returns The result of _ellipseRGBA() in fill mode: 0 on success, -1 on failure.
+*/
+int filledEllipseColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return _ellipseRGBA(renderer, x, y, rx, ry, rgba[0], rgba[1], rgba[2], rgba[3], 1);
+}
+
+/*!
+\brief Render a filled ellipse from separate color channels.
+
+\param renderer Target renderer.
+\param x Center X coordinate of the ellipse.
+\param y Center Y coordinate of the ellipse.
+\param rx Horizontal radius in pixels.
+\param ry Vertical radius in pixels.
+\param r Red channel of the draw color.
+\param g Green channel of the draw color.
+\param b Blue channel of the draw color.
+\param a Alpha channel of the draw color.
+
+\returns The result of _ellipseRGBA() in fill mode: 0 on success, -1 on failure.
+*/
+int filledEllipseRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	return (_ellipseRGBA(renderer, x, y, rx, ry, r, g, b, a, 1)); /* f = 1: filled */
+}
+
+/* ----- Pie */
+
+/*!
+\brief Internal float (low-speed) pie-calc implementation by drawing polygons.
+
+Note: Builds a vertex array and renders it with polygon or filledPolygon; equal angles after the modulo (e.g. 0 and 360) give one radius line.
+
+\param renderer The renderer to draw on.
+\param x X coordinate of the center of the pie.
+\param y Y coordinate of the center of the pie.
+\param rad Radius in pixels of the pie.
+\param start Starting angle in degrees of the pie (reduced modulo 360).
+\param end Ending angle in degrees of the pie (reduced modulo 360).
+\param r The red value of the pie to draw.
+\param g The green value of the pie to draw.
+\param b The blue value of the pie to draw.
+\param a The alpha value of the pie to draw.
+\param filled Flag indicating if the pie should be filled (=1) or not (=0).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+/* TODO: rewrite algorithm; pie is not always accurate */
+int _pieRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end,  Uint8 r, Uint8 g, Uint8 b, Uint8 a, Uint8 filled)
+{
+	int result;
+	double angle, start_angle, end_angle;
+	double deltaAngle;
+	double dr;
+	int numpoints, i;
+	Sint16 *vx, *vy;
+
+	/*
+	* Sanity check radii
+	*/
+	if (rad < 0) {
+		return (-1);
+	}
+
+	/*
+	* Fixup angles
+	*/
+	start = start % 360;
+	end = end % 360;
+
+	/*
+	* Special case for rad=0 - draw a point
+	*/
+	if (rad == 0) {
+		return (pixelRGBA(renderer, x, y, r, g, b, a));
+	}
+
+	/*
+	* Variable setup: degrees to radians, with an angular step of 3/rad
+	*/
+	dr = (double) rad;
+	deltaAngle = 3.0 / dr;
+	start_angle = (double) start *(2.0 * M_PI / 360.0);
+	end_angle = (double) end *(2.0 * M_PI / 360.0);
+	if (start > end) {
+		end_angle += (2.0 * M_PI);
+	}
+
+	/* We will always have at least 2 points */
+	numpoints = 2;
+
+	/* Count points (rather than calculating it) */
+	angle = start_angle;
+	while (angle < end_angle) {
+		angle += deltaAngle;
+		numpoints++;
+	}
+
+	/* Allocate combined vertex array: numpoints X values followed by numpoints Y values */
+	vx = vy = (Sint16 *) malloc(2 * sizeof(Sint16) * numpoints);
+	if (vx == NULL) {
+		return (-1);
+	}
+
+	/* Update point to start of vy */
+	vy += numpoints;
+
+	/* Center */
+	vx[0] = x;
+	vy[0] = y;
+
+	/* First vertex */
+	angle = start_angle;
+	vx[1] = x + (int) (dr * cos(angle));
+	vy[1] = y + (int) (dr * sin(angle));
+
+	if (numpoints<3)
+	{
+		result = lineRGBA(renderer, vx[0], vy[0], vx[1], vy[1], r, g, b, a);
+	}
+	else
+	{
+		/* Calculate other vertices; the index bound keeps every write inside the allocation */
+		i = 2;
+		angle = start_angle;
+		while ((i < numpoints) && (angle < end_angle)) {
+			angle += deltaAngle;
+			if (angle>end_angle)
+			{
+				angle = end_angle;
+			}
+			vx[i] = x + (int) (dr * cos(angle));
+			vy[i] = y + (int) (dr * sin(angle));
+			i++;
+		}
+
+		/* Draw the i vertices actually written (equal to numpoints when both passes agree) */
+		if (filled) {
+			result = filledPolygonRGBA(renderer, vx, vy, i, r, g, b, a);
+		} else {
+			result = polygonRGBA(renderer, vx, vy, i, r, g, b, a);
+		}
+	}
+
+	/* Free combined vertex array */
+	free(vx);
+
+	return (result);
+}
+
+/*!
+\brief Render a pie (sector) outline from a packed color value.
+
+\param renderer Target renderer.
+\param x Center X coordinate of the pie.
+\param y Center Y coordinate of the pie.
+\param rad Radius of the pie in pixels.
+\param start Starting angle of the pie in degrees.
+\param end Ending angle of the pie in degrees.
+\param color Packed color; its four bytes are read in memory order as R, G, B, A.
+
+\returns The result of _pieRGBA() in outline mode: 0 on success, -1 on failure.
+*/
+int pieColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad,
+	Sint16 start, Sint16 end, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return _pieRGBA(renderer, x, y, rad, start, end, rgba[0], rgba[1], rgba[2], rgba[3], 0);
+}
+
+/*!
+\brief Render a pie (sector) outline from separate color channels.
+
+\param renderer Target renderer.
+\param x Center X coordinate of the pie.
+\param y Center Y coordinate of the pie.
+\param rad Radius of the pie in pixels.
+\param start Starting angle of the pie in degrees.
+\param end Ending angle of the pie in degrees.
+\param r Red channel of the draw color.
+\param g Green channel of the draw color.
+\param b Blue channel of the draw color.
+\param a Alpha channel of the draw color.
+
+\returns The result of _pieRGBA() in outline mode: 0 on success, -1 on failure.
+*/
+int pieRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad,
+	Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	return (_pieRGBA(renderer, x, y, rad, start, end, r, g, b, a, 0)); /* filled = 0: outline */
+}
+
+/*!
+\brief Render a filled pie (sector) from a packed color value.
+
+\param renderer Target renderer.
+\param x Center X coordinate of the pie.
+\param y Center Y coordinate of the pie.
+\param rad Radius of the pie in pixels.
+\param start Starting angle of the pie in degrees.
+\param end Ending angle of the pie in degrees.
+\param color Packed color; its four bytes are read in memory order as R, G, B, A.
+
+\returns The result of _pieRGBA() in fill mode: 0 on success, -1 on failure.
+*/
+int filledPieColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return _pieRGBA(renderer, x, y, rad, start, end, rgba[0], rgba[1], rgba[2], rgba[3], 1);
+}
+
+/*!
+\brief Render a filled pie (sector) from separate color channels.
+
+\param renderer Target renderer.
+\param x Center X coordinate of the pie.
+\param y Center Y coordinate of the pie.
+\param rad Radius of the pie in pixels.
+\param start Starting angle of the pie in degrees.
+\param end Ending angle of the pie in degrees.
+\param r Red channel of the draw color.
+\param g Green channel of the draw color.
+\param b Blue channel of the draw color.
+\param a Alpha channel of the draw color.
+
+\returns The result of _pieRGBA() in fill mode: 0 on success, -1 on failure.
+*/
+int filledPieRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, Sint16 rad,
+	Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	return (_pieRGBA(renderer, x, y, rad, start, end, r, g, b, a, 1)); /* filled = 1 */
+}
+
+/* ------ Trigon */
+
+/*!
+\brief Render a triangle outline from a packed color value.
+
+Note: Packs the three corners into vertex arrays and hands them to polygonColor.
+
+\param renderer Target renderer.
+\param x1 X coordinate of the first corner.
+\param y1 Y coordinate of the first corner.
+\param x2 X coordinate of the second corner.
+\param y2 Y coordinate of the second corner.
+\param x3 X coordinate of the third corner.
+\param y3 Y coordinate of the third corner.
+\param color Packed color; its four bytes are read in memory order as R, G, B, A.
+
+\returns The result of polygonColor(): 0 on success, -1 on failure.
+*/
+int trigonColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color)
+{
+	Sint16 xs[3];
+	Sint16 ys[3];
+
+	/* First corner */
+	xs[0] = x1; ys[0] = y1;
+	/* Second corner */
+	xs[1] = x2; ys[1] = y2;
+	/* Third corner */
+	xs[2] = x3; ys[2] = y3;
+
+	return polygonColor(renderer, xs, ys, 3, color);
+}
+
+/*!
+\brief Render a triangle outline from separate color channels.
+
+\param renderer Target renderer.
+\param x1 X coordinate of the first corner.
+\param y1 Y coordinate of the first corner.
+\param x2 X coordinate of the second corner.
+\param y2 Y coordinate of the second corner.
+\param x3 X coordinate of the third corner.
+\param y3 Y coordinate of the third corner.
+\param r Red channel of the draw color.
+\param g Green channel of the draw color.
+\param b Blue channel of the draw color.
+\param a Alpha channel of the draw color.
+
+\returns The result of polygonRGBA(): 0 on success, -1 on failure.
+*/
+int trigonRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+	Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	Sint16 xs[3];
+	Sint16 ys[3];
+
+	/* First corner */
+	xs[0] = x1; ys[0] = y1;
+	/* Second corner */
+	xs[1] = x2; ys[1] = y2;
+	/* Third corner */
+	xs[2] = x3; ys[2] = y3;
+
+	return polygonRGBA(renderer, xs, ys, 3, r, g, b, a);
+}
+
+/* ------ AA-Trigon */
+
+/*!
+\brief Render an anti-aliased triangle outline from a packed color value.
+
+Note: Packs the three corners into vertex arrays and hands them to aapolygonColor.
+
+\param renderer Target renderer.
+\param x1 X coordinate of the first corner.
+\param y1 Y coordinate of the first corner.
+\param x2 X coordinate of the second corner.
+\param y2 Y coordinate of the second corner.
+\param x3 X coordinate of the third corner.
+\param y3 Y coordinate of the third corner.
+\param color Packed color; its four bytes are read in memory order as R, G, B, A.
+
+\returns The result of aapolygonColor(): 0 on success, -1 on failure.
+*/
+int aatrigonColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color)
+{
+	Sint16 xs[3];
+	Sint16 ys[3];
+
+	/* First corner */
+	xs[0] = x1; ys[0] = y1;
+	/* Second corner */
+	xs[1] = x2; ys[1] = y2;
+	/* Third corner */
+	xs[2] = x3; ys[2] = y3;
+
+	return aapolygonColor(renderer, xs, ys, 3, color);
+}
+
+/*!
+\brief Render an anti-aliased triangle outline from separate color channels.
+
+\param renderer Target renderer.
+\param x1 X coordinate of the first corner.
+\param y1 Y coordinate of the first corner.
+\param x2 X coordinate of the second corner.
+\param y2 Y coordinate of the second corner.
+\param x3 X coordinate of the third corner.
+\param y3 Y coordinate of the third corner.
+\param r Red channel of the draw color.
+\param g Green channel of the draw color.
+\param b Blue channel of the draw color.
+\param a Alpha channel of the draw color.
+
+\returns The result of aapolygonRGBA(): 0 on success, -1 on failure.
+*/
+int aatrigonRGBA(SDL_Renderer * renderer,  Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+	Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	Sint16 xs[3];
+	Sint16 ys[3];
+
+	/* First corner */
+	xs[0] = x1; ys[0] = y1;
+	/* Second corner */
+	xs[1] = x2; ys[1] = y2;
+	/* Third corner */
+	xs[2] = x3; ys[2] = y3;
+
+	return aapolygonRGBA(renderer, xs, ys, 3, r, g, b, a);
+}
+
+/* ------ Filled Trigon */
+
+/*!
+\brief Render a filled triangle from a packed color value.
+
+Note: Packs the three corners into vertex arrays and hands them to filledPolygonColor.
+
+\param renderer Target renderer.
+\param x1 X coordinate of the first corner.
+\param y1 Y coordinate of the first corner.
+\param x2 X coordinate of the second corner.
+\param y2 Y coordinate of the second corner.
+\param x3 X coordinate of the third corner.
+\param y3 Y coordinate of the third corner.
+\param color Packed color; its four bytes are read in memory order as R, G, B, A.
+
+\returns The result of filledPolygonColor(): 0 on success, -1 on failure.
+*/
+int filledTrigonColor(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color)
+{
+	Sint16 xs[3];
+	Sint16 ys[3];
+
+	/* First corner */
+	xs[0] = x1; ys[0] = y1;
+	/* Second corner */
+	xs[1] = x2; ys[1] = y2;
+	/* Third corner */
+	xs[2] = x3; ys[2] = y3;
+
+	return filledPolygonColor(renderer, xs, ys, 3, color);
+}
+
+/*!
+\brief Render a filled triangle from separate color channels.
+
+Note: Packs the three corners into vertex arrays and hands them to filledPolygonRGBA.
+
+\param renderer Target renderer.
+\param x1 X coordinate of the first corner.
+\param y1 Y coordinate of the first corner.
+\param x2 X coordinate of the second corner.
+\param y2 Y coordinate of the second corner.
+\param x3 X coordinate of the third corner.
+\param y3 Y coordinate of the third corner.
+\param r Red channel of the draw color.
+\param g Green channel of the draw color.
+\param b Blue channel of the draw color.
+\param a Alpha channel of the draw color.
+
+\returns The result of filledPolygonRGBA(): 0 on success, -1 on failure.
+*/
+int filledTrigonRGBA(SDL_Renderer * renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+	Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	Sint16 xs[3];
+	Sint16 ys[3];
+
+	/* First corner */
+	xs[0] = x1; ys[0] = y1;
+	/* Second corner */
+	xs[1] = x2; ys[1] = y2;
+	/* Third corner */
+	xs[2] = x3; ys[2] = y3;
+
+	return filledPolygonRGBA(renderer, xs, ys, 3, r, g, b, a);
+}
+
+/* ---- Polygon */
+
+/*!
+\brief Render a polygon outline from a packed color value.
+
+\param renderer Target renderer.
+\param vx Array holding the X coordinate of every vertex.
+\param vy Array holding the Y coordinate of every vertex.
+\param n Number of vertices in the arrays; at least 3.
+\param color Packed color; its four bytes are read in memory order as R, G, B, A.
+
+\returns The result of polygonRGBA(): 0 on success, -1 on failure.
+*/
+int polygonColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return polygonRGBA(renderer, vx, vy, n, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Render a closed polygon outline using the renderer's current color and blend mode.
+
+\param renderer Target renderer.
+\param vx Array holding the X coordinate of every vertex.
+\param vy Array holding the Y coordinate of every vertex.
+\param n Number of vertices in the arrays; at least 3.
+
+\returns Returns 0 on success, -1 on failure (NULL array, n below 3, or allocation failure).
+*/
+int polygon(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n)
+{
+	/* Scratch copy of the vertices in SDL_Point form */
+	SDL_Point *outline;
+	int status;
+	int count;
+	int k;
+
+	/*
+	* Both coordinate arrays are required
+	*/
+	if ((vx == NULL) || (vy == NULL)) {
+		return (-1);
+	}
+
+	/*
+	* Fewer than three vertices do not form a polygon
+	*/
+	if (n < 3) {
+		return (-1);
+	}
+
+	/*
+	* One extra slot holds a copy of the first vertex so the outline closes
+	*/
+	count = n + 1;
+	outline = (SDL_Point*)malloc(sizeof(SDL_Point) * count);
+	if (outline == NULL) {
+		return (-1);
+	}
+
+	/*
+	* Copy the Sint16 coordinates into the point array
+	*/
+	k = 0;
+	while (k < n) {
+		outline[k].x = vx[k];
+		outline[k].y = vy[k];
+		k++;
+	}
+	outline[n].x = vx[0];
+	outline[n].y = vy[0];
+
+	/*
+	* Draw the connected line strip in one call, then release the scratch array
+	*/
+	status = SDL_RenderDrawLines(renderer, outline, count);
+	free(outline);
+
+	return (status);
+}
+
+/*!
+\brief Draw polygon with alpha blending.
+
+Note: Sets the renderer's blend mode and draw color, then hands the outline to
+polygon(). Arguments are validated first, so a rejected call leaves the
+renderer state untouched.
+
+\param renderer The renderer to draw on.
+\param vx Vertex array containing X coordinates of the points of the polygon.
+\param vy Vertex array containing Y coordinates of the points of the polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param r The red value of the polygon to draw.
+\param g The green value of the polygon to draw.
+\param b The blue value of the polygon to draw.
+\param a The alpha value of the polygon to draw; 255 selects SDL_BLENDMODE_NONE.
+
+\returns Returns 0 on success, -1 on failure (NULL vertex array, fewer than 3 points,
+or a failing call further down).
+*/
+int polygonRGBA(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/*
+	* Status accumulator: each call below ORs its result in, so any failure
+	* makes the return value non-zero
+	*/
+	int result;
+
+	/*
+	* Vertex array NULL check (polygon() repeats it, but checking here keeps
+	* the renderer state unchanged for invalid input)
+	*/
+	if (vx == NULL) {
+		return (-1);
+	}
+	if (vy == NULL) {
+		return (-1);
+	}
+
+	/*
+	* Sanity check: fewer than three vertices do not form a polygon
+	*/
+	if (n < 3) {
+		return (-1);
+	}
+
+	/*
+	* Set color; blending is switched off for a fully opaque alpha
+	* (a == 255) and on otherwise
+	*/
+	result = 0;
+	result |= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result |= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+
+	/*
+	* Draw: polygon() builds the closed point list and renders it with the
+	* color and blend mode selected above
+	*/
+	result |= polygon(renderer, vx, vy, n);
+
+	return (result);
+}
+
+/* ---- AA-Polygon */
+
+/*!
+\brief Render an anti-aliased polygon outline from a packed color value.
+
+\param renderer Target renderer.
+\param vx Array holding the X coordinate of every vertex.
+\param vy Array holding the Y coordinate of every vertex.
+\param n Number of vertices in the arrays; at least 3.
+\param color Packed color; its four bytes are read in memory order as R, G, B, A.
+
+\returns The result of aapolygonRGBA(): 0 on success, -1 on failure.
+*/
+int aapolygonColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return aapolygonRGBA(renderer, vx, vy, n, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Render an anti-aliased polygon outline from separate color channels.
+
+Note: The outline is drawn edge by edge in vertex order with the anti-aliased
+line helper; a final edge from the last vertex back to the first closes it.
+
+\param renderer Target renderer.
+\param vx Array holding the X coordinate of every vertex.
+\param vy Array holding the Y coordinate of every vertex.
+\param n Number of vertices in the arrays; at least 3.
+\param r Red channel of the draw color.
+\param g Green channel of the draw color.
+\param b Blue channel of the draw color.
+\param a Alpha channel of the draw color.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int aapolygonRGBA(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	int status;
+	int cur;
+	int next;
+
+	/*
+	* Both coordinate arrays are required
+	*/
+	if (vx == NULL) {
+		return (-1);
+	}
+	if (vy == NULL) {
+		return (-1);
+	}
+
+	/*
+	* Fewer than three vertices do not form a polygon
+	*/
+	if (n < 3) {
+		return (-1);
+	}
+
+	/*
+	* Edges between consecutive vertices, in array order: (0,1), (1,2), ...
+	* up to (n-2, n-1)
+	*/
+	status = 0;
+	cur = 0;
+	next = 1;
+	while (next < n) {
+		status |= _aalineRGBA(renderer, vx[cur], vy[cur], vx[next], vy[next], r, g, b, a, 0);
+		cur = next;
+		next++;
+	}
+
+	/*
+	* Closing edge from the last vertex (cur == n-1 here) back to the
+	* first one
+	*/
+	status |= _aalineRGBA(renderer, vx[cur], vy[cur], vx[0], vy[0], r, g, b, a, 0);
+
+	return (status);
+}
+
+/* ---- Filled Polygon */
+
+/*!
+\brief Internal helper qsort callback function used in filled polygon drawing; orders two ints ascending.
+
+\param a Pointer to the first int to compare.
+\param b Pointer to the second int to compare.
+
+\returns Returns 0 if *a==*b, a negative number if *a<*b or a positive number if *a>*b.
+*/
+int _gfxPrimitivesCompareInt(const void *a, const void *b)
+{
+	return (*(const int *) a > *(const int *) b) - (*(const int *) a < *(const int *) b); /* no subtraction: a-b can overflow int */
+}
+
+/*!
+\brief Global scratch int array used by the polygon fill routines when no caller-supplied cache (polyInts/polyAllocated) is given.
+
+Note: Used for non-multithreaded (default) operation; it is read and written without any locking.
+*/
+static int *gfxPrimitivesPolyIntsGlobal = NULL;
+
+/*!
+\brief Number of ints currently allocated in the global scratch array (0 while nothing is allocated).
+
+Note: Used for non-multithreaded (default) operation; it is read and written without any locking.
+*/
+static int gfxPrimitivesPolyAllocatedGlobal = 0;
+
+/*!
+\brief Draw filled polygon with alpha blending (multi-threaded capable).
+
+Note: The last two parameters are optional; but are required for multithreaded operation.  
+
+\param renderer The renderer to draw on.
+\param vx Vertex array containing X coordinates of the points of the filled polygon.
+\param vy Vertex array containing Y coordinates of the points of the filled polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param r The red value of the filled polygon to draw. 
+\param g The green value of the filled polygon to draw. 
+\param b The blue value of the filled polygon to draw. 
+\param a The alpha value of the filled polygon to draw.
+\param polyInts Pointer to a caller-owned temporary int array used for sorting intersections; it is (re)allocated here as needed. Required for multithreaded operation; set to NULL otherwise.
+\param polyAllocated Pointer to the number of ints allocated in *polyInts (0 if none yet). Required for multithreaded operation; set to NULL otherwise.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledPolygonRGBAMT(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a, int **polyInts, int *polyAllocated)
+{
+	int result;
+	int i;
+	int y, xa, xb;
+	int miny, maxy;
+	int x1, y1;
+	int x2, y2;
+	int ind1, ind2;
+	int ints;
+	int *gfxPrimitivesPolyInts = NULL;
+	int *gfxPrimitivesPolyIntsNew = NULL;
+	int gfxPrimitivesPolyAllocated = 0;
+
+	/*
+	* Vertex array NULL check 
+	*/
+	if (vx == NULL) {
+		return (-1);
+	}
+	if (vy == NULL) {
+		return (-1);
+	}
+
+	/*
+	* Sanity check number of edges
+	*/
+	if (n < 3) {
+		return -1;
+	}
+
+	/*
+	* Map polygon cache  
+	*/
+	if ((polyInts==NULL) || (polyAllocated==NULL)) {
+		/* Use global cache */
+		gfxPrimitivesPolyInts = gfxPrimitivesPolyIntsGlobal;
+		gfxPrimitivesPolyAllocated = gfxPrimitivesPolyAllocatedGlobal;
+	} else {
+		/* Use local cache */
+		gfxPrimitivesPolyInts = *polyInts;
+		gfxPrimitivesPolyAllocated = *polyAllocated;
+	}
+
+	/*
+	* Allocate temp array, only grow array 
+	*/
+	if (!gfxPrimitivesPolyAllocated) {
+		gfxPrimitivesPolyInts = (int *) malloc(sizeof(int) * n);
+		gfxPrimitivesPolyAllocated = n;
+	} else {
+		if (gfxPrimitivesPolyAllocated < n) {
+			gfxPrimitivesPolyIntsNew = (int *) realloc(gfxPrimitivesPolyInts, sizeof(int) * n);
+			if (!gfxPrimitivesPolyIntsNew) {
+				/* realloc failed: the old block is still valid but too small for n ints, so release it */
+				free(gfxPrimitivesPolyInts);
+				gfxPrimitivesPolyInts = NULL;
+
+				gfxPrimitivesPolyAllocated = 0;
+			} else {
+				gfxPrimitivesPolyInts = gfxPrimitivesPolyIntsNew;
+				gfxPrimitivesPolyAllocated = n;
+			}
+		}
+	}
+
+	/*
+	* Check temp array (covers a failed malloc above)
+	*/
+	if (gfxPrimitivesPolyInts==NULL) {        
+		gfxPrimitivesPolyAllocated = 0;
+	}
+
+	/*
+	* Update cache variables
+	*/
+	if ((polyInts==NULL) || (polyAllocated==NULL)) { 
+		gfxPrimitivesPolyIntsGlobal =  gfxPrimitivesPolyInts;
+		gfxPrimitivesPolyAllocatedGlobal = gfxPrimitivesPolyAllocated;
+	} else {
+		*polyInts = gfxPrimitivesPolyInts;
+		*polyAllocated = gfxPrimitivesPolyAllocated;
+	}
+
+	/*
+	* Check temp array again
+	*/
+	if (gfxPrimitivesPolyInts==NULL) {        
+		return(-1);
+	}
+
+	/*
+	* Determine Y maxima 
+	*/
+	miny = vy[0];
+	maxy = vy[0];
+	for (i = 1; (i < n); i++) {
+		if (vy[i] < miny) {
+			miny = vy[i];
+		} else if (vy[i] > maxy) {
+			maxy = vy[i];
+		}
+	}
+
+	/*
+	* Draw, scanning y 
+	*/
+	result = 0;
+	for (y = miny; (y <= maxy); y++) {
+		ints = 0;
+		for (i = 0; (i < n); i++) {
+			if (!i) {
+				ind1 = n - 1;
+				ind2 = 0;
+			} else {
+				ind1 = i - 1;
+				ind2 = i;
+			}
+			y1 = vy[ind1];
+			y2 = vy[ind2];
+			if (y1 < y2) {
+				x1 = vx[ind1];
+				x2 = vx[ind2];
+			} else if (y1 > y2) {
+				y2 = vy[ind1];
+				y1 = vy[ind2];
+				x2 = vx[ind1];
+				x1 = vx[ind2];
+			} else {
+				continue;
+			}
+			if ( ((y >= y1) && (y < y2)) || ((y == maxy) && (y > y1) && (y <= y2)) ) {
+				gfxPrimitivesPolyInts[ints++] = ((65536 * (y - y1)) / (y2 - y1)) * (x2 - x1) + (65536 * x1); /* 16.16 fixed-point X of the edge at this scanline */
+			} 	    
+		}
+
+		qsort(gfxPrimitivesPolyInts, ints, sizeof(int), _gfxPrimitivesCompareInt);
+
+		/*
+		* Set color. 'result' is deliberately NOT reset here, so that
+		* failures from earlier scanlines are still reported to the caller.
+		*/
+		result |= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+		result |= SDL_SetRenderDrawColor(renderer, r, g, b, a);	
+
+		for (i = 0; (i < ints); i += 2) {
+			xa = gfxPrimitivesPolyInts[i] + 1;
+			xa = (xa >> 16) + ((xa & 32768) >> 15);
+			xb = gfxPrimitivesPolyInts[i+1] - 1;
+			xb = (xb >> 16) + ((xb & 32768) >> 15);
+			result |= hline(renderer, xa, xb, y);
+		}
+	}
+
+	return (result);
+}
+
+/*!
+\brief Draw a filled polygon, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param vx Array with the X coordinate of each polygon vertex.
+\param vy Array with the Y coordinate of each polygon vertex.
+\param n Number of vertices in vx/vy. Minimum number is 3.
+\param color Packed color; its four bytes are read in memory order as r, g, b, a.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledPolygonColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return filledPolygonRGBAMT(renderer, vx, vy, n, rgba[0], rgba[1], rgba[2], rgba[3], NULL, NULL);
+}
+
+/*!
+\brief Draw a filled polygon with alpha blending (single-threaded convenience form of filledPolygonRGBAMT).
+
+\param renderer The renderer to draw on.
+\param vx Array with the X coordinate of each polygon vertex.
+\param vy Array with the Y coordinate of each polygon vertex.
+\param n Number of vertices in vx/vy. Minimum number is 3.
+\param r The red value of the filled polygon to draw.
+\param g The green value of the filled polygon to draw.
+\param b The blue value of the filled polygon to draw.
+\param a The alpha value of the filled polygon to draw.
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledPolygonRGBA(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* NULL, NULL: no caller-supplied cache, so the global scratch array is used */
+	return filledPolygonRGBAMT(renderer, vx, vy, n, r, g, b, a, NULL, NULL);
+}
+
+/* ---- Textured Polygon */
+
+/*!
+\brief Internal function to draw a textured horizontal line.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point (i.e. left) of the line.
+\param x2 X coordinate of the second point (i.e. right) of the line.
+\param y Y coordinate of the points of the line.
+\param texture The texture to retrieve color information from.
+\param texture_w The width of the texture.
+\param texture_h The height of the texture.
+\param texture_dx The X offset for the texture lookup.
+\param texture_dy The Y offset for the texture lookup.
+
+\returns Returns 0 on success, a negative value if any SDL_RenderCopy call failed.
+*/
+int _HLineTextured(SDL_Renderer *renderer, Sint16 x1, Sint16 x2, Sint16 y, SDL_Texture *texture, int texture_w, int texture_h, int texture_dx, int texture_dy)
+{
+	Sint16 w;
+	Sint16 xtmp;
+	int result = 0;
+	int texture_x_walker;    
+	int texture_y_start;    
+	SDL_Rect source_rect,dst_rect;
+	int pixels_written,write_width;
+
+	/*
+	* Swap x1, x2 if required to ensure x1<=x2
+	*/
+	if (x1 > x2) {
+		xtmp = x1;
+		x1 = x2;
+		x2 = xtmp;
+	}
+
+	/*
+	* Calculate width to draw
+	*/
+	w = x2 - x1 + 1;
+
+	/*
+	* Determine where in the texture we start drawing (wrapped into 0..texture_w-1 / 0..texture_h-1)
+	*/
+	texture_x_walker =   (x1 - texture_dx)  % texture_w;
+	if (texture_x_walker < 0){
+		texture_x_walker = texture_w + texture_x_walker ;
+	}
+
+	texture_y_start = (y + texture_dy) % texture_h;
+	if (texture_y_start < 0){
+		texture_y_start = texture_h + texture_y_start;
+	}
+
+	/* setup the source rectangle; we are only drawing one horizontal line */
+	source_rect.y = texture_y_start;
+	source_rect.x = texture_x_walker;
+	source_rect.h = 1;
+
+	/* we will draw to the current y */
+	dst_rect.y = y;
+	dst_rect.h = 1;
+
+	/* if there are enough pixels left in the current row of the texture */
+	/* draw it all at once */
+	if (w <= texture_w -texture_x_walker){
+		source_rect.w = w;
+		source_rect.x = texture_x_walker;
+		dst_rect.x= x1;
+		dst_rect.w = source_rect.w;
+		result |= SDL_RenderCopy(renderer, texture, &source_rect, &dst_rect); /* 0 on success, negative on error */
+	} else { 
+		/* we need to draw multiple times */
+		/* draw the first segment */
+		pixels_written = texture_w  - texture_x_walker;
+		source_rect.w = pixels_written;
+		source_rect.x = texture_x_walker;
+		dst_rect.x= x1;
+		dst_rect.w = source_rect.w;
+		result |= SDL_RenderCopy(renderer, texture, &source_rect, &dst_rect);
+		write_width = texture_w;
+
+		/* now draw the rest */
+		/* set the source x to 0 */
+		source_rect.x = 0;
+		while (pixels_written < w){
+			if (write_width >= w - pixels_written) {
+				write_width =  w - pixels_written;
+			}
+			source_rect.w = write_width;
+			dst_rect.x = x1 + pixels_written;
+			dst_rect.w = source_rect.w;
+			result |= SDL_RenderCopy(renderer, texture, &source_rect, &dst_rect);
+			pixels_written += write_width;
+		}
+	}
+
+	return result;
+}
+
+/*!
+\brief Draws a polygon filled with the given texture (Multi-Threading Capable). 
+
+\param renderer The renderer to draw on.
+\param vx array of x vector components
+\param vy array of y vector components
+\param n the amount of vectors in the vx and vy array
+\param texture the sdl surface to use to fill the polygon
+\param texture_dx the offset of the texture relative to the screen. If you move the polygon 10 pixels 
+to the left and want the texture to appear the same you need to increase the texture_dx value
+\param texture_dy see texture_dx
+\param polyInts Preallocated temp array storage for vertex sorting (used for multi-threaded operation)
+\param polyAllocated Pointer to the number of ints allocated in the temp array (used for multi-threaded operation)
+
+\returns Returns -1 on an argument, allocation or texture-creation failure, otherwise the OR of all _HLineTextured() results.
+*/
+int texturedPolygonMT(SDL_Renderer *renderer, const Sint16 * vx, const Sint16 * vy, int n, 
+	SDL_Surface * texture, int texture_dx, int texture_dy, int **polyInts, int *polyAllocated)
+{
+	int result;
+	int i;
+	int y, xa, xb;
+	int miny, maxy;
+	int x1, y1;
+	int x2, y2;
+	int ind1, ind2;
+	int ints;
+	int *gfxPrimitivesPolyInts = NULL;
+	int *gfxPrimitivesPolyIntsTemp = NULL;
+	int gfxPrimitivesPolyAllocated = 0;
+	SDL_Texture *textureAsTexture = NULL;
+
+	/*
+	* Vertex array and texture NULL check (all three are dereferenced below)
+	*/
+	if ((vx == NULL) || (vy == NULL) || (texture == NULL)) {
+		return (-1);
+	}
+
+	/*
+	* Sanity check number of edges
+	*/
+	if (n < 3) {
+		return -1;
+	}
+
+	/*
+	* Map polygon cache  
+	*/
+	if ((polyInts==NULL) || (polyAllocated==NULL)) {
+		/* Use global cache */
+		gfxPrimitivesPolyInts = gfxPrimitivesPolyIntsGlobal;
+		gfxPrimitivesPolyAllocated = gfxPrimitivesPolyAllocatedGlobal;
+	} else {
+		/* Use local cache */
+		gfxPrimitivesPolyInts = *polyInts;
+		gfxPrimitivesPolyAllocated = *polyAllocated;
+	}
+
+	/*
+	* Allocate temp array, only grow array 
+	*/
+	if (!gfxPrimitivesPolyAllocated) {
+		gfxPrimitivesPolyInts = (int *) malloc(sizeof(int) * n);
+		gfxPrimitivesPolyAllocated = n;
+	} else {
+		if (gfxPrimitivesPolyAllocated < n) {
+			gfxPrimitivesPolyIntsTemp = (int *) realloc(gfxPrimitivesPolyInts, sizeof(int) * n);
+			if (gfxPrimitivesPolyIntsTemp == NULL) {
+				/* Realloc failed - keeps original memory block, but fails this operation */
+				return(-1);
+			}
+			gfxPrimitivesPolyInts = gfxPrimitivesPolyIntsTemp;
+			gfxPrimitivesPolyAllocated = n;
+		}
+	}
+
+	/*
+	* Check temp array (covers a failed malloc above)
+	*/
+	if (gfxPrimitivesPolyInts==NULL) {        
+		gfxPrimitivesPolyAllocated = 0;
+	}
+
+	/*
+	* Update cache variables
+	*/
+	if ((polyInts==NULL) || (polyAllocated==NULL)) { 
+		gfxPrimitivesPolyIntsGlobal =  gfxPrimitivesPolyInts;
+		gfxPrimitivesPolyAllocatedGlobal = gfxPrimitivesPolyAllocated;
+	} else {
+		*polyInts = gfxPrimitivesPolyInts;
+		*polyAllocated = gfxPrimitivesPolyAllocated;
+	}
+
+	/*
+	* Check temp array again
+	*/
+	if (gfxPrimitivesPolyInts==NULL) {        
+		return(-1);
+	}
+
+	/*
+	* Determine Y minima,maxima 
+	*/
+	miny = vy[0];
+	maxy = vy[0];
+	for (i = 1; (i < n); i++) {
+		if (vy[i] < miny) {
+			miny = vy[i];
+		} else if (vy[i] > maxy) {
+			maxy = vy[i];
+		}
+	}
+
+    /* Create texture for drawing */
+	textureAsTexture = SDL_CreateTextureFromSurface(renderer, texture);
+	if (textureAsTexture == NULL)
+	{
+		return -1;
+	}
+	SDL_SetTextureBlendMode(textureAsTexture, SDL_BLENDMODE_BLEND);
+	
+	/*
+	* Draw, scanning y 
+	*/
+	result = 0;
+	for (y = miny; (y <= maxy); y++) {
+		ints = 0;
+		for (i = 0; (i < n); i++) {
+			if (!i) {
+				ind1 = n - 1;
+				ind2 = 0;
+			} else {
+				ind1 = i - 1;
+				ind2 = i;
+			}
+			y1 = vy[ind1];
+			y2 = vy[ind2];
+			if (y1 < y2) {
+				x1 = vx[ind1];
+				x2 = vx[ind2];
+			} else if (y1 > y2) {
+				y2 = vy[ind1];
+				y1 = vy[ind2];
+				x2 = vx[ind1];
+				x1 = vx[ind2];
+			} else {
+				continue;
+			}
+			if ( ((y >= y1) && (y < y2)) || ((y == maxy) && (y > y1) && (y <= y2)) ) {
+				gfxPrimitivesPolyInts[ints++] = ((65536 * (y - y1)) / (y2 - y1)) * (x2 - x1) + (65536 * x1); /* 16.16 fixed-point X of the edge at this scanline */
+			} 
+		}
+
+		qsort(gfxPrimitivesPolyInts, ints, sizeof(int), _gfxPrimitivesCompareInt);
+
+		for (i = 0; (i < ints); i += 2) {
+			xa = gfxPrimitivesPolyInts[i] + 1;
+			xa = (xa >> 16) + ((xa & 32768) >> 15);
+			xb = gfxPrimitivesPolyInts[i+1] - 1;
+			xb = (xb >> 16) + ((xb & 32768) >> 15);
+			result |= _HLineTextured(renderer, xa, xb, y, textureAsTexture, texture->w, texture->h, texture_dx, texture_dy);
+		}
+	}
+
+	SDL_RenderPresent(renderer); /* NOTE(review): presents the frame from inside a drawing primitive - confirm this is intended */
+	SDL_DestroyTexture(textureAsTexture);
+
+	return (result);
+}
+
+/*!
+\brief Draws a polygon filled with the given texture. 
+
+This standard version calls the multithreaded version with NULL cache parameters.
+
+\param renderer The renderer to draw on.
+\param vx array of x vector components
+\param vy array of y vector components
+\param n the amount of vectors in the vx and vy array
+\param texture the sdl surface to use to fill the polygon
+\param texture_dx the offset of the texture relative to the screen. If you move the polygon 10 pixels 
+to the left and want the texture to appear the same you need to increase the texture_dx value
+\param texture_dy see texture_dx
+
+\returns Returns the result of texturedPolygonMT().
+*/
+int texturedPolygon(SDL_Renderer *renderer, const Sint16 * vx, const Sint16 * vy, int n, SDL_Surface *texture, int texture_dx, int texture_dy)
+{
+	int status;
+	/* NULL, NULL: no caller-supplied cache, so the global scratch array is used */
+	status = texturedPolygonMT(renderer, vx, vy, n, texture, texture_dx, texture_dy, NULL, NULL);
+	return (status);
+}
+
+/* ---- Character */
+
+/*!
+\brief Global cache for NxM pixel font textures created at runtime, indexed by character code (0..255).
+*/
+static SDL_Texture *gfxPrimitivesFont[256];
+
+/*!
+\brief Pointer to the current font data. Default is a 8x8 pixel internal font. 
+*/
+static const unsigned char *currentFontdata = gfxPrimitivesFontdata;
+
+/*!
+\brief Width of the current font in pixels. Default is 8. 
+*/
+static Uint32 charWidth = 8;
+
+/*!
+\brief Height of the current font in pixels. Default is 8. 
+*/
+static Uint32 charHeight = 8;
+
+/*!
+\brief Width for rendering. Autocalculated: equals charHeight when charRotation is 1 or 3, charWidth otherwise.
+*/
+static Uint32 charWidthLocal = 8;
+
+/*!
+\brief Height for rendering. Autocalculated: equals charWidth when charRotation is 1 or 3, charHeight otherwise.
+*/
+static Uint32 charHeightLocal = 8;
+
+/*!
+\brief Pitch of the current font in bytes (bytes per pixel row of one character). Default is 1. 
+*/
+static Uint32 charPitch = 1;
+
+/*!
+\brief Characters 90deg clockwise rotations. Default is 0. Max is 3. 
+*/
+static Uint32 charRotation = 0;
+
+/*!
+\brief Character data size in bytes of the current font (charPitch * charHeight). Default is 8. 
+*/
+static Uint32 charSize = 8;
+
+/*!
+\brief Sets or resets the current global font data.
+
+The font data array is organized as follows: 
+[fontdata] = [character 0][character 1]...[character 255] where
+[character n] = [byte 1 row 1][byte 2 row 1]...[byte {pitch} row 1][byte 1 row 2] ...[byte {pitch} row height] where
+[byte n] = 8 pixels of one row, most significant bit first, where
+[bit n] = [0 for transparent pixel|1 for colored pixel]
+
+\param fontdata Pointer to array of font data. Set to NULL, to reset global font to the default 8x8 font.
+\param cw Width of a character in pixels. Ignored if fontdata==NULL; 0 also selects the default font.
+\param ch Height of a character in pixels. Ignored if fontdata==NULL; 0 also selects the default font.
+*/
+void gfxPrimitivesSetFont(const void *fontdata, Uint32 cw, Uint32 ch)
+{
+	int slot;
+	int customFont;
+	int sideways;
+
+	/* A custom font needs data and a non-zero size; otherwise use the built-in 8x8 font */
+	customFont = (fontdata != NULL) && (cw != 0) && (ch != 0);
+	if (!customFont) {
+		currentFontdata = gfxPrimitivesFontdata;
+		charWidth = 8;
+		charHeight = 8;
+	} else {
+		currentFontdata = (unsigned char *)fontdata;
+		charWidth = cw;
+		charHeight = ch;
+	}
+
+	/* Bytes per pixel row (8 pixels per byte, rounded up) and bytes per character */
+	charPitch = (charWidth+7)/8;
+	charSize = charPitch * charHeight;
+
+	/* Quarter-turn rotations (1 and 3) swap the rendered width and height */
+	sideways = (charRotation==1) || (charRotation==3);
+	charWidthLocal = sideways ? charHeight : charWidth;
+	charHeightLocal = sideways ? charWidth : charHeight;
+
+	/* Drop every cached character texture so it gets rebuilt from the new font */
+	for (slot = 0; slot < 256; slot++) {
+		if (gfxPrimitivesFont[slot] == NULL) {
+			continue;
+		}
+
+		SDL_DestroyTexture(gfxPrimitivesFont[slot]);
+		gfxPrimitivesFont[slot] = NULL;
+	}
+}
+
+/*!
+\brief Sets the rotation applied to characters of the current global font.
+
+0 = no rotation (default), 1 = 90deg, 2 = 180deg, 3 = 270deg clockwise. Only the two lowest bits are used.
+If the rotation actually changes, the rendering size is updated and the character cache is cleared.
+
+\param rotation Number of 90deg clockwise steps to rotate
+*/
+void gfxPrimitivesSetFontRotation(Uint32 rotation)
+{
+	int slot;
+	int sideways;
+
+	/* Keep only the two low bits, i.e. a step count of 0..3 */
+	rotation &= 3;
+
+	/* Nothing to do when the rotation is unchanged; the cache stays valid */
+	if (charRotation == rotation)
+	{
+		return;
+	}
+
+	/* Store rotation */
+	charRotation = rotation;
+
+	/* Quarter-turn rotations (1 and 3) swap the rendered width and height */
+	sideways = (charRotation==1) || (charRotation==3);
+	charWidthLocal = sideways ? charHeight : charWidth;
+	charHeightLocal = sideways ? charWidth : charHeight;
+
+	/* Drop every cached character texture; they were rendered with the old rotation */
+	for (slot = 0; slot < 256; slot++) {
+		if (gfxPrimitivesFont[slot] == NULL) {
+			continue;
+		}
+		SDL_DestroyTexture(gfxPrimitivesFont[slot]);
+		gfxPrimitivesFont[slot] = NULL;
+	}
+}
+
+/*!
+\brief Draw a character of the currently set font, building and caching its texture on first use.
+
+\param renderer The Renderer to draw on.
+\param x X (horizontal) coordinate of the upper left corner of the character.
+\param y Y (vertical) coordinate of the upper left corner of the character.
+\param c The character to draw.
+\param r The red value of the character to draw. 
+\param g The green value of the character to draw. 
+\param b The blue value of the character to draw. 
+\param a The alpha value of the character to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int characterRGBA(SDL_Renderer *renderer, Sint16 x, Sint16 y, char c, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	SDL_Rect srect;
+	SDL_Rect drect;
+	int result;
+	Uint32 ix, iy;
+	const unsigned char *charpos;
+	Uint8 *curpos;
+	Uint8 patt, mask;
+	Uint8 *linepos;
+	Uint32 pitch;
+	SDL_Surface *character;
+	SDL_Surface *rotatedCharacter;
+	Uint32 ci;
+
+	/*
+	* Setup source rectangle
+	*/
+	srect.x = 0;
+	srect.y = 0;
+	srect.w = charWidthLocal;
+	srect.h = charHeightLocal;
+
+	/*
+	* Setup destination rectangle
+	*/
+	drect.x = x;
+	drect.y = y;
+	drect.w = charWidthLocal;
+	drect.h = charHeightLocal;
+
+	/* Character index in cache */
+	ci = (unsigned char) c; /* the cast keeps the index non-negative even where char is signed */
+
+	/*
+	* Create new charWidth x charHeight bitmap surface if not already present.
+	* Might get rotated later.
+	*/
+	if (gfxPrimitivesFont[ci] == NULL) {
+		/*
+		* Redraw character into surface
+		*/
+		character =	SDL_CreateRGBSurface(SDL_SWSURFACE,
+			charWidth, charHeight, 32,
+			0xFF000000, 0x00FF0000, 0x0000FF00, 0x000000FF);
+		if (character == NULL) {
+			return (-1);
+		}
+
+		charpos = currentFontdata + ci * charSize; /* first byte of this character's bitmap */
+				linepos = (Uint8 *)character->pixels;
+		pitch = character->pitch;
+
+		/*
+		* Drawing loop: one 32-bit pixel per font bit, white/opaque if set, 0 otherwise
+		*/
+		patt = 0;
+		for (iy = 0; iy < charHeight; iy++) {
+			mask = 0x00; /* forces a fresh font byte at the start of every row */
+			curpos = linepos;
+			for (ix = 0; ix < charWidth; ix++) {
+				if (!(mask >>= 1)) { /* mask exhausted: fetch the next font byte and restart at bit 7 */
+					patt = *charpos++;
+					mask = 0x80;
+				}
+				if (patt & mask) {
+					*(Uint32 *)curpos = 0xffffffff;
+				} else {
+					*(Uint32 *)curpos = 0;
+				}
+				curpos += 4;
+			}
+			linepos += pitch;
+		}
+
+		/* Maybe rotate and replace cached image */
+		if (charRotation>0)
+		{
+			rotatedCharacter = rotateSurface90Degrees(character, charRotation);
+			SDL_FreeSurface(character);
+			character = rotatedCharacter;
+		}
+
+		/* Convert temp surface into texture */
+		gfxPrimitivesFont[ci] = SDL_CreateTextureFromSurface(renderer, character); /* NOTE(review): cache is keyed by character only, not by renderer - confirm single-renderer use */
+		SDL_FreeSurface(character);
+
+		/*
+		* Check pointer 
+		*/
+		if (gfxPrimitivesFont[ci] == NULL) {
+			return (-1);
+		}
+	}
+
+	/*
+	* Set color (applied as color/alpha modulation of the cached texture)
+	*/
+	result = 0;
+	result |= SDL_SetTextureColorMod(gfxPrimitivesFont[ci], r, g, b);
+	result |= SDL_SetTextureAlphaMod(gfxPrimitivesFont[ci], a);
+
+	/*
+	* Draw texture onto destination 
+	*/
+	result |= SDL_RenderCopy(renderer, gfxPrimitivesFont[ci], &srect, &drect);
+
+	return (result);
+}
+
+
+/*!
+\brief Draw one character of the current font, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x X (horizontal) coordinate of the upper left corner of the character.
+\param y Y (vertical) coordinate of the upper left corner of the character.
+\param c The character to draw.
+\param color Packed color; its four bytes are read in memory order as r, g, b, a.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int characterColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, char c, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return characterRGBA(renderer, x, y, c, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+
+/*!
+\brief Draw a string in the currently set font, taking the color as one packed 32-bit value.
+
+The spacing between consecutive characters in the string is the fixed number of pixels 
+of the character width of the current global font.
+
+\param renderer The renderer to draw on.
+\param x X (horizontal) coordinate of the upper left corner of the string.
+\param y Y (vertical) coordinate of the upper left corner of the string.
+\param s The string to draw.
+\param color Packed color; its four bytes are read in memory order as r, g, b, a.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int stringColor(SDL_Renderer * renderer, Sint16 x, Sint16 y, const char *s, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return stringRGBA(renderer, x, y, s, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw a string in the currently set font.
+
+Each character advances the drawing position by one character cell; the direction
+depends on the current font rotation (0: +x, 1: +y, 2: -x, 3: -y).
+
+\param renderer The renderer to draw on.
+\param x X (horizontal) coordinate of the upper left corner of the string.
+\param y Y (vertical) coordinate of the upper left corner of the string.
+\param s The zero-terminated string to draw.
+\param r The red value of the string to draw.
+\param g The green value of the string to draw.
+\param b The blue value of the string to draw.
+\param a The alpha value of the string to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int stringRGBA(SDL_Renderer * renderer, Sint16 x, Sint16 y, const char *s, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	int status = 0;
+	Sint16 penX = x;
+	Sint16 penY = y;
+	const char *cursor;
+
+	/* Stop at the terminator, or as soon as one character could not be drawn */
+	for (cursor = s; (*cursor != '\0') && (status == 0); cursor++) {
+		status |= characterRGBA(renderer, penX, penY, *cursor, r, g, b, a);
+
+		/* Advance the pen along the direction given by the font rotation */
+		if (charRotation == 0) {
+			penX += charWidthLocal;
+		} else if (charRotation == 1) {
+			penY += charHeightLocal;
+		} else if (charRotation == 2) {
+			penX -= charWidthLocal;
+		} else if (charRotation == 3) {
+			penY -= charHeightLocal;
+		}
+	}
+
+	return (status);
+}
+
+/* ---- Bezier curve */
+
+/*!
+\brief Internal function to calculate bezier interpolator of data array with ndata values at position 't'.
+
+\param data Array of values (the control values of the curve).
+\param ndata Size of array.
+\param t Position for which to calculate interpolated value. t should be in the range [0, ndata).
+
+\returns Interpolated value at position t, data[0] when t<0, data[ndata-1] when t>=ndata.
+*/
+double _evaluateBezier (double *data, int ndata, double t) 
+{
+	double mu, result;
+	int n,k,kn,nn,nkn;
+	double blend,muk,munk;
+
+	/* Sanity check bounds */
+	if (t<0.0) {
+		return(data[0]);
+	}
+	if (t>=(double)ndata) {
+		return(data[ndata-1]);
+	}
+
+	/* Adjust t to the range 0.0 to 1.0 (mu < 1 here, so the division by 1-mu below is safe) */ 
+	mu=t/(double)ndata;
+
+	/* Calculate interpolate: sum over k of data[k] * C(n,k) * mu^k * (1-mu)^(n-k) */
+	n=ndata-1;
+	result=0.0;
+	muk = 1; /* running mu^k */
+	munk = pow(1-mu,(double)n); /* running (1-mu)^(n-k) */
+	for (k=0;k<=n;k++) {
+		nn = n;
+		kn = k;
+		nkn = n - k;
+		blend = muk * munk;
+		muk *= mu;
+		munk /= (1-mu);
+		while (nn >= 1) { /* multiplies blend by n! / (k! * (n-k)!), interleaved factor by factor */
+			blend *= nn;
+			nn--;
+			if (kn > 1) {
+				blend /= (double)kn;
+				kn--;
+			}
+			if (nkn > 1) {
+				blend /= (double)nkn;
+				nkn--;
+			}
+		}
+		result += data[k] * blend;
+	}
+
+	return (result);
+}
+
+/*!
+\brief Draw a bezier curve, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param vx Vertex array containing X coordinates of the points of the bezier curve.
+\param vy Vertex array containing Y coordinates of the points of the bezier curve.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param s Number of steps for the interpolation. Minimum number is 2.
+\param color Packed color; its four bytes are read in memory order as r, g, b, a.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int bezierColor(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, int s, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return bezierRGBA(renderer, vx, vy, n, s, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw a bezier curve with alpha blending, approximated by straight line segments.
+
+\param renderer The renderer to draw on.
+\param vx Vertex array containing X coordinates of the points of the bezier curve.
+\param vy Vertex array containing Y coordinates of the points of the bezier curve.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param s Number of steps for the interpolation. Minimum number is 2.
+\param r The red value of the bezier curve to draw. 
+\param g The green value of the bezier curve to draw. 
+\param b The blue value of the bezier curve to draw. 
+\param a The alpha value of the bezier curve to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int bezierRGBA(SDL_Renderer * renderer, const Sint16 * vx, const Sint16 * vy, int n, int s, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	int result;
+	int i;
+	double *x, *y, t, stepsize;
+	Sint16 x1, y1, x2, y2;
+
+	/*
+	* Sanity check (NOTE(review): vx and vy are not checked for NULL here)
+	*/
+	if (n < 3) {
+		return (-1);
+	}
+	if (s < 2) {
+		return (-1);
+	}
+
+	/*
+	* Variable setup: t advances by 1/s per drawn segment
+	*/
+	stepsize=(double)1.0/(double)s;
+
+	/* Transfer vertices into float arrays (n+1 entries: the first vertex is repeated at index n) */
+	if ((x=(double *)malloc(sizeof(double)*(n+1)))==NULL) {
+		return(-1);
+	}
+	if ((y=(double *)malloc(sizeof(double)*(n+1)))==NULL) {
+		free(x);
+		return(-1);
+	}    
+	for (i=0; i<n; i++) {
+		x[i]=(double)vx[i];
+		y[i]=(double)vy[i];
+	}      
+	x[n]=(double)vx[0];
+	y[n]=(double)vy[0];
+
+	/*
+	* Set color 
+	*/
+	result = 0;
+	result |= SDL_SetRenderDrawBlendMode(renderer, (a == 255) ? SDL_BLENDMODE_NONE : SDL_BLENDMODE_BLEND);
+	result |= SDL_SetRenderDrawColor(renderer, r, g, b, a);
+
+	/*
+	* Draw: n*s+1 segments, each from the previous evaluated point to the next
+	*/
+	t=0.0;
+	x1=(Sint16)lrint(_evaluateBezier(x,n+1,t));
+	y1=(Sint16)lrint(_evaluateBezier(y,n+1,t));
+	for (i = 0; i <= (n*s); i++) {
+		t += stepsize;
+		x2=(Sint16)_evaluateBezier(x,n,t); /* NOTE(review): n points and truncation here, but n+1 points and lrint() above - confirm intended */
+		y2=(Sint16)_evaluateBezier(y,n,t);
+		result |= line(renderer, x1, y1, x2, y2);
+		x1 = x2;
+		y1 = y2;
+	}
+
+	/* Clean up temporary array */
+	free(x);
+	free(y);
+
+	return (result);
+}
+
+
+/*!
+\brief Draw a thick line, taking the color as one packed 32-bit value.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the line.
+\param y1 Y coordinate of the first point of the line.
+\param x2 X coordinate of the second point of the line.
+\param y2 Y coordinate of the second point of the line.
+\param width Width of the line in pixels. Must be >0.
+\param color Packed color; its four bytes are read in memory order as r, g, b, a.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int thickLineColor(SDL_Renderer *renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 width, Uint32 color)
+{
+	const Uint8 *rgba = (const Uint8 *)&color;
+	return thickLineRGBA(renderer, x1, y1, x2, y2, width, rgba[0], rgba[1], rgba[2], rgba[3]);
+}
+
+/*!
+\brief Draw a thick line with alpha blending, rendered as a filled four-point polygon.
+
+\param renderer The renderer to draw on.
+\param x1 X coordinate of the first point of the line.
+\param y1 Y coordinate of the first point of the line.
+\param x2 X coordinate of the second point of the line.
+\param y2 Y coordinate of the second point of the line.
+\param width Width of the line in pixels. Must be >0.
+\param r The red value of the line to draw. 
+\param g The green value of the line to draw. 
+\param b The blue value of the line to draw. 
+\param a The alpha value of the line to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/	
+int thickLineRGBA(SDL_Renderer *renderer, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 width, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	int wh;
+	double dx, dy, dx1, dy1, dx2, dy2;
+	double l, wl2, nx, ny, ang, adj;
+	Sint16 px[4], py[4];
+
+	if (renderer == NULL) {
+		return -1;
+	}
+
+	if (width < 1) {
+		return -1;
+	}
+
+	/* Special case: thick "point" */
+	if ((x1 == x2) && (y1 == y2)) {
+		wh = width / 2;
+		return boxRGBA(renderer, x1 - wh, y1 - wh, x2 + width, y2 + width, r, g, b, a);		/* NOTE(review): spans -wh..+width, so the box is not centered on the point - confirm intended */
+	}
+
+	/* Special case: width == 1 */
+	if (width == 1) {
+		return lineRGBA(renderer, x1, y1, x2, y2, r, g, b, a);		
+	}
+
+	/* Calculate offsets for sides: (nx, ny) is the line direction scaled to about half the width (l > 0 here, the points differ) */
+	dx = (double)(x2 - x1);
+	dy = (double)(y2 - y1);
+	l = SDL_sqrt(dx*dx + dy*dy);
+	ang = SDL_atan2(dx, dy);
+	adj = 0.1 + 0.9 * SDL_fabs(SDL_cos(2.0 * ang)); /* 1.0 for axis-aligned lines, 0.1 for 45deg diagonals */
+	wl2 = ((double)width - adj)/(2.0 * l);
+	nx = dx * wl2;
+	ny = dy * wl2;
+
+	/* Build polygon: both end points shifted by +/-(ny, -nx), i.e. perpendicular to the line */
+	dx1 = (double)x1;
+	dy1 = (double)y1;
+	dx2 = (double)x2;
+	dy2 = (double)y2;
+	px[0] = (Sint16)(dx1 + ny);
+	px[1] = (Sint16)(dx1 - ny);
+	px[2] = (Sint16)(dx2 - ny);
+	px[3] = (Sint16)(dx2 + ny);
+	py[0] = (Sint16)(dy1 - nx);
+	py[1] = (Sint16)(dy1 + nx);
+	py[2] = (Sint16)(dy2 + nx);
+	py[3] = (Sint16)(dy2 - nx);
+
+	/* Draw polygon */
+	return filledPolygonRGBA(renderer, px, py, 4, r, g, b, a);
+}
diff --git a/lib/sdl2_gfx/src/SDL2_imageFilter.c b/lib/sdl2_gfx/src/SDL2_imageFilter.c
new file mode 100644
index 0000000..3590459
--- /dev/null
+++ b/lib/sdl2_gfx/src/SDL2_imageFilter.c
@@ -0,0 +1,7371 @@
+/*
+
+SDL2_imageFilter.c: byte-image "filter" routines
+
+Copyright (C) 2012-2014  Andreas Schiffler
+Copyright (C) 2013  Sylvain Beucler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+   1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would be
+   appreciated but is not required.
+
+   2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+
+   3. This notice may not be removed or altered from any source
+   distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+/*
+
+Note: Uses inline x86 MMX or ASM optimizations if available and enabled.
+
+Note: Most of the MMX code is based on published routines 
+by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to 
+him for his work.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "SDL.h"
+
+/* Use GCC intrinsics if available: they support both i386 and x86_64,
+   provide ASM-grade performances, and lift the PUSHA/POPA issues. */
+#ifdef __GNUC__
+#  ifdef USE_MMX
+#    include <mmintrin.h>
+#  endif
+#  include <SDL_cpuinfo.h>
+#endif
+
+#include "SDL2_imageFilter.h"
+
+/*!
+\brief Swaps the byte order in a 32bit integer (LSB becomes MSB, etc.). The argument is evaluated four times; presumably meant for unsigned 32bit values (confirm at call sites).
+*/
+#define SWAP_32(x) (((x) >> 24) | (((x) & 0x00ff0000) >> 8)  | (((x) & 0x0000ff00) << 8)  | ((x) << 24))
+
+/* ------ Static variables ----- */
+
+/*! 
+\brief Static state which enables the use of the MMX routines. Enabled by default; cleared by SDL_imageFilterMMXoff() and set by SDL_imageFilterMMXon().
+*/
+static int SDL_imageFilterUseMMX = 1;
+
+/* Detect GCC: GCC__ selects the intrinsics/GNU asm code paths instead of the __asm { } blocks */
+#if defined(__GNUC__)
+#define GCC__
+#endif
+
+/*!
+\brief MMX detection routine (with override flag). 
+
+\returns 1 of MMX was detected, 0 otherwise.
+*/
+int SDL_imageFilterMMXdetect(void)
+{
+	/* Check override flag */
+	if (SDL_imageFilterUseMMX == 0) {
+		return (0);
+	}
+
+    return SDL_HasMMX();
+}
+
+/*!
+\brief Disable MMX check for filter functions and force the use of the non-MMX C based code.
+*/
+void SDL_imageFilterMMXoff(void)
+{
+	SDL_imageFilterUseMMX = 0;	/* SDL_imageFilterMMXdetect() reports 0 from now on */
+}
+
+/*!
+\brief Enable MMX check for filter functions and use MMX code if available.
+*/
+void SDL_imageFilterMMXon(void)
+{
+	SDL_imageFilterUseMMX = 1;	/* SDL_imageFilterMMXdetect() checks the CPU again */
+}
+
+/* ------------------------------------------------------------------------------------ */
+
+/*!
+\brief Internal MMX Filter using Add: D = saturation255(S1 + S2) 
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1	/* load Src1 address into eax */
+			mov ebx, Src2	/* load Src2 address into ebx */
+			mov edi, Dest	/* load Dest address into edi */
+			mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time); the SrcLength%8 tail bytes are left to the caller */
+			align 16	/* 16 byte alignment of the loop entry */
+L1010:
+		movq mm1, [eax]	/* load 8 bytes from Src1 into mm1 */
+		paddusb mm1, [ebx]	/* mm1=Src1+Src2 (add 8 bytes with saturation) */
+		movq [edi], mm1	/* store result in Dest */
+			add eax, 8	/* increase Src1, Src2 and Dest  */
+			add ebx, 8	/* register pointers by 8 */
+			add edi, 8
+			dec ecx	/* decrease loop counter (tested after the body, so SrcLength must be >= 8) */
+			jnz L1010	/* check loop termination, proceed if required */
+			emms /* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: intrinsics version, processes SrcLength/8 groups of 8 bytes */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_paddusb(*mSrc1, *mSrc2);	/* Src1+Src2 (add 8 bytes with saturation) */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);	/* built without USE_MMX: nothing was written to Dest */
+#endif
+}
+
+/*!
+\brief Filter using Add: D = saturation255(S1 + S2) 
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Use the MMX routine when possible. It returns -1 without writing Dest when */
+	/* built without USE_MMX; in that case the C loop below does the whole buffer. */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterAddMMX(Src1, Src2, Dest, length) == 0)) {
+
+		/* Check for unaligned bytes (the MMX routine only handles full groups of 8) */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			cursrc2 = &Src2[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		cursrc2 = Src2;
+		curdst = Dest;
+	}
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 + (int) *cursrc2;
+		if (result > 255)
+			result = 255;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using Mean: D = S1/2 + S2/2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+\param Mask Mask array containing 8 bytes with 0x7F value.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength,
+						   unsigned char *Mask)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{ 
+		pusha
+			mov edx, Mask /* load Mask address into edx */
+			movq mm0, [edx] /* load Mask into mm0 */
+		mov eax, Src1 /* load Src1 address into eax */
+			mov ebx, Src2 /* load Src2 address into ebx */
+			mov edi, Dest /* load Dest address into edi */
+			mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
+			shr ecx, 3 	/* counter/8 (MMX loads 8 bytes at a time); the SrcLength%8 tail bytes are left to the caller */
+			align 16	/* 16 byte alignment of the loop entry */
+L21011:
+		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
+		movq mm2,  [ebx] 	/* load 8 bytes from Src2 into mm2 */
+		/* --- Byte shift via Word shift: the mask then clears the bit that crossed over from the neighbouring byte --- */
+		psrlw mm1, 1 	/* shift 4 WORDS of mm1 1 bit to the right */
+			psrlw mm2, 1 	/* shift 4 WORDS of mm2 1 bit to the right */
+			pand mm1, mm0   /* apply Mask to 8 BYTES of mm1 */
+			/* byte     0x0f, 0xdb, 0xc8 */
+			pand mm2, mm0   /* apply Mask to 8 BYTES of mm2 */
+			/* byte     0x0f, 0xdb, 0xd0 */
+			paddusb mm1,  mm2 	/* mm1=mm1+mm2 (add 8 bytes with saturation) */
+			movq [edi],  mm1 	/* store result in Dest */
+			add eax,  8 	/* increase Src1, Src2 and Dest  */
+			add ebx,  8 	/* register pointers by 8 */
+			add edi,  8
+			dec ecx 	/* decrease loop counter (tested after the body, so SrcLength must be >= 8) */
+			jnz L21011	/* check loop termination, proceed if required */
+			emms	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: intrinsics version, processes SrcLength/8 groups of 8 bytes */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 *mMask = (__m64*)Mask;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm1 = *mSrc1,
+		      mm2 = *mSrc2;
+		mm1 = _m_psrlwi(mm1, 1);	/* shift 4 WORDS of mm1 1 bit to the right */
+		mm2 = _m_psrlwi(mm2, 1);	/* shift 4 WORDS of mm2 1 bit to the right */
+		mm1 = _m_pand(mm1, *mMask);	/* apply Mask to 8 BYTES of mm1 */
+		mm2 = _m_pand(mm2, *mMask);	/* apply Mask to 8 BYTES of mm2 */
+		*mDest = _m_paddusb(mm1, mm2);	/* mm1+mm2 (add 8 bytes with saturation) */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();				/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);	/* built without USE_MMX: nothing was written to Dest */
+#endif
+}
+
+/*!
+\brief Filter using Mean: D = S1/2 + S2/2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* MMX routine; a -1 return (built without USE_MMX, Dest untouched) falls back to the C loop for all bytes */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask) == 0)) {
+
+		/* Check for unaligned bytes (the MMX routine only handles full groups of 8) */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			cursrc2 = &Src2[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		cursrc2 = Src2;
+		curdst = Dest;
+	}
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 / 2 + (int) *cursrc2 / 2;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using Sub: D = saturation0(S1 - S2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax,  Src1 	/* load Src1 address into eax */
+			mov ebx,  Src2 	/* load Src2 address into ebx */
+			mov edi,  Dest 	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); the SrcLength%8 tail bytes are left to the caller */
+			align 16 /* 16 byte alignment of the loop entry */
+L1012:
+		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
+		psubusb mm1,  [ebx] 	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
+		movq [edi],  mm1 	/* store result in Dest */
+			add eax, 8 	/* increase Src1, Src2 and Dest  */
+			add ebx, 8 	/* register pointers by 8 */
+			add edi, 8
+			dec ecx	/* decrease loop counter (tested after the body, so SrcLength must be >= 8) */
+			jnz L1012	/* check loop termination, proceed if required */
+			emms /* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: intrinsics version, processes SrcLength/8 groups of 8 bytes */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_psubusb(*mSrc1, *mSrc2);	/* Src1-Src2 (sub 8 bytes with saturation) */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);	/* built without USE_MMX: nothing was written to Dest */
+#endif
+}
+
+/*!
+\brief Filter using Sub: D = saturation0(S1 - S2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* MMX routine; a -1 return (built without USE_MMX, Dest untouched) falls back to the C loop for all bytes */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterSubMMX(Src1, Src2, Dest, length) == 0)) {
+
+		/* Check for unaligned bytes (the MMX routine only handles full groups of 8) */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			cursrc2 = &Src2[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		cursrc2 = Src2;
+		curdst = Dest;
+	}
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 - (int) *cursrc2;
+		if (result < 0)
+			result = 0;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using AbsDiff: D = | S1 - S2 |
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1  	/* load Src1 address into eax */
+			mov ebx, Src2 	/* load Src2 address into ebx */
+			mov edi, Dest 	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); the SrcLength%8 tail bytes are left to the caller */
+			align 16	/* 16 byte alignment of the loop entry */
+L1013:
+		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
+		movq mm2,  [ebx] 	/* load 8 bytes from Src2 into mm2 */
+		psubusb mm1,  [ebx] 	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
+		psubusb mm2,  [eax] 	/* mm2=Src2-Src1 (sub 8 bytes with saturation) */
+		por mm1,  mm2 	/* combine both mm2 and mm1 results: per byte one of them is 0, the other is |Src1-Src2| */
+			movq [edi],  mm1 	/* store result in Dest */
+			add eax, 8 	/* increase Src1, Src2 and Dest  */
+			add ebx, 8 	/* register pointers by 8 */
+			add edi, 8
+			dec ecx 	/* decrease loop counter (tested after the body, so SrcLength must be >= 8) */
+			jnz L1013    	/* check loop termination, proceed if required */
+			emms         /* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: intrinsics version, processes SrcLength/8 groups of 8 bytes */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm1 = _m_psubusb(*mSrc2, *mSrc1);	/* Src2-Src1 (sub 8 bytes with saturation) */
+		__m64 mm2 = _m_psubusb(*mSrc1, *mSrc2);	/* Src1-Src2 (sub 8 bytes with saturation) */
+		*mDest = _m_por(mm1, mm2);		/* combine both mm2 and mm1 results: per byte one of them is 0 */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);	/* built without USE_MMX: nothing was written to Dest */
+#endif
+}
+
+/*!
+\brief Filter using AbsDiff: D = | S1 - S2 |
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* MMX routine; a -1 return (built without USE_MMX, Dest untouched) falls back to the C loop for all bytes */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length) == 0)) {
+
+		/* Check for unaligned bytes (the MMX routine only handles full groups of 8) */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			cursrc2 = &Src2[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		cursrc2 = Src2;
+		curdst = Dest;
+	}
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		result = abs((int) *cursrc1 - (int) *cursrc2);
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using Mult: D = saturation255(S1 * S2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1   /* load Src1 address into eax */
+			mov ebx, Src2   /* load Src2 address into ebx */
+			mov edi, Dest   /* load Dest address into edi */
+			mov ecx, SrcLength   /* load loop counter (SIZE) into ecx */
+			shr ecx, 3   /* counter/8 (MMX loads 8 bytes at a time); the SrcLength%8 tail bytes are left to the caller */
+			pxor mm0, mm0   /* zero mm0 register */
+			align 16      	/* 16 byte alignment of the loop entry */
+L1014:
+		movq mm1, [eax]   /* load 8 bytes from Src1 into mm1 */
+		movq mm3, [ebx]   /* load 8 bytes from Src2 into mm3 */
+		movq mm2, mm1   /* copy mm1 into mm2 */
+			movq mm4, mm3   /* copy mm3 into mm4  */
+			punpcklbw mm1, mm0   /* unpack low  bytes of Src1 into words */
+			punpckhbw mm2, mm0   /* unpack high bytes of Src1 into words */
+			punpcklbw mm3, mm0   /* unpack low  bytes of Src2 into words */
+			punpckhbw mm4, mm0   /* unpack high bytes of Src2 into words */
+			pmullw mm1, mm3   /* mul low  bytes of Src1 and Src2  */
+			pmullw mm2, mm4   /* mul high bytes of Src1 and Src2 */
+			/* Take abs value of the results (signed words): products >= 0x8000 look negative and would be packed to 0 instead of 255 */
+			movq mm5, mm1   /* copy mm1 into mm5 */
+			movq mm6, mm2   /* copy mm2 into mm6 */
+			psraw mm5, 15   /* fill mm5 words with word sign bit */
+			psraw mm6, 15   /* fill mm6 words with word sign bit */
+			pxor mm1, mm5   /* take 1's compliment of only neg. words */
+			pxor mm2, mm6   /* take 1's compliment of only neg. words */
+			psubsw mm1, mm5   /* add 1 to only neg. words, W-(-1) or W-0 */
+			psubsw mm2, mm6   /* add 1 to only neg. words, W-(-1) or W-0 */
+			packuswb mm1, mm2   /* pack words back into bytes with saturation */
+			movq [edi], mm1   /* store result in Dest */
+			add eax, 8   /* increase Src1, Src2 and Dest  */
+			add ebx, 8   /* register pointers by 8 */
+			add edi, 8
+			dec ecx 	/* decrease loop counter (tested after the body, so SrcLength must be >= 8) */
+			jnz L1014	/* check loop termination, proceed if required */
+			emms /* exit MMX state */
+			popa
+	}
+#else
+	/* i386 ASM with constraints (disabled reference version, superseded by the intrinsics below): */
+	/* asm volatile ( */
+	/* 	"shr $3, %%ecx \n\t"	/\* counter/8 (MMX loads 8 bytes at a time) *\/ */
+	/* 	"pxor      %%mm0, %%mm0 \n\t"	/\* zero mm0 register *\/ */
+	/* 	".align 16       \n\t"	/\* 16 byte alignment of the loop entry *\/ */
+	/* 	"1: movq (%%eax), %%mm1 \n\t"     /\* load 8 bytes from Src1 into mm1 *\/ */
+	/* 	"movq    (%%ebx), %%mm3 \n\t"	/\* load 8 bytes from Src2 into mm3 *\/ */
+	/* 	"movq      %%mm1, %%mm2 \n\t"	/\* copy mm1 into mm2 *\/ */
+	/* 	"movq      %%mm3, %%mm4 \n\t"	/\* copy mm3 into mm4  *\/ */
+	/* 	"punpcklbw %%mm0, %%mm1 \n\t"	/\* unpack low  bytes of Src1 into words *\/ */
+	/* 	"punpckhbw %%mm0, %%mm2 \n\t"	/\* unpack high bytes of Src1 into words *\/ */
+	/* 	"punpcklbw %%mm0, %%mm3 \n\t"	/\* unpack low  bytes of Src2 into words *\/ */
+	/* 	"punpckhbw %%mm0, %%mm4 \n\t"	/\* unpack high bytes of Src2 into words *\/ */
+	/* 	"pmullw    %%mm3, %%mm1 \n\t"	/\* mul low  bytes of Src1 and Src2  *\/ */
+	/* 	"pmullw    %%mm4, %%mm2 \n\t"	/\* mul high bytes of Src1 and Src2 *\/ */
+	/* 	/\* Take abs value of the results (signed words) *\/ */
+	/* 	"movq      %%mm1, %%mm5 \n\t"	/\* copy mm1 into mm5 *\/ */
+	/* 	"movq      %%mm2, %%mm6 \n\t"	/\* copy mm2 into mm6 *\/ */
+	/* 	"psraw       $15, %%mm5 \n\t"	/\* fill mm5 words with word sign bit *\/ */
+	/* 	"psraw       $15, %%mm6 \n\t"	/\* fill mm6 words with word sign bit *\/ */
+	/* 	"pxor      %%mm5, %%mm1 \n\t"	/\* take 1's compliment of only neg. words *\/ */
+	/* 	"pxor      %%mm6, %%mm2 \n\t"	/\* take 1's compliment of only neg. words *\/ */
+	/* 	"psubsw    %%mm5, %%mm1 \n\t"	/\* add 1 to only neg. words, W-(-1) or W-0 *\/ */
+	/* 	"psubsw    %%mm6, %%mm2 \n\t"	/\* add 1 to only neg. words, W-(-1) or W-0 *\/ */
+	/* 	"packuswb  %%mm2, %%mm1 \n\t"	/\* pack words back into bytes with saturation *\/ */
+	/* 	"movq    %%mm1, (%%edi) \n\t"	/\* store result in Dest *\/ */
+	/* 	"add $8, %%eax \n\t"	/\* increase Src1, Src2 and Dest  *\/ */
+	/* 	"add $8, %%ebx \n\t"	/\* register pointers by 8 *\/ */
+	/* 	"add $8, %%edi \n\t" */
+	/* 	"dec %%ecx     \n\t"	/\* decrease loop counter *\/ */
+	/* 	"jnz 1b        \n\t"	/\* check loop termination, proceed if required *\/ */
+	/* 	"emms          \n\t"	/\* exit MMX state *\/ */
+	/* 	: "+a" (Src1),		/\* load Src1 address into rax, modified by the loop *\/ */
+	/* 	  "+b" (Src2),		/\* load Src2 address into rbx, modified by the loop *\/ */
+	/* 	  "+c" (SrcLength),	/\* load loop counter (SIZE) into rcx, modified by the loop *\/ */
+	/* 	  "+D" (Dest)		/\* load Dest address into rdi, modified by the loop *\/ */
+	/* 	: */
+	/* 	: "memory",		/\* *Dest is modified *\/ */
+        /*           "mm0","mm1","mm2","mm3","mm4","mm5","mm6"	/\* registers modified *\/ */
+	/* ); */
+
+	/* i386 and x86_64: intrinsics version, processes SrcLength/8 groups of 8 bytes */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0); /* zero mm0 register */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm1, mm2, mm3, mm4, mm5, mm6;
+		mm1 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+		mm2 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_punpcklbw(*mSrc2, mm0);	/* unpack low  bytes of Src2 into words */
+		mm4 = _m_punpckhbw(*mSrc2, mm0);	/* unpack high bytes of Src2 into words */
+		mm1 = _m_pmullw(mm1, mm3);		/* mul low  bytes of Src1 and Src2  */
+		mm2 = _m_pmullw(mm2, mm4);		/* mul high bytes of Src1 and Src2 */
+		mm5 = _m_psrawi(mm1, 15);		/* fill mm5 words with word sign bit (start of abs: keeps products >= 0x8000 from packing to 0) */
+		mm6 = _m_psrawi(mm2, 15);		/* fill mm6 words with word sign bit */
+		mm1 = _m_pxor(mm1, mm5);		/* take 1's compliment of only neg. words */
+		mm2 = _m_pxor(mm2, mm6);		/* take 1's compliment of only neg. words */
+		mm1 = _m_psubsw(mm1, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
+		mm2 = _m_psubsw(mm2, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
+		*mDest = _m_packuswb(mm1, mm2);		/* pack words back into bytes with saturation */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);	/* built without USE_MMX: nothing was written to Dest */
+#endif
+}
+
+/*!
+\brief Filter using Mult: D = saturation255(S1 * S2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* MMX routine; a -1 return (built without USE_MMX, Dest untouched) falls back to the C loop for all bytes */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterMultMMX(Src1, Src2, Dest, length) == 0)) {
+
+		/* Check for unaligned bytes (the MMX routine only handles full groups of 8) */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			cursrc2 = &Src2[istart];
+			curdst = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		cursrc2 = Src2;
+		curdst = Dest;
+	}
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		/* Saturating multiply. This matches the MMX routine, whose products are */
+		/* clamped to 255 when the 16-bit words are packed back into bytes. */
+
+		result = (int) *cursrc1 * (int) *cursrc2;
+		if (result > 255)
+			result = 255;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal ASM Filter using MultNor: D = S1 * S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov edx, Src1   /* load Src1 address into edx */
+			mov esi, Src2   /* load Src2 address into esi */
+			mov edi, Dest   /* load Dest address into edi */
+			mov ecx, SrcLength   /* load loop counter (SIZE) into ecx; one iteration per byte, no division by 8 here */
+			align 16 	/* 16 byte alignment of the loop entry */
+L10141:
+		mov al, [edx]   /* load a byte from Src1 */
+		mul [esi] 	/* mul with a byte from Src2 */
+		mov [edi], al   /* move a byte result to Dest (low 8 bits of the product only) */
+			inc edx 	/* increment Src1, Src2, Dest */
+			inc esi 		/* pointer registers by one */
+			inc edi
+			dec ecx	/* decrease loop counter (tested after the body, so SrcLength must be > 0) */
+			jnz L10141  	/* check loop termination, proceed if required */
+			popa
+	}
+#else
+	/* Note: ~5% gain on i386, less efficient than C on x86_64 */
+	/* Also depends on whether this function is static (?!) */
+	asm volatile (
+		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
+#  if defined(i386)
+		"1:mov  (%%edx), %%al \n\t"      /* load a byte from Src1 */
+		"mulb (%%esi)       \n\t"	/* mul with a byte from Src2 */
+		"mov %%al, (%%edi)  \n\t"       /* move a byte result to Dest (low 8 bits of the product only) */
+		"inc %%edx \n\t"		/* increment Src1, Src2, Dest */
+		"inc %%esi \n\t"		/* pointer registers by one */
+		"inc %%edi \n\t"
+		"dec %%ecx      \n\t"	/* decrease loop counter */
+#  elif defined(__x86_64__)
+		"1:mov  (%%rdx), %%al \n\t"      /* load a byte from Src1 */
+		"mulb (%%rsi)       \n\t"	/* mul with a byte from Src2 */
+		"mov %%al, (%%rdi)  \n\t"       /* move a byte result to Dest (low 8 bits of the product only) */
+		"inc %%rdx \n\t"		/* increment Src1, Src2, Dest */
+		"inc %%rsi \n\t"		/* pointer registers by one */
+		"inc %%rdi \n\t"
+		"dec %%rcx      \n\t"	/* decrease loop counter; NOTE(review): SrcLength is a 32-bit operand - confirm the upper half of rcx is guaranteed zero */
+#  endif
+		"jnz 1b         \n\t"	/* check loop termination, proceed if required (tested after the body, so SrcLength must be > 0) */
+		: "+d" (Src1),		/* load Src1 address into edx */
+		  "+S" (Src2),		/* load Src2 address into esi */
+		  "+c" (SrcLength),	/* load loop counter (SIZE) into ecx */
+		  "+D" (Dest)		/* load Dest address into edi */
+		:
+		: "memory", "rax"
+		);
+#endif
+	return (0);
+#else
+	return (-1);	/* built without USE_MMX: nothing was written to Dest */
+#endif
+}
+
+/*!
+\brief Filter using MultNor: D = S1 * S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	if (SDL_imageFilterMMXdetect()) {
+		/*
+		 * ASM routine. Unlike the MMX filters it works byte by byte and
+		 * covers all 'length' bytes, so no tail is left over for the C
+		 * loop. The C loop must not be run over the last (length & 7)
+		 * bytes afterwards: that would multiply those bytes a second
+		 * time whenever Dest aliases Src1 or Src2.
+		 */
+		if (SDL_imageFilterMultNorASM(Src1, Src2, Dest, length) == 0) {
+			/* All bytes written - we are done */
+			return (0);
+		}
+		/*
+		 * -1: the routine was built without USE_MMX and wrote nothing.
+		 * Fall through so the C loop processes the whole buffer instead
+		 * of leaving everything but the tail untouched.
+		 */
+	}
+
+	/* Setup to process whole image */
+	cursrc1 = Src1;
+	cursrc2 = Src2;
+	curdst = Dest;
+
+	/* C routine to process image */
+	for (i = 0; i < length; i++) {
+		/*
+		 * Keep only the low 8 bits of the product (no saturation), like
+		 * the ASM routine which stores AL after the byte multiply.
+		 */
+		*curdst = (unsigned char) ((int) *cursrc1 * (int) *cursrc2);
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using MultDivby2: D = saturation255(S1/2 * S2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{ 
+		pusha
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov ebx, Src2   	/* load Src2 address into ebx */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); the SrcLength%8 tail bytes are left to the caller */
+			pxor mm0,  mm0 	/* zero mm0 register */
+			align 16          	/* 16 byte alignment of the loop entry */
+L1015:
+		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
+		movq mm3,  [ebx] 	/* load 8 bytes from Src2 into mm3 */
+		movq mm2,  mm1 	/* copy mm1 into mm2 */
+			movq mm4,  mm3 	/* copy mm3 into mm4  */
+			punpcklbw mm1,  mm0 	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm2,  mm0 	/* unpack high bytes of Src1 into words */
+			punpcklbw mm3,  mm0 	/* unpack low  bytes of Src2 into words */
+			punpckhbw mm4,  mm0 	/* unpack high bytes of Src2 into words */
+			psrlw mm1,  1 	/* divide mm1 words by 2, Src1 low bytes */
+			psrlw mm2,  1 	/* divide mm2 words by 2, Src1 high bytes */
+			pmullw mm1,  mm3 	/* mul low  bytes of Src1 and Src2 (at most 127*255, so the word never looks negative) */
+			pmullw mm2,  mm4 	/* mul high bytes of Src1 and Src2 */
+			packuswb mm1,  mm2 	/* pack words back into bytes with saturation */
+			movq [edi],  mm1 	/* store result in Dest */
+			add eax,  8 	/* increase Src1, Src2 and Dest  */
+			add ebx,  8 	/* register pointers by 8 */
+			add edi,  8
+			dec ecx        	/* decrease loop counter (tested after the body, so SrcLength must be >= 8) */
+			jnz L1015       	/* check loop termination, proceed if required */
+			emms             	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: intrinsics version, processes SrcLength/8 groups of 8 bytes */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0); /* zero mm0 register */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm1, mm2, mm3, mm4;
+		mm1 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+		mm2 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_punpcklbw(*mSrc2, mm0);	/* unpack low  bytes of Src2 into words */
+		mm4 = _m_punpckhbw(*mSrc2, mm0);	/* unpack high bytes of Src2 into words */
+		mm1 = _m_psrlwi(mm1, 1);		/* divide mm1 words by 2, Src1 low bytes */
+		mm2 = _m_psrlwi(mm2, 1);		/* divide mm2 words by 2, Src1 high bytes */
+		mm1 = _m_pmullw(mm1, mm3);		/* mul low  bytes of Src1 and Src2 (at most 127*255, so the word never looks negative) */
+		mm2 = _m_pmullw(mm2, mm4);		/* mul high bytes of Src1 and Src2 */
+		*mDest = _m_packuswb(mm1, mm2);		/* pack words back into bytes with saturation */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);	/* built without USE_MMX: nothing was written to Dest */
+#endif
+}
+
+/*!
+\brief Filter using MultDivby2: D = saturation255(S1/2 * S2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays (0 is accepted and does nothing).
+
+\return Returns 0 for success or -1 for error (NULL pointer argument).
+*/
+int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Default: the C routine below processes the whole image */
+	istart = 0;
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+		/*
+		 * MMX routine: processes whole groups of 8 bytes only. It returns -1
+		 * and leaves Dest untouched when it was compiled without USE_MMX,
+		 * so the bytes it covers are skipped only if it reports success.
+		 */
+		if (SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length) == 0) {
+			/* No unaligned bytes - we are done */
+			if ((length & 7) == 0)
+				return (0);
+			/* Only the unaligned bytes at the end are left */
+			istart = length & 0xfffffff8;
+		}
+	}
+
+	/* Setup to process the remaining bytes */
+	cursrc1 = &Src1[istart];
+	cursrc2 = &Src2[istart];
+	curdst = &Dest[istart];
+
+	/* C routine to process image: halve S1, multiply by S2, clamp to 255 */
+	for (i = istart; i < length; i++) {
+		result = ((int) *cursrc1 / 2) * (int) *cursrc2;
+		if (result > 255)
+			result = 255;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using MultDivby4: D = saturation255(S1/2 * S2/2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays; only whole groups of 8 bytes are processed.
+
+\return Returns 0 for success or -1 when compiled without USE_MMX (nothing is processed then).
+*/
+static int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)	/* GCC__ not defined: __asm block implementation */
+	__asm
+	{
+		pusha	/* save the general purpose registers */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov ebx, Src2   	/* load Src2 address into ebx */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); must not become 0, the loop tests it only after the body */
+			pxor mm0, mm0   	/* zero mm0 register */
+			align 16          	/* 16 byte alignment of the loop entry */
+L1016:
+		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
+		movq mm3, [ebx]   	/* load 8 bytes from Src2 into mm3 */
+		movq mm2, mm1   	/* copy mm1 into mm2 */
+			movq mm4, mm3   	/* copy mm3 into mm4  */
+			punpcklbw mm1, mm0   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm2, mm0   	/* unpack high bytes of Src1 into words */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of Src2 into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of Src2 into words */
+			psrlw mm1, 1   	/* divide mm1 words by 2, Src1 low bytes */
+			psrlw mm2, 1   	/* divide mm2 words by 2, Src1 high bytes */
+			psrlw mm3, 1   	/* divide mm3 words by 2, Src2 low bytes */
+			psrlw mm4, 1   	/* divide mm4 words by 2, Src2 high bytes */
+			pmullw mm1, mm3   	/* mul low  bytes of Src1/2 and Src2/2 (low 16 bits of each product) */
+			pmullw mm2, mm4   	/* mul high bytes of Src1/2 and Src2/2 (low 16 bits of each product) */
+			packuswb mm1, mm2   	/* pack words back into bytes with saturation */
+			movq [edi], mm1   	/* store result in Dest */
+			add eax, 8   	/* increase Src1, Src2 and Dest  */
+			add ebx, 8   	/* register pointers by 8 */
+			add edi,  8 	/* (Dest pointer) */
+			dec ecx        	/* decrease loop counter */
+			jnz L1016       	/* check loop termination, proceed if required */
+			emms             	/* exit MMX state */
+			popa	/* restore the general purpose registers */
+	}
+#else	/* GCC__ defined: MMX intrinsics implementation */
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0); /* zero mm0 register */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {	/* one iteration per whole group of 8 bytes */
+		__m64 mm1, mm2, mm3, mm4, mm5, mm6;	/* mm5 and mm6 are not used */
+		mm1 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+		mm2 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_punpcklbw(*mSrc2, mm0);	/* unpack low  bytes of Src2 into words */
+		mm4 = _m_punpckhbw(*mSrc2, mm0);	/* unpack high bytes of Src2 into words */
+		mm1 = _m_psrlwi(mm1, 1);		/* divide mm1 words by 2, Src1 low bytes */
+		mm2 = _m_psrlwi(mm2, 1);		/* divide mm2 words by 2, Src1 high bytes */
+		mm3 = _m_psrlwi(mm3, 1);		/* divide mm3 words by 2, Src2 low bytes */
+		mm4 = _m_psrlwi(mm4, 1);		/* divide mm4 words by 2, Src2 high bytes */
+		mm1 = _m_pmullw(mm1, mm3);		/* mul low  bytes of Src1/2 and Src2/2  */
+		mm2 = _m_pmullw(mm2, mm4);		/* mul high bytes of Src1/2 and Src2/2 */
+		*mDest = _m_packuswb(mm1, mm2);		/* pack words back into bytes with saturation */
+		mSrc1++;	/* advance all three pointers by 8 bytes */
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif	/* GCC__ */
+	return (0);	/* whole groups of 8 bytes processed; any trailing bytes are left to the caller */
+#else	/* USE_MMX not defined */
+	return (-1);	/* no MMX code compiled in: nothing was processed */
+#endif	/* USE_MMX */
+}
+
+/*!
+\brief Filter using MultDivby4: D = saturation255(S1/2 * S2/2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays (0 is accepted and does nothing).
+
+\return Returns 0 for success or -1 for error (NULL pointer argument).
+*/
+int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Default: the C routine below processes the whole image */
+	istart = 0;
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+		/*
+		 * MMX routine: processes whole groups of 8 bytes only. It returns -1
+		 * and leaves Dest untouched when it was compiled without USE_MMX,
+		 * so the bytes it covers are skipped only if it reports success.
+		 */
+		if (SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length) == 0) {
+			/* No unaligned bytes - we are done */
+			if ((length & 7) == 0)
+				return (0);
+			/* Only the unaligned bytes at the end are left */
+			istart = length & 0xfffffff8;
+		}
+	}
+
+	/* Setup to process the remaining bytes */
+	cursrc1 = &Src1[istart];
+	cursrc2 = &Src2[istart];
+	curdst = &Dest[istart];
+
+	/* C routine to process image: halve both sources, multiply, clamp to 255 */
+	for (i = istart; i < length; i++) {
+		result = ((int) *cursrc1 / 2) * ((int) *cursrc2 / 2);
+		if (result > 255)
+			result = 255;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using BitAnd: D = S1 & S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays; only whole groups of 8 bytes are processed.
+
+\return Returns 0 for success or -1 when compiled without USE_MMX (nothing is processed then).
+*/
+static int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)	/* GCC__ not defined: __asm block implementation */
+	__asm
+	{
+		pusha	/* save the general purpose registers */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov ebx, Src2   	/* load Src2 address into ebx */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3 	/* counter/8 (MMX loads 8 bytes at a time); must not become 0, the loop tests it only after the body */
+			align 16          	/* 16 byte alignment of the loop entry */
+L1017:
+		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
+		pand mm1, [ebx]   	/* mm1=Src1&Src2 */
+		movq [edi], mm1   	/* store result in Dest */
+			add eax, 8   	/* increase Src1, Src2 and Dest  */
+			add ebx, 8   	/* register pointers by 8 */
+			add edi, 8 	/* (Dest pointer) */
+			dec ecx        	/* decrease loop counter */
+			jnz L1017       	/* check loop termination, proceed if required */
+			emms             	/* exit MMX state */
+			popa	/* restore the general purpose registers */
+	}
+#else	/* GCC__ defined: MMX intrinsics implementation */
+	/* Disabled alternative, kept for reference only - x86_64 ASM with constraints: */
+	/* asm volatile ( */
+	/* 	"shr $3, %%rcx \n\t"	/\* counter/8 (MMX loads 8 bytes at a time) *\/ */
+	/* 	".align 16       \n\t"	/\* 16 byte alignment of the loop entry *\/ */
+	/* 	"1: movq (%%rax), %%mm1 \n\t"	/\* load 8 bytes from Src1 into mm1 *\/ */
+	/* 	"pand    (%%rbx), %%mm1 \n\t"	/\* mm1=Src1&Src2 *\/ */
+	/* 	"movq    %%mm1, (%%rdi) \n\t"	/\* store result in Dest *\/ */
+	/* 	"add $8, %%rax \n\t"	/\* increase Src1, Src2 and Dest  *\/ */
+	/* 	"add $8, %%rbx \n\t"	/\* register pointers by 8 *\/ */
+	/* 	"add $8, %%rdi \n\t" */
+	/* 	"dec %%rcx     \n\t"	/\* decrease loop counter *\/ */
+	/* 	"jnz 1b        \n\t"	/\* check loop termination, proceed if required *\/ */
+	/* 	"emms          \n\t"	/\* exit MMX state *\/ */
+	/* 	: "+a" (Src1),		/\* load Src1 address into rax, modified by the loop *\/ */
+	/* 	  "+b" (Src2),		/\* load Src2 address into rbx, modified by the loop *\/ */
+	/* 	  "+c" (SrcLength),	/\* load loop counter (SIZE) into rcx, modified by the loop *\/ */
+	/* 	  "+D" (Dest)		/\* load Dest address into rdi, modified by the loop *\/ */
+	/* 	: */
+	/* 	: "memory",		/\* *Dest is modified *\/ */
+        /*           "mm1"			/\* register mm1 modified *\/ */
+	/* ); */
+
+	/* Active implementation for i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {	/* one iteration per whole group of 8 bytes */
+		*mDest = _m_pand(*mSrc1, *mSrc2);	/* Src1&Src2 */
+		mSrc1++;	/* advance all three pointers by 8 bytes */
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif	/* GCC__ */
+	return (0);	/* whole groups of 8 bytes processed; any trailing bytes are left to the caller */
+#else	/* USE_MMX not defined */
+	return (-1);	/* no MMX code compiled in: nothing was processed */
+#endif	/* USE_MMX */
+}
+
+/*!
+\brief Filter using BitAnd: D = S1 & S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays (0 is accepted and does nothing).
+
+\return Returns 0 for success or -1 for error (NULL pointer argument).
+*/
+int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Default: the C routine below processes the whole image */
+	istart = 0;
+
+	if ((SDL_imageFilterMMXdetect()>0) && (length>7)) {
+		/*
+		 * Call MMX routine: it processes whole groups of 8 bytes
+		 * only. It returns -1 and leaves Dest untouched when it
+		 * was compiled without USE_MMX, so the bytes it covers
+		 * are skipped only if it reports success.
+		 */
+		if (SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length) == 0) {
+
+			/* No unaligned bytes - we are done */
+			if ((length & 7) == 0)
+				return (0);
+
+			/* Only the unaligned bytes at the end are left */
+			istart = length & 0xfffffff8;
+		}
+	}
+
+	/* Setup to process the remaining bytes */
+	cursrc1 = &Src1[istart];
+	cursrc2 = &Src2[istart];
+	curdst = &Dest[istart];
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		*curdst = (*cursrc1) & (*cursrc2);
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using BitOr: D = S1 | S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays; only whole groups of 8 bytes are processed.
+
+\return Returns 0 for success or -1 when compiled without USE_MMX (nothing is processed then).
+*/
+static int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)	/* GCC__ not defined: __asm block implementation */
+	__asm
+	{
+		pusha	/* save the general purpose registers */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov ebx, Src2   	/* load Src2 address into ebx */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); must not become 0, the loop tests it only after the body */
+			align 16          	/* 16 byte alignment of the loop entry */
+L91017:
+		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
+		por mm1, [ebx]   	/* mm1=Src1|Src2 */
+		movq [edi], mm1   	/* store result in Dest */
+			add eax, 8   	/* increase Src1, Src2 and Dest  */
+			add ebx, 8   	/* register pointers by 8 */
+			add edi,  8 	/* (Dest pointer) */
+			dec ecx        	/* decrease loop counter */
+			jnz L91017      	/* check loop termination, proceed if required */
+			emms             	/* exit MMX state */
+			popa	/* restore the general purpose registers */
+	}
+#else	/* GCC__ defined: MMX intrinsics implementation */
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {	/* one iteration per whole group of 8 bytes */
+		*mDest = _m_por(*mSrc1, *mSrc2);	/* Src1|Src2 */
+		mSrc1++;	/* advance all three pointers by 8 bytes */
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif	/* GCC__ */
+	return (0);	/* whole groups of 8 bytes processed; any trailing bytes are left to the caller */
+#else	/* USE_MMX not defined */
+	return (-1);	/* no MMX code compiled in: nothing was processed */
+#endif	/* USE_MMX */
+}
+
+/*!
+\brief Filter using BitOr: D = S1 | S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays (0 is accepted and does nothing).
+
+\return Returns 0 for success or -1 for error (NULL pointer argument).
+*/
+int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Default: the C routine below processes the whole image */
+	istart = 0;
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+		/*
+		 * MMX routine: it processes whole groups of 8 bytes
+		 * only. It returns -1 and leaves Dest untouched when it
+		 * was compiled without USE_MMX, so the bytes it covers
+		 * are skipped only if it reports success.
+		 */
+		if (SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length) == 0) {
+			/* No unaligned bytes - we are done */
+			if ((length & 7) == 0)
+				return (0);
+			/* Only the unaligned bytes at the end are left */
+			istart = length & 0xfffffff8;
+		}
+	}
+
+	/* Setup to process the remaining bytes */
+	cursrc1 = &Src1[istart];
+	cursrc2 = &Src2[istart];
+	curdst = &Dest[istart];
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		*curdst = *cursrc1 | *cursrc2;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+	return (0);
+}
+
+/*!
+\brief Internal ASM Filter using Div: D = S1 / S2 (D = 255 where S2 is 0)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays; must not be 0 (the loops test the counter after the body).
+
+\return Returns 0 for success or -1 when compiled without USE_MMX (nothing is processed then).
+*/
+static int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)	/* GCC__ not defined: __asm block implementation */
+	__asm
+	{
+		pusha	/* save the general purpose registers */
+			mov edx, Src1   	/* load Src1 address into edx */
+			mov esi, Src2   	/* load Src2 address into esi */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			align 16        	/* 16 byte alignment of the loop entry */
+L10191:
+		mov bl, [esi]   	/* load a byte from Src2 */
+		cmp bl, 0   	/* check if it zero */
+			jnz L10192
+			mov byte ptr [edi], 255   	/* division by zero = 255 !!! (explicit size: exactly one byte is stored) */
+			jmp  L10193
+L10192:
+		xor ah, ah   	/* prepare AX, zero AH register */
+			mov al, [edx]   	/* load a byte from Src1 into AL */
+		div   bl             	/* divide AX by BL, quotient in AL */
+			mov [edi], al   	/* move a byte result to Dest */
+L10193:
+		inc edx    	/* increment Src1, Src2, Dest */
+			inc esi    		/* pointer registers by one */
+			inc edi
+			dec ecx       	/* decrease loop counter */
+			jnz L10191     	/* check loop termination, proceed if required */
+			popa	/* restore the general purpose registers */
+	}
+#else	/* GCC__ defined: extended asm implementation */
+	/* Note: ~15% gain on i386, less efficient than C on x86_64 */
+	/* Also depends on whether the function is static (?!) */
+	/* Also depends on whether we work on malloc() or static char[] */
+	asm volatile (
+#  if defined(i386)
+		"pushl %%ebx \n\t"		/* %ebx may be the PIC register.  */
+		".align 16     \n\t"		/* 16 byte alignment of the loop entry */
+		"1: mov (%%esi), %%bl  \n\t"	/* load a byte from Src2 */
+		"cmp       $0, %%bl    \n\t"	/* check if it zero */
+		"jnz 2f                \n\t"
+		"movb  $255, (%%edi)   \n\t"	/* division by zero = 255 !!! */
+		"jmp 3f                \n\t"
+		"2: xor %%ah, %%ah     \n\t"	/* prepare AX, zero AH register */
+		"mov   (%%edx), %%al   \n\t"	/* load a byte from Src1 into AL */
+		"div   %%bl            \n\t"	/* divide AX by BL, quotient in AL */
+		"mov   %%al, (%%edi)   \n\t"	/* move a byte result to Dest */
+		"3: inc %%edx          \n\t"	/* increment Src1, Src2, Dest */
+		"inc %%esi \n\t"		/* pointer registers by one */
+		"inc %%edi \n\t"
+		"dec %%ecx \n\t"		/* decrease loop counter */
+		"jnz 1b    \n\t"		/* check loop termination, proceed if required */
+		"popl %%ebx \n\t"		/* restore %ebx */
+		: "+d" (Src1),		/* load Src1 address into edx */
+		  "+S" (Src2),		/* load Src2 address into esi */
+		  "+c" (SrcLength),	/* load loop counter (SIZE) into ecx */
+		  "+D" (Dest)		/* load Dest address into edi */
+		:
+		: "memory", "cc", "eax"	/* Dest bytes, flags and AX are modified; ebx is saved/restored by hand */
+#  elif defined(__x86_64__)
+		".align 16     \n\t"		/* 16 byte alignment of the loop entry */
+		"1: mov (%%rsi), %%bl  \n\t"	/* load a byte from Src2 */
+		"cmp       $0, %%bl    \n\t"	/* check if it zero */
+		"jnz 2f                \n\t"
+		"movb  $255, (%%rdi)   \n\t"	/* division by zero = 255 !!! */
+		"jmp 3f                \n\t"
+		"2: xor %%ah, %%ah     \n\t"	/* prepare AX, zero AH register */
+		"mov   (%%rdx), %%al   \n\t"	/* load a byte from Src1 into AL */
+		"div   %%bl            \n\t"	/* divide AX by BL, quotient in AL */
+		"mov   %%al, (%%rdi)   \n\t"	/* move a byte result to Dest */
+		"3: inc %%rdx          \n\t"	/* increment Src1, Src2, Dest */
+		"inc %%rsi \n\t"		/* pointer registers by one */
+		"inc %%rdi \n\t"
+		"dec %%ecx \n\t"		/* decrease loop counter; 32-bit register because SrcLength is a 32-bit operand */
+		"jnz 1b    \n\t"		/* check loop termination, proceed if required */
+		: "+d" (Src1),		/* load Src1 address into rdx */
+		  "+S" (Src2),		/* load Src2 address into rsi */
+		  "+c" (SrcLength),	/* load loop counter (SIZE) into ecx */
+		  "+D" (Dest)		/* load Dest address into rdi */
+		:
+		: "memory", "cc", "rax", "rbx"	/* Dest bytes, flags, AX and BL are modified */
+#  endif
+		);
+#endif	/* GCC__ */
+	return (0);
+#else	/* USE_MMX not defined */
+	return (-1);	/* no assembler code compiled in: nothing was processed */
+#endif	/* USE_MMX */
+}
+
+/*!
+\brief Filter using Div: D = S1 / S2
+
+Each destination byte is the integer quotient of the two source bytes. A zero
+divisor is not an error: the destination byte is set to 255 in that case.
+
+When SDL_imageFilterMMXdetect() reports support the assembler routine is tried
+first; the C routine is used if it is not tried or if it reports an error.
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error (NULL pointer argument).
+*/
+int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	if (SDL_imageFilterMMXdetect()) {
+		/*
+		 * Call ASM routine. It works byte by byte, so a successful call
+		 * never leaves unaligned bytes behind. It returns -1 and leaves
+		 * Dest untouched when it was compiled without USE_MMX; in that
+		 * case fall through to the C routine instead of reporting
+		 * success for an image that was never processed.
+		 */
+		if (SDL_imageFilterDivASM(Src1, Src2, Dest, length) == 0) {
+			/* Never unaligned bytes - we are done */
+			return (0);
+		}
+	}
+
+	/* Setup to process whole image */
+	cursrc1 = Src1;
+	cursrc2 = Src2;
+	curdst = Dest;
+
+	/* C routine to process image */
+	for (i = 0; i < length; i++) {
+		if (*cursrc2 == 0) {
+			/* Division by zero yields 255, as in the ASM routine */
+			*curdst = 255;
+		} else {
+			/*
+			 * Both operands are at most 255 and the divisor is at least
+			 * 1 here, so the quotient always fits into a byte.
+			 */
+			*curdst = (unsigned char) ((int) *cursrc1 / (int) *cursrc2);
+		}
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/* ------------------------------------------------------------------------------------ */
+
+/*!
+\brief Internal MMX Filter using BitNegation: D = ~S (bitwise complement)
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; only whole groups of 8 bytes are processed.
+
+\return Returns 0 for success or -1 when compiled without USE_MMX (nothing is processed then).
+*/
+static int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)	/* GCC__ not defined: __asm block implementation */
+	__asm
+	{
+		pusha	/* save the general purpose registers */
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); must not become 0, the loop tests it only after the body */
+			align 16          	/* 16 byte alignment of the loop entry */
+L91117:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into mm0 */
+		pxor mm0, mm1   	/* negate mm0 by xoring with mm1 */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 and Dest register pointers by 8 */
+			add edi,  8
+			dec ecx        	/* decrease loop counter */
+			jnz L91117      	/* check loop termination, proceed if required */
+			emms             	/* exit MMX state */
+			popa	/* restore the general purpose registers */
+	}
+#else	/* GCC__ defined: MMX intrinsics implementation */
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm1 = _m_from_int(0);		/* defined start value: mm1 must not be read uninitialized */
+	mm1 = _m_pcmpeqb(mm1, mm1);		/* generate all 1's in mm1 (every byte equals itself) */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {	/* one iteration per whole group of 8 bytes */
+		*mDest = _m_pxor(*mSrc1, mm1);	/* negate the 8 source bytes by xoring with mm1 */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();				/* clean MMX state */
+
+#endif	/* GCC__ */
+	return (0);
+#else	/* USE_MMX not defined */
+	return (-1);	/* no MMX code compiled in: nothing was processed */
+#endif	/* USE_MMX */
+}
+
+/*!
+\brief Filter using BitNegation: D = ~S (bitwise complement)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array (0 is accepted and does nothing).
+
+\return Returns 0 for success or -1 for error (NULL pointer argument).
+*/
+int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *curdst;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Default: the C routine below processes the whole image */
+	istart = 0;
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+		/*
+		 * MMX routine: whole groups of 8 bytes only. It returns -1 and leaves
+		 * Dest untouched when compiled without USE_MMX, so check its result.
+		 */
+		if (SDL_imageFilterBitNegationMMX(Src1, Dest, length) == 0) {
+			/* No unaligned bytes - we are done */
+			if ((length & 7) == 0)
+				return (0);
+			/* Only the unaligned bytes at the end are left */
+			istart = length & 0xfffffff8;
+		}
+	}
+
+	/* Setup to process the remaining bytes */
+	cursrc1 = &Src1[istart];
+	curdst = &Dest[istart];
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		*curdst = ~(*cursrc1);
+		/* Advance pointers */
+		cursrc1++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using AddByte: D = saturation255(S + C) 
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; only whole groups of 8 bytes are processed.
+\param C Constant value to add (C).
+
+\return Returns 0 for success or -1 when compiled without USE_MMX (nothing is processed then).
+*/
+static int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)	/* GCC__ not defined: __asm block implementation */
+	__asm
+	{
+		pusha	/* save the general purpose registers */
+			/* ** Duplicate C in 8 bytes of MM1 ** */
+			mov al, C   	/* load C into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with C */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); must not become 0, the loop tests it only after the body */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1021:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		paddusb mm0,  mm1 	/* MM0=Src1+C (add 8 bytes with saturation) */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1021    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa	/* restore the general purpose registers */
+	}
+#else	/* GCC__ defined: MMX intrinsics implementation */
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate C in 8 bytes of MM1 */
+	int i;
+	memset(&i, C, 4);	/* set 4 bytes of i to C - presumably int is 4 bytes on the targeted builds */
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+        //__m64 mm1 = _m_from_int64(lli); // x86_64 only
+	for (i = 0; i < SrcLength/8; i++) {	/* i is reused as the loop counter from here on */
+		*mDest = _m_paddusb(*mSrc1, mm1);	/* Src1+C (add 8 bytes with saturation) */
+		mSrc1++;	/* advance both pointers by 8 bytes */
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif	/* GCC__ */
+	return (0);	/* whole groups of 8 bytes processed; any trailing bytes are left to the caller */
+#else	/* USE_MMX not defined */
+	return (-1);	/* no MMX code compiled in: nothing was processed */
+#endif	/* USE_MMX */
+}
+
+/*!
+\brief Filter using AddByte: D = saturation255(S + C) 
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array (0 is accepted and does nothing).
+\param C Constant value to add (C).
+
+
+\return Returns 0 for success or -1 for error (NULL pointer argument).
+*/
+int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
+{
+	unsigned int i, istart;
+	int iC;
+	unsigned char *cursrc1, *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==0, the result is an unmodified copy of the source */
+	if (C == 0) {
+		memmove(Dest, Src1, length);	/* Dest <- Src1; well defined even if both overlap */
+		return (0);
+	}
+
+	/* Default: the C routine below processes the whole image */
+	istart = 0;
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+		/*
+		 * MMX routine: processes whole groups of 8 bytes only. It returns -1
+		 * and leaves Dest untouched when it was compiled without USE_MMX,
+		 * so the bytes it covers are skipped only if it reports success.
+		 */
+		if (SDL_imageFilterAddByteMMX(Src1, Dest, length, C) == 0) {
+			/* No unaligned bytes - we are done */
+			if ((length & 7) == 0)
+				return (0);
+			/* Only the unaligned bytes at the end are left */
+			istart = length & 0xfffffff8;
+		}
+	}
+
+	/* Setup to process the remaining bytes */
+	cursrc1 = &Src1[istart];
+	curdest = &Dest[istart];
+
+	/* C routine to process image: add C and clamp to 255 */
+	iC = (int) C;
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 + iC;
+		if (result > 255)
+			result = 255;
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using AddUint: D = saturation255((S[i] + Cs[i % 4]), Cs=Swap32((uint)C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; only whole groups of 8 bytes are processed.
+\param C Constant to add (C).
+\param D Byteorder-swapped constant to add (Cs); read by the __asm variant only.
+
+\return Returns 0 for success or -1 when compiled without USE_MMX (nothing is processed then).
+*/
+static int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)	/* GCC__ not defined: __asm block implementation */
+	__asm
+	{
+		pusha	/* save the general purpose registers */
+			/* ** Build the 8 constant bytes in MM1: C in the low 4 bytes, D in the high 4 bytes ** */
+			mov eax, C   	/* load C into EAX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			mov eax, D   	/* load D into EAX */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with D (low dword of MM2) */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); must not become 0, the loop tests it only after the body */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L11023:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		paddusb mm0,  mm1 	/* MM0=Src1+constant bytes (add 8 bytes with saturation) */
+			movq [edi],  mm0 	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L11023    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa	/* restore the general purpose registers */
+	}
+#else	/* GCC__ defined: MMX intrinsics implementation */
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate (int)C in 8 bytes of MM1 */
+	__m64 mm1 = _m_from_int(C);
+	__m64 mm2 = _m_from_int(C);	/* NOTE(review): the __asm variant loads D here, this variant never reads D - confirm which is intended */
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+        //__m64 mm1 = _m_from_int64(lli); // x86_64 only
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {	/* one iteration per whole group of 8 bytes */
+		*mDest = _m_paddusb(*mSrc1, mm1);	/* Src1+C (add 8 bytes with saturation) */
+		mSrc1++;	/* advance both pointers by 8 bytes */
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif	/* GCC__ */
+	return (0);	/* whole groups of 8 bytes processed; any trailing bytes are left to the caller */
+#else	/* USE_MMX not defined */
+	return (-1);	/* no MMX code compiled in: nothing was processed */
+#endif	/* USE_MMX */
+}
+
+/*!
+\brief Filter using AddUint: D = saturation255((S[i] + Cs[i % 4]), Cs=Swap32((uint)C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array (0 is accepted and does nothing).
+\param C Constant to add (C).
+
+\return Returns 0 for success or -1 for error (NULL pointer argument).
+*/
+int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
+{
+	unsigned int i, j, istart, D;
+	int iC[4];
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==0, the result is an unmodified copy of the source */
+	if (C == 0) {
+		memmove(Dest, Src1, length);	/* Dest <- Src1; well defined even if both overlap */
+		return (0);
+	}
+
+	/* Default: the C routine below processes the whole image */
+	istart = 0;
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+		/*
+		 * MMX routine: processes whole groups of 8 bytes only. It returns -1
+		 * and leaves Dest untouched when it was compiled without USE_MMX,
+		 * so the bytes it covers are skipped only if it reports success.
+		 */
+		D=SWAP_32(C);
+		if (SDL_imageFilterAddUintMMX(Src1, Dest, length, C, D) == 0) {
+			/* No unaligned bytes - we are done */
+			if ((length & 7) == 0)
+				return (0);
+			/* Only the unaligned bytes at the end are left */
+			istart = length & 0xfffffff8;
+		}
+	}
+
+	/* Setup to process the remaining bytes */
+	cursrc1 = &Src1[istart];
+	curdest = &Dest[istart];
+
+	/* C routine to process bytes: iC[0] is the least significant byte of C */
+	iC[3] = (int) ((C >> 24) & 0xff);
+	iC[2] = (int) ((C >> 16) & 0xff);
+	iC[1] = (int) ((C >>  8) & 0xff);
+	iC[0] = (int) ((C >>  0) & 0xff);
+	for (i = istart; i < length; i += 4) {
+		for (j = 0; j < 4; j++) {
+			if ((i+j)<length) {
+				result = (int) *cursrc1 + iC[j];
+				if (result > 255) result = 255;
+				*curdest = (unsigned char) result;
+				/* Advance pointers */
+				cursrc1++;
+				curdest++;
+			}
+		}
+	}
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using AddByteToHalf: D = saturation255(S/2 + C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param C Constant to add (C).
+\param Mask Pointer to 8 mask bytes of value 0x7F.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C,
+									unsigned char *Mask)
+{
+#ifdef USE_MMX	/* without USE_MMX the routine does no work and returns -1 */
+#if !defined(GCC__)	/* non-GCC build: inline __asm block */
+	__asm
+	{
+		pusha
+			/* ** Duplicate C in 8 bytes of MM1 ** */
+			mov al, C   	/* load C into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with C */
+			mov edx, Mask   	/* load Mask address into edx */
+			movq mm0, [edx]   	/* load Mask into mm0 */
+		mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1022:
+		movq mm2, [eax]   	/* load 8 bytes from Src1 into MM2 */
+		psrlw mm2, 1   	/* shift 4 WORDS of MM2 1 bit to the right */
+			pand mm2, mm0   	/* apply Mask to 8 BYTES of MM2 (a 0x7F mask clears the bit shifted in from the upper byte of each word) */
+			paddusb mm2,  mm1 	/* MM2=S/2+C (add 8 bytes with saturation) */
+			movq [edi], mm2   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1022    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC build (i386 and x86_64): the same steps written with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 *mMask = (__m64*)Mask;
+	/* Duplicate C in 8 bytes of MM1: memset() first fills the 4 bytes of i with C */
+	int i;
+	memset(&i, C, 4);
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+	/* (the original author notes a single _m_from_int64() load as an x86_64-only alternative) */
+	for (i = 0; i < SrcLength/8; i++) {		/* one pass per 8 bytes; SrcLength % 8 trailing bytes are not touched */
+		__m64 mm2 = _m_psrlwi(*mSrc1, 1);	/* shift 4 WORDS of MM2 1 bit to the right */
+		mm2 = _m_pand(mm2, *mMask);		/* apply Mask to 8 BYTES of MM2 */
+							/* byte     0x0f, 0xdb, 0xd0 */
+		*mDest = _m_paddusb(mm1, mm2);		/* S/2+C (add 8 bytes with saturation) */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using AddByteToHalf: D = saturation255(S/2 + C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param C Constant to add (C).
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
+{
+	/* 0x7F per byte: handed to the MMX routine, which ANDs it with the shifted source words */
+	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
+
+	unsigned int pos = 0;		/* first byte that still has to be handled by the C loop */
+	unsigned int tail;		/* number of bytes beyond the last complete group of 8 */
+	int half_sum;
+
+	/* Both buffers are mandatory */
+	if (!Src1 || !Dest) {
+		return (-1);
+	}
+
+	/* Nothing to do for an empty buffer */
+	if (!length) {
+		return (0);
+	}
+
+	/*
+	 * Use the MMX routine when MMX is detected and at least 8 bytes are present.
+	 * It works in groups of 8 bytes, so up to 7 trailing bytes remain for the
+	 * C loop below.
+	 */
+	if (SDL_imageFilterMMXdetect() && (length >= 8)) {
+		SDL_imageFilterAddByteToHalfMMX(Src1, Dest, length, C, Mask);
+
+		/* Check for bytes the MMX routine did not cover */
+		tail = length % 8;
+		if (tail == 0) {
+			/* Length is a multiple of 8 - everything has been processed */
+			return (0);
+		}
+
+		/* Continue right after the last complete group of 8 */
+		pos = length - tail;
+	}
+
+	/* C routine: D = S/2 + C, clamped to 255 */
+	while (pos < length) {
+		half_sum = (Src1[pos] >> 1) + C;
+		if (half_sum > 255) {
+			Dest[pos] = 255;
+		} else {
+			Dest[pos] = (unsigned char) half_sum;
+		}
+		pos++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using SubByte: D = saturation0(S - C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param C Constant to subtract (C).
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
+{
+#ifdef USE_MMX	/* without USE_MMX the routine does no work and returns -1 */
+#if !defined(GCC__)	/* non-GCC build: inline __asm block */
+	__asm
+	{
+		pusha
+			/* ** Duplicate C in 8 bytes of MM1 ** */
+			mov al, C   	/* load C into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with C */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1023:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		psubusb mm0,  mm1 	/* MM0=Src1-C (sub 8 bytes with saturation) */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1023    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC build (i386 and x86_64): the same steps written with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate C in 8 bytes of MM1: memset() first fills the 4 bytes of i with C */
+	int i;
+	memset(&i, C, 4);
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+	/* (the original author notes a single _m_from_int64() load as an x86_64-only alternative) */
+	for (i = 0; i < SrcLength/8; i++) {		/* one pass per 8 bytes; SrcLength % 8 trailing bytes are not touched */
+		*mDest = _m_psubusb(*mSrc1, mm1);	/* Src1-C (sub 8 bytes with saturation) */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using SubByte: D = saturation0(S - C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+\param C Constant to subtract (C).
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
+{
+	unsigned int i, istart;
+	int iC;
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters: NULL buffers are an error, zero length is a no-op */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==0 subtracts nothing, so D is simply a copy of S */
+	if (C == 0) {
+		memmove(Dest, Src1, length);	/* copy S -> D (arguments were reversed); overlap-safe */
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine processes the leading (length/8)*8 bytes */
+		SDL_imageFilterSubByteMMX(Src1, Dest, length, C);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process the remaining (length % 8) bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine: D = S - C, clamped at 0 */
+	iC = (int) C;
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 - iC;
+		if (result < 0)
+			result = 0;
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using SubUint: D = saturation0(S[i] - Cs[i % 4]), Cs=Swap32((uint)C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param C Constant to subtract (C).
+\param D Byteorder-swapped constant to subtract (Cs).
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
+{
+#ifdef USE_MMX	/* without USE_MMX the routine does no work and returns -1 */
+#if !defined(GCC__)	/* non-GCC build: inline __asm block */
+	__asm
+	{
+		pusha
+			/* ** Build the 8-byte constant in MM1 from C and D ** */
+			mov eax, C   	/* load C into EAX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			mov eax, D   	/* load D into EAX */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* MM1 = C in the lower 4 bytes, D in the higher 4 bytes */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L11024:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		psubusb mm0, mm1 	/* MM0=Src1-MM1 (sub 8 bytes with saturation) */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L11024    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC build (i386 and x86_64): MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate (int)C in 8 bytes of MM1 */
+	__m64 mm1 = _m_from_int(C);
+	__m64 mm2 = _m_from_int(C);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+	/* NOTE(review): D is unused on this path (C fills both halves) while the __asm path puts D in the higher half - confirm which is intended */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {		/* one pass per 8 bytes; SrcLength % 8 trailing bytes are not touched */
+		*mDest = _m_psubusb(*mSrc1, mm1);	/* Src1-C (sub 8 bytes with saturation) */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using SubUint: D = saturation0(S[i] - Cs[i % 4]), Cs=Swap32((uint)C)
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param C Constant to subtract (C).
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
+{
+	unsigned int i, j, istart, D;
+	int iC[4];
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters: NULL buffers are an error, zero length is a no-op */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==0 subtracts nothing, so D is simply a copy of S */
+	if (C == 0) {
+		memmove(Dest, Src1, length);	/* copy S -> D (arguments were reversed); overlap-safe */
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine processes the leading (length/8)*8 bytes */
+		D=SWAP_32(C);
+		SDL_imageFilterSubUintMMX(Src1, Dest, length, C, D);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process the remaining (length % 8) bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine: iC[j] is byte j of C (least significant first), subtracted from S[i+j] */
+	iC[3] = (int) ((C >> 24) & 0xff);
+	iC[2] = (int) ((C >> 16) & 0xff);
+	iC[1] = (int) ((C >>  8) & 0xff);
+	iC[0] = (int) ((C >>  0) & 0xff);
+	for (i = istart; i < length; i += 4) {
+		for (j = 0; j < 4; j++) {
+			if ((i+j)<length) {	/* the last group may hold fewer than 4 bytes */
+				result = (int) *cursrc1 - iC[j];
+				if (result < 0) result = 0;	/* saturate at 0 */
+				*curdest = (unsigned char) result;
+				/* Advance pointers */
+				cursrc1++;
+				curdest++;
+			}
+		}
+	}
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftRight: D = saturation0(S >> N)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+\param Mask Byte array containing 8 bytes with 0x7F value.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
+								 unsigned char *Mask)
+{
+#ifdef USE_MMX	/* without USE_MMX the routine does no work and returns -1 */
+#if !defined(GCC__)	/* non-GCC build: inline __asm block */
+	__asm
+	{
+		pusha
+			mov edx, Mask   	/* load Mask address into edx */
+			movq mm0, [edx]   	/* load Mask into mm0 */
+		xor ecx, ecx   	/* zero ECX */
+			mov cl,  N 	/* load loop counter (N) into CL */
+			movd mm3,  ecx 	/* copy (N) into MM3  */
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+L10240:                  	/* ** Prepare proper bit-Mask in MM1 ** */
+		psrlw mm1,  1 	/* shift 4 WORDS of MM1 1 bit to the right */
+			pand mm1, mm0   	/* apply Mask to 8 BYTES of MM1 */
+			/*  byte     0x0f, 0xdb, 0xc8 */
+			dec               cl    	/* decrease loop counter */
+			jnz            L10240    	/* check loop termination, proceed if required */
+			/* ** Shift all bytes of the image ** */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10241:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		psrlw mm0, mm3   	/* shift 4 WORDS of MM0 (N) bits to the right */
+			pand mm0, mm1   	/* apply proper bit-Mask to 8 BYTES of MM0 */
+			/* byte     0x0f, 0xdb, 0xc1 */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10241    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC build (i386 and x86_64): the same steps written with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 *mMask = (__m64*)Mask;
+	__m64 mm1 = _m_from_int(0);			/* defined start value: mm1 is read by the compare below */
+	int i;
+	mm1 = _m_pcmpeqb(mm1, mm1);			/* generate all 1's in mm1 (x == x for every byte) */
+	/* Prepare proper bit-Mask in MM1: after N rounds only the low 8-N bits of each byte remain set */
+	for (i = 0; i < N; i++) {
+		mm1 = _m_psrlwi(mm1, 1);		/* shift 4 WORDS of MM1 1 bit to the right */
+		mm1 = _m_pand(mm1, *mMask);		/* apply Mask to 8 BYTES of MM1 */
+	}
+	/* Shift all bytes of the image; SrcLength % 8 trailing bytes are not touched */
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm0 = _m_psrlwi(*mSrc1, N);	/* shift 4 WORDS of MM0 (N) bits to the right */
+		*mDest = _m_pand(mm0, mm1);		/* apply proper bit-Mask to 8 BYTES of MM0 */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ShiftRight: D = saturation0(S >> N)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
+	unsigned int i, istart;
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+
+	/* Validate input parameters: NULL buffers are an error, zero length is a no-op */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Check shift: a byte has 8 bits, larger counts are rejected */
+	if (N > 8) {
+		return (-1);
+	}
+
+	/* Special case: N==0 shifts nothing, so D is simply a copy of S */
+	if (N == 0) {
+		memmove(Dest, Src1, length);	/* copy S -> D (arguments were reversed); overlap-safe */
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine processes the leading (length/8)*8 bytes */
+		SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process the remaining (length % 8) bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine: D = S >> N for every remaining byte */
+	for (i = istart; i < length; i++) {
+		*curdest = (unsigned char) (*cursrc1 >> N);
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param N Number of bit-positions to shift (N).
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
+{
+#ifdef USE_MMX	/* without USE_MMX the routine does no work and returns -1 */
+#if !defined(GCC__)	/* non-GCC build: inline __asm block */
+	__asm
+	{
+		pusha
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L13023:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		psrld mm0, N   	/* shift the 2 DWORDS of MM0 (N) bits to the right */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L13023    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC build (i386 and x86_64): the same loop written with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {		/* one pass per 8 bytes; SrcLength % 8 trailing bytes are not touched */
+		*mDest = _m_psrldi(*mSrc1, N);		/* shift the 2 DWORDS of the source (N) bits to the right */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 32.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *curdest;
+	unsigned int result;
+
+	/* Validate input parameters: NULL buffers are an error, zero length is a no-op */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Check shift: the words are 32 bits wide, larger counts are rejected */
+	if (N > 32) {
+		return (-1);
+	}
+
+	/* Special case: N==0 shifts nothing, so D is simply a copy of S */
+	if (N == 0) {
+		memmove(Dest, Src1, length);	/* copy S -> D (arguments were reversed); overlap-safe */
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine processes the leading (length/8)*8 bytes */
+		SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process the remaining (length % 8) bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/*
+	 * C routine: shift every complete 4-byte word, including the last one.
+	 * Bytes of a trailing partial word are left untouched, as before.
+	 */
+	for (i = istart; (length - i) >= 4; i += 4) {
+		memcpy(&result, cursrc1, sizeof(result));	/* alignment-safe load */
+		result = (N < 32) ? (result >> N) : 0;		/* a 32-bit shift by 32 is undefined in C */
+		memcpy(curdest, &result, sizeof(result));	/* alignment-safe store */
+		cursrc1 += 4;
+		curdest += 4;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using MultByByte: D = saturation255(S * C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param C Constant to multiply with (C).
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
+{
+#ifdef USE_MMX	/* without USE_MMX the routine does no work and returns -1 */
+#if !defined(GCC__)	/* non-GCC build: inline __asm block */
+	__asm
+	{
+		pusha
+			/* ** Duplicate C in 4 words of MM1 ** */
+			mov al, C   	/* load C into AL */
+			xor ah, ah   	/* zero AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher words of MM1 with C */
+			pxor mm0, mm0   	/* zero MM0 register */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			cmp bl, 128   	/* if (C <= 128) execute more efficient code; BL still holds C (EAX now holds Src1) */
+			ja             L10251    	/* unsigned compare: only C > 128 needs the abs-value loop */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10250:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of SrcDest into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of SrcDest into words */
+			pmullw mm3, mm1   	/* mul low  bytes of SrcDest and MM1 */
+			pmullw mm4, mm1   	/* mul high bytes of SrcDest and MM1 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10250    	/* check loop termination, proceed if required */
+			jmp            L10252
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10251:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of SrcDest into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of SrcDest into words */
+			pmullw mm3, mm1   	/* mul low  bytes of SrcDest and MM1 */
+			pmullw mm4, mm1   	/* mul high bytes of SrcDest and MM1 */
+			/* ** Take abs value of the results (signed words) ** */
+			movq mm5, mm3   	/* copy mm3 into mm5 */
+			movq mm6, mm4   	/* copy mm4 into mm6 */
+			psraw mm5, 15   	/* fill mm5 words with word sign bit */
+			psraw mm6, 15   	/* fill mm6 words with word sign bit */
+			pxor mm3, mm5   	/* take 1's compliment of only neg words */
+			pxor mm4, mm6   	/* take 1's compliment of only neg words */
+			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
+			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10251    	/* check loop termination, proceed if required */
+L10252:
+		emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC build (i386 and x86_64): the same steps written with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0);				/* zero mm0 register */
+	/* Duplicate C in 4 words of MM1 */
+	int i;
+	i = C | C<<16;
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);				/* fill higher words of MM1 with C */
+	/* (the original author notes a single _m_from_int64() load of C in all 4 words */
+	/*  as an x86_64-only alternative to the two 32-bit loads above) */
+	if (C <= 128) {						/* if (C <= 128) execute more efficient code: 255*128 still fits a signed word */
+		for (i = 0; i < SrcLength/8; i++) {
+			__m64 mm3, mm4;
+			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+			mm3 = _m_pmullw(mm3, mm1);		/* mul low  bytes of Src1 and MM1 */
+			mm4 = _m_pmullw(mm4, mm1);		/* mul high bytes of Src1 and MM1 */
+			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+			mSrc1++;
+			mDest++;
+		}
+	} else {
+		for (i = 0; i < SrcLength/8; i++) {
+			__m64 mm3, mm4, mm5, mm6;
+			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+			mm3 = _m_pmullw(mm3, mm1);		/* mul low  bytes of Src1 and MM1 */
+			mm4 = _m_pmullw(mm4, mm1);		/* mul high bytes of Src1 and MM1 */
+			/* Take abs value of the results (signed words): products above 0x7FFF would otherwise pack to 0 */
+			mm5 = _m_psrawi(mm3, 15);		/* fill mm5 words with word sign bit */
+			mm6 = _m_psrawi(mm4, 15);		/* fill mm6 words with word sign bit */
+			mm3 = _m_pxor(mm3, mm5);		/* take 1's compliment of only neg. words */
+			mm4 = _m_pxor(mm4, mm6);		/* take 1's compliment of only neg. words */
+			mm3 = _m_psubsw(mm3, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
+			mm4 = _m_psubsw(mm4, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
+			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+			mSrc1++;
+			mDest++;
+		}
+	}
+	_m_empty();						/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using MultByByte: D = saturation255(S * C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+\param C Constant to multiply with (C).
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
+{
+	unsigned int i, istart;
+	int iC;
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters: NULL buffers are an error, zero length is a no-op */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==1 changes nothing, so D is simply a copy of S */
+	if (C == 1) {
+		memmove(Dest, Src1, length);	/* copy S -> D (arguments were reversed); overlap-safe */
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		SDL_imageFilterMultByByteMMX(Src1, Dest, length, C);	/* leading (length/8)*8 bytes */
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process the remaining (length % 8) bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine: D = S * C, clamped to 255 */
+	iC = (int) C;
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 * iC;
+		if (result > 255)
+			result = 255;
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftRightAndMultByByteMMX: D = saturation255((S >> N) * C) 
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+\param C Constant to multiply with (C).
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
+											  unsigned char C)
+{
+#ifdef USE_MMX	/* without USE_MMX the routine does no work and returns -1 */
+#if !defined(GCC__)	/* non-GCC build: inline __asm block */
+	__asm
+	{
+		pusha
+			/* ** Duplicate C in 4 words of MM1 ** */
+			mov al, C   	/* load C into AL */
+			xor ah, ah   	/* zero AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher words of MM1 with C */
+			xor ecx, ecx   	/* zero ECX */
+			mov cl, N   	/* load N into CL */
+			movd mm7, ecx   	/* copy N into MM7 */
+			pxor mm0, mm0   	/* zero MM0 register */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1026:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of SrcDest into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of SrcDest into words */
+			psrlw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the right */
+			psrlw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the right */
+			pmullw mm3, mm1   	/* mul low  bytes of SrcDest by MM1 */
+			pmullw mm4, mm1   	/* mul high bytes of SrcDest by MM1 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1026    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC build (i386 and x86_64): the same steps written with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0);			/* zero mm0 register */
+	/* Duplicate C in 4 words of MM1 */
+	int i;
+	i = (C<<16)|C;
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher words of MM1 with C */
+	for (i = 0; i < SrcLength/8; i++) {		/* one pass per 8 bytes; SrcLength % 8 trailing bytes are not touched */
+		__m64 mm3, mm4;				/* low and high halves of the source, widened to words */
+		mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+		mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_psrlwi(mm3, N);		/* shift 4 WORDS of MM3 (N) bits to the right */
+		mm4 = _m_psrlwi(mm4, N);		/* shift 4 WORDS of MM4 (N) bits to the right */
+		mm3 = _m_pmullw(mm3, mm1);		/* mul low  bytes of Src1 and MM1 */
+		mm4 = _m_pmullw(mm4, mm1);		/* mul high bytes of Src1 and MM1 */
+		*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ShiftRightAndMultByByte: D = saturation255((S >> N) * C) 
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+\param C Constant to multiply with (C).
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N,
+										   unsigned char C)
+{
+	unsigned int i, istart;
+	int iC;
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters: both buffers are required, an empty array is a successful no-op */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Check shift: only 0..8 bit positions are accepted */
+	if (N > 8) {
+		return (-1);
+	}
+
+	/* Special case: N==0 && C==1 is the identity, so the source is simply copied to the destination */
+	if ((N == 0) && (C == 1)) {
+		memmove(Dest, Src1, length);	/* destination comes first; memmove also tolerates Src1 == Dest */
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C);
+
+		/* Check for trailing bytes beyond the last full group of 8 */
+		if ((length & 7) > 0) {
+			/* Setup to process only the trailing (length & 7) bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* Length is a multiple of 8 - we are done */
+			return (0);
+		}
+	} else {
+		/* No MMX or fewer than 8 bytes: process the whole array in C */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image: D = saturation255((S >> N) * C) */
+	iC = (int) C;
+	for (i = istart; i < length; i++) {
+		result = (int) (*cursrc1 >> N) * iC;
+		if (result > 255)
+			result = 255;	/* saturate to the largest byte value */
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftLeftByte: D = (S << N)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+\param Mask Byte array containing 8 bytes of 0xFE value.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
+									unsigned char *Mask)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha   	/* save all general-purpose registers (restored by popa below) */
+			mov edx, Mask   	/* load Mask address into edx */
+			movq mm0, [edx]   	/* load the 8 Mask bytes into mm0 */
+		xor ecx, ecx   	/* zero ECX */
+			mov cl, N   	/* load loop counter (N) into CL */
+			movd mm3, ecx   	/* copy (N) into MM3 as the shift count */
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+L10270:                  	/* ** Prepare proper bit-Mask in MM1: one pass per shifted bit ** */
+		psllw mm1, 1   	/* shift 4 WORDS of MM1 1 bit to the left */
+			pand mm1, mm0        /* apply Mask to 8 BYTES of MM1 */
+			/*  byte     0x0f, 0xdb, 0xc8 */
+			dec cl                  	/* decrease loop counter (body runs before this test, so N must be >= 1) */
+			jnz            L10270    	/* check loop termination, proceed if required */
+			/* ** Shift all bytes of the image ** */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time); loop below runs at least once, so SrcLength must be >= 8 */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10271:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		psllw mm0, mm3   	/* shift 4 WORDS of MM0 (N) bits to the left */
+			pand mm0, mm1    /* apply the prepared bit-Mask to 8 BYTES of MM0 (drops bits shifted across byte boundaries) */
+			/* byte     0x0f, 0xdb, 0xc1 */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10271    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC__ build (i386 and x86_64): same algorithm expressed with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 *mMask = (__m64*)Mask;
+        __m64 mm1;	/* NOTE(review): declared without an initializer and read on the pcmpeqb line below */
+	int i;
+	mm1 = _m_pcmpeqb(mm1, mm1);			/* generate all 1's in mm1 (compare with itself) */
+	/* Prepare proper bit-Mask in MM1: one pass per shifted bit */
+	for (i = 0; i < N; i++) {
+		mm1 = _m_psllwi(mm1, 1);		/* shift 4 WORDS of MM1 1 bit to the left */
+		mm1 = _m_pand(mm1, *mMask);		/* apply Mask to 8 BYTES of MM1 */
+	}
+	/* ** Shift all bytes of the image, 8 bytes per iteration; a trailing SrcLength % 8 bytes are not touched ** */
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm0 = _m_psllwi(*mSrc1, N);	/* shift 4 WORDS of MM0 (N) bits to the left */
+		*mDest = _m_pand(mm0, mm1);		/* apply the prepared bit-Mask to 8 BYTES of MM0 */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);	/* MMX path executed */
+#else
+	return (-1);	/* built without USE_MMX: no MMX implementation available */
+#endif
+}
+
+/*!
+\brief Filter using ShiftLeftByte: D = (S << N)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE };	/* per-byte mask handed to the MMX routine */
+	unsigned int i, istart;
+	unsigned char *cursrc1, *curdest;
+	int result;
+
+	/* Validate input parameters: both buffers are required, an empty array is a successful no-op */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	if (N > 8) {	/* only 0..8 bit positions are accepted */
+		return (-1);
+	}
+
+	/* Special case: N==0 is the identity, so the source is simply copied to the destination */
+	if (N == 0) {
+		memmove(Dest, Src1, length);	/* destination comes first; memmove also tolerates Src1 == Dest */
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask);
+
+		/* Check for trailing bytes beyond the last full group of 8 */
+		if ((length & 7) > 0) {
+			/* Setup to process only the trailing (length & 7) bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* Length is a multiple of 8 - we are done */
+			return (0);
+		}
+	} else {
+		/* No MMX or fewer than 8 bytes: process the whole array in C */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image: D = (S << N), keeping only the low 8 bits */
+	for (i = istart; i < length; i++) {
+		result = ((int) *cursrc1 << N) & 0xff;
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftLeftUint: D = ((uint)S << N)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 32.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterShiftLeftUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha   	/* save all general-purpose registers (restored by popa below) */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time); loop below runs at least once, so SrcLength must be >= 8 */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L12023:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		pslld mm0, N   	/* shift 2 DWORDS of MM0 (N) bits to the left. NOTE(review): N is a byte-sized parameter used directly as the count operand - confirm the encoding */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L12023    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC__ build (i386 and x86_64): same algorithm expressed with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {	/* 8 bytes per iteration; a trailing SrcLength % 8 bytes are not touched */
+		*mDest = _m_pslldi(*mSrc1, N);	/* shift 2 DWORDS of Src1 (N) bits to the left */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();				/* clean MMX state */
+#endif
+	return (0);	/* MMX path executed */
+#else
+	return (-1);	/* built without USE_MMX: no MMX implementation available */
+#endif
+}
+
+/*!
+\brief Filter using ShiftLeftUint: D = ((uint)S << N)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 32.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *curdest;
+	unsigned int *icursrc1, *icurdest;
+	unsigned int result;
+
+	/* Validate input parameters: both buffers are required, an empty array is a successful no-op */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	if (N > 32) {	/* only 0..32 bit positions are accepted */
+		return (-1);
+	}
+
+	/* Special case: N==0 is the identity, so the source is simply copied to the destination */
+	if (N == 0) {
+		memmove(Dest, Src1, length);	/* destination comes first; memmove also tolerates Src1 == Dest */
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N);
+
+		/* Check for trailing bytes beyond the last full group of 8 */
+		if ((length & 7) > 0) {
+			/* Setup to process only the trailing (length & 7) bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* Length is a multiple of 8 - we are done */
+			return (0);
+		}
+	} else {
+		/* No MMX or fewer than 8 bytes: process the whole array in C */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image one 4-byte word at a time; a trailing 1-3 bytes are left unwritten */
+	icursrc1=(unsigned int *)cursrc1;
+	icurdest=(unsigned int *)curdest;
+	for (i = istart; i < length; i += 4) {
+		if ((length - i) >= 4) {	/* a complete word remains (i < length here, so no underflow) */
+			result = (N < 32) ? (*icursrc1 << N) : 0;	/* shifting by the full width is undefined in C; all bits are shifted out (assumes 32-bit unsigned int) */
+			*icurdest = result;
+		}
+		/* Advance pointers */
+		icursrc1++;
+		icurdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter ShiftLeft: D = saturation255(S << N)
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterShiftLeftMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha   	/* save all general-purpose registers (restored by popa below) */
+			xor eax, eax   	/* zero EAX */
+			mov al, N   	/* load N into AL */
+			movd mm7, eax   	/* copy N into MM7 as the shift count */
+			pxor mm0, mm0   	/* zero MM0 register */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time); loops below run at least once, so SrcLength must be >= 8 */
+			cmp al, 7   	/* if (N <= 7) execute more efficient code. NOTE(review): EAX was reloaded with the Src1 address above, so AL no longer holds N here - confirm */
+			jg             L10281    	/* AL > 7 (signed compare): use the loop with the abs-value step */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10280:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
+			psllw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the left */
+			psllw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the left */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10280    	/* check loop termination, proceed if required */
+			jmp            L10282    	/* skip the second loop */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10281:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
+			psllw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the left */
+			psllw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the left */
+			/* ** Take abs value of the signed words (the shift may have set the word sign bit) ** */
+			movq mm5, mm3   	/* copy mm3 into mm5 */
+			movq mm6, mm4   	/* copy mm4 into mm6 */
+			psraw mm5, 15   	/* fill mm5 words with word sign bit */
+			psraw mm6, 15   	/* fill mm6 words with word sign bit */
+			pxor mm3, mm5   	/* take 1's complement of only neg words */
+			pxor mm4, mm6   	/* take 1's complement of only neg words */
+			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
+			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10281    	/* check loop termination, proceed if required */
+L10282:
+		emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC__ build (i386 and x86_64): same algorithm expressed with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0);				/* zero mm0 register */
+	int i;
+	if (N <= 7) {						/* if (N <= 7) execute more efficient code */
+		for (i = 0; i < SrcLength/8; i++) {		/* 8 bytes per iteration; a trailing SrcLength % 8 bytes are not touched */
+			__m64 mm3, mm4;
+			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+			mm3 = _m_psllwi(mm3, N);		/* shift 4 WORDS of MM3 (N) bits to the left */
+			mm4 = _m_psllwi(mm4, N);		/* shift 4 WORDS of MM4 (N) bits to the left */
+			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+			mSrc1++;
+			mDest++;
+		}
+	} else {						/* N > 7: the shifted words may have their sign bit set */
+		for (i = 0; i < SrcLength/8; i++) {		/* 8 bytes per iteration; a trailing SrcLength % 8 bytes are not touched */
+			__m64 mm3, mm4, mm5, mm6;
+			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+			mm3 = _m_psllwi(mm3, N);		/* shift 4 WORDS of MM3 (N) bits to the left */
+			mm4 = _m_psllwi(mm4, N);		/* shift 4 WORDS of MM4 (N) bits to the left */
+			/* Take abs value of the signed words */
+			mm5 = _m_psrawi(mm3, 15);		/* fill mm5 words with word sign bit */
+			mm6 = _m_psrawi(mm4, 15);		/* fill mm6 words with word sign bit */
+			mm3 = _m_pxor(mm3, mm5);		/* take 1's complement of only neg. words */
+			mm4 = _m_pxor(mm4, mm6);		/* take 1's complement of only neg. words */
+			mm3 = _m_psubsw(mm3, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
+			mm4 = _m_psubsw(mm4, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
+			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+			mSrc1++;
+			mDest++;
+		}
+	}
+	_m_empty();						/* clean MMX state */
+#endif
+	return (0);	/* MMX path executed */
+#else
+	return (-1);	/* built without USE_MMX: no MMX implementation available */
+#endif
+}
+
+/*!
+\brief Filter ShiftLeft: D = saturation255(S << N)
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *curdest;
+	int result;
+
+	/* Validate input parameters: both buffers are required, an empty array is a successful no-op */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	if (N > 8) {	/* only 0..8 bit positions are accepted */
+		return (-1);
+	}
+
+	/* Special case: N==0 is the identity, so the source is simply copied to the destination */
+	if (N == 0) {
+		memmove(Dest, Src1, length);	/* destination comes first; memmove also tolerates Src1 == Dest */
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N);
+
+		/* Check for trailing bytes beyond the last full group of 8 */
+		if ((length & 7) > 0) {
+			/* Setup to process only the trailing (length & 7) bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* Length is a multiple of 8 - we are done */
+			return (0);
+		}
+	} else {
+		/* No MMX or fewer than 8 bytes: process the whole array in C */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image: D = saturation255(S << N) */
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 << N;
+		if (result > 255)
+			result = 255;	/* saturate to the largest byte value */
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief MMX BinarizeUsingThreshold: D = (S >= T) ? 255:0
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param T The threshold boundary (inclusive).
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterBinarizeUsingThresholdMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char T)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha   	/* save all general-purpose registers (restored by popa below) */
+			/* ** Duplicate T in 8 bytes of MM3 ** */
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+			pcmpeqb mm2, mm2   	/* generate all 1's in mm2 */
+			mov al, T   	/* load T into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm3, eax   	/* copy EAX into MM3 */
+			movd mm4, eax   	/* copy EAX into MM4 */
+			punpckldq mm3, mm4   	/* fill higher bytes of MM3 with T */
+			psubusb mm2, mm3   	/* store 0xFF - T in MM2 */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time); loop below runs at least once, so SrcLength must be >= 8 */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1029:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		paddusb mm0, mm2   	/* MM0=Src1+(0xFF-T) (add 8 bytes with saturation): reaches 0xFF exactly when S >= T */
+			pcmpeqb mm0, mm1   	/* binarize 255:0, comparing to 255 */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1029    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC__ build (i386 and x86_64): same algorithm expressed with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate T in 8 bytes of MM3 */
+	__m64 mm1 = _m_pcmpeqb(mm1, mm1);			/* generate all 1's in mm1. NOTE(review): mm1 is read in its own initializer */
+	__m64 mm2 = _m_pcmpeqb(mm2, mm2);			/* generate all 1's in mm2. NOTE(review): mm2 is read in its own initializer */
+	int i;
+	memset(&i, T, 4);				/* replicate T into 4 bytes of i (assumes a 4-byte int - confirm) */
+	__m64 mm3 = _m_from_int(i);
+	__m64 mm4 = _m_from_int(i);
+	mm3 = _m_punpckldq(mm3, mm4);			/* fill higher bytes of MM3 with T */
+	mm2 = _m_psubusb(mm2, mm3);			/* store 0xFF - T in MM2 */
+        //__m64 mm3 = _m_from_int64(lli); // x86_64 only
+	for (i = 0; i < SrcLength/8; i++) {		/* 8 bytes per iteration; a trailing SrcLength % 8 bytes are not touched */
+		__m64 mm0 = _m_paddusb(*mSrc1, mm2);	/* Src1+(0xFF-T) (add 8 bytes with saturation): reaches 0xFF exactly when S >= T */
+		*mDest = _m_pcmpeqb(mm0, mm1);		/* binarize 255:0, comparing to 255 */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);	/* MMX path executed */
+#else
+	return (-1);	/* built without USE_MMX: no MMX implementation available */
+#endif
+}
+
+/*!
+\brief Filter using BinarizeUsingThreshold: D = (S >= T) ? 255:0
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param T The threshold boundary (inclusive).
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
+{
+	unsigned int pos;	/* index of the byte currently being binarized */
+	unsigned int first = 0;	/* first byte left for the plain C loop */
+
+	/* Reject missing buffers */
+	if (!Src1 || !Dest) {
+		return (-1);
+	}
+
+	/* An empty array is trivially done */
+	if (!length) {
+		return (0);
+	}
+
+	/* T==0: every byte satisfies S >= T, so the whole output is 255 */
+	if (!T) {
+		memset(Dest, 255, length);
+		return (0);
+	}
+
+	/*
+	 * With MMX available and at least one full 8-byte group, let the MMX
+	 * routine handle the groups; only the trailing (length & 7) bytes, if
+	 * any, remain for the C loop below.
+	 */
+	if (SDL_imageFilterMMXdetect() && (length >= 8)) {
+
+		SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T);
+		if ((length & 7) == 0) {
+			/* Length is a multiple of 8 - nothing left to do */
+			return (0);
+		}
+
+		/* Resume after the last full 8-byte group */
+		first = length & 0xfffffff8;
+	}
+
+	/* Plain C path: D = (S >= T) ? 255 : 0 for bytes first..length-1 */
+	for (pos = first; pos < length; pos++) {
+		if (Src1[pos] >= T) {
+			Dest[pos] = 255;
+		} else {
+			Dest[pos] = 0;
+		}
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ClipToRange: D = (S < Tmin) ? Tmin : ((S > Tmax) ? Tmax : S)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param Tmin Lower (inclusive) boundary of the clipping range.
+\param Tmax Upper (inclusive) boundary of the clipping range.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterClipToRangeMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char Tmin,
+								  unsigned char Tmax)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha   	/* save all general-purpose registers (restored by popa below) */
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+			/* ** Duplicate Tmax in 8 bytes of MM3 ** */
+			mov al, Tmax   	/* load Tmax into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm3, eax   	/* copy EAX into MM3 */
+			movd mm4, eax   	/* copy EAX into MM4 */
+			punpckldq mm3, mm4   	/* fill higher bytes of MM3 with Tmax */
+			psubusb mm1, mm3   	/* store 0xFF - Tmax in MM1 */
+			/* ** Duplicate Tmin in 8 bytes of MM5 ** */
+			mov al, Tmin   	/* load Tmin into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm5, eax   	/* copy EAX into MM5 */
+			movd mm4, eax   	/* copy EAX into MM4 */
+			punpckldq mm5, mm4   	/* fill higher bytes of MM5 with Tmin */
+			movq mm7, mm5   	/* copy MM5 into MM7 */
+			paddusb mm7, mm1   	/* store 0xFF - Tmax + Tmin in MM7 (saturating add) */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time); loop below runs at least once, so SrcLength must be >= 8 */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1030:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		paddusb mm0, mm1   	/* MM0=Src1+(0xFF-Tmax): bytes above Tmax saturate at 0xFF */
+			psubusb mm0, mm7   	/* MM0=MM0-(0xFF-Tmax+Tmin): bytes below Tmin saturate at 0 */
+			paddusb mm0, mm5   	/* MM0=MM0+Tmin: shift back into the Tmin..Tmax range */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1030    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC__ build (i386 and x86_64): same algorithm expressed with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm1 = _m_pcmpeqb(mm1, mm1);	/* generate all 1's in mm1. NOTE(review): mm1 is read in its own initializer */
+	int i;
+	/* Duplicate Tmax in 8 bytes of MM3 */
+	__m64 mm3, mm4;
+	memset(&i, Tmax, 4);			/* replicate Tmax into 4 bytes of i (assumes a 4-byte int - confirm) */
+	mm3 = _m_from_int(i);
+	mm4 = _m_from_int(i);
+	mm3 = _m_punpckldq(mm3, mm4);		/* fill higher bytes of MM3 with Tmax */
+	mm1 = _m_psubusb(mm1, mm3);		/* store 0xFF - Tmax in MM1 */
+        //__m64 mm3 = _m_from_int64(lli); // x86_64 only
+	/* Duplicate Tmin in 8 bytes of MM5 */
+	__m64 mm5, mm7;
+	memset(&i, Tmin, 4);			/* replicate Tmin into 4 bytes of i (assumes a 4-byte int - confirm) */
+	mm5 = _m_from_int(i);
+	mm4 = _m_from_int(i);
+	mm5 = _m_punpckldq(mm5, mm4);		/* fill higher bytes of MM5 with Tmin */
+	mm7 = _m_paddusb(mm5, mm1);	/* store 0xFF - Tmax + Tmin in MM7 (saturating add) */
+	for (i = 0; i < SrcLength/8; i++) {	/* 8 bytes per iteration; a trailing SrcLength % 8 bytes are not touched */
+		__m64 mm0;
+		mm0 = _m_paddusb(*mSrc1, mm1);	/* MM0=Src1+(0xFF-Tmax): bytes above Tmax saturate at 0xFF */
+		mm0 = _m_psubusb(mm0, mm7);	/* MM0=MM0-(0xFF-Tmax+Tmin): bytes below Tmin saturate at 0 */
+		*mDest = _m_paddusb(mm0, mm5);	/* MM0+Tmin: shift back into the Tmin..Tmax range */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();				/* clean MMX state */
+#endif
+	return (0);	/* MMX path executed */
+#else
+	return (-1);	/* built without USE_MMX: no MMX implementation available */
+#endif
+}
+
+/*!
+\brief Filter using ClipToRange: D = (S < Tmin) ? Tmin : ((S > Tmax) ? Tmax : S)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param Tmin Lower (inclusive) boundary of the clipping range.
+\param Tmax Upper (inclusive) boundary of the clipping range.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin,
+							   unsigned char Tmax)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+
+	/* Validate input parameters: both buffers are required, an empty array is a successful no-op */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: Tmin==0 && Tmax==255 covers every byte value, so nothing is clipped */
+	if ((Tmin == 0) && (Tmax == 255)) {
+		memmove(Dest, Src1, length);	/* destination comes first; memmove also tolerates Src1 == Dest */
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax);
+
+		/* Check for trailing bytes beyond the last full group of 8 */
+		if ((length & 7) > 0) {
+			/* Setup to process only the trailing (length & 7) bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* Length is a multiple of 8 - we are done */
+			return (0);
+		}
+	} else {
+		/* No MMX or fewer than 8 bytes: process the whole array in C */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image: clamp every byte into Tmin..Tmax (Tmin is tested first) */
+	for (i = istart; i < length; i++) {
+		if (*cursrc1 < Tmin) {
+			*curdest = Tmin;
+		} else if (*cursrc1 > Tmax) {
+			*curdest = Tmax;
+		} else {
+			*curdest = *cursrc1;
+		}
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using NormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param Cmin Normalization constant (Cmin).
+\param Cmax Normalization constant (Cmax).
+\param Nmin Normalization constant (Nmin).
+\param Nmax Normalization constant (Nmax).
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterNormalizeLinearMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, int Cmin, int Cmax,
+									  int Nmin, int Nmax)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha   	/* save all general-purpose registers (restored by popa below) */
+			mov ax, WORD PTR Nmax   	/* load Nmax in AX (low 16 bits only) */
+			mov bx, WORD PTR Cmax   	/* load Cmax in BX (low 16 bits only) */
+			sub ax, WORD PTR Nmin   	/* AX = Nmax - Nmin */
+			sub bx, WORD PTR Cmin   	/* BX = Cmax - Cmin */
+			jz             L10311    	/* check division by zero (zero flag from the BX subtraction) */
+			xor dx, dx   	/* prepare for division, zero DX */
+			div               bx    	/* AX = AX/BX (unsigned integer division, remainder discarded) */
+			jmp            L10312
+L10311:
+		mov ax, 255   	/* if div by zero, assume result max byte value */
+L10312:                  	/* ** Duplicate AX in 4 words of MM0 ** */
+		mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm0, eax   	/* copy EAX into MM0 */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			punpckldq mm0, mm1   	/* fill higher words of MM0 with AX */
+			/* ** Duplicate Cmin in 4 words of MM1 ** */
+			mov ax, WORD PTR Cmin   	/* load Cmin into AX */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher words of MM1 with Cmin */
+			/* ** Duplicate Nmin in 4 words of MM2 ** */
+			mov ax, WORD PTR Nmin   	/* load Nmin into AX */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			movd mm3, eax   	/* copy EAX into MM3 */
+			punpckldq mm2, mm3   	/* fill higher words of MM2 with Nmin */
+			pxor mm7, mm7   	/* zero MM7 register */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time); loop below runs at least once, so SrcLength must be >= 8 */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1031:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm7   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm4, mm7   	/* unpack high bytes of Src1 into words */
+			psubusb mm3, mm1   	/* S-Cmin, low  bytes (byte-wise unsigned saturating subtract on the unpacked words) */
+			psubusb mm4, mm1   	/* S-Cmin, high bytes (byte-wise unsigned saturating subtract on the unpacked words) */
+			pmullw mm3, mm0   	/* MM0*(S-Cmin), low  bytes */
+			pmullw mm4, mm0   	/* MM0*(S-Cmin), high bytes */
+			paddusb mm3, mm2   	/* MM0*(S-Cmin)+Nmin, low  bytes (byte-wise unsigned saturating add) */
+			paddusb mm4, mm2   	/* MM0*(S-Cmin)+Nmin, high bytes (byte-wise unsigned saturating add) */
+			/* ** Take abs value of the signed words ** */
+			movq mm5, mm3   	/* copy mm3 into mm5 */
+			movq mm6, mm4   	/* copy mm4 into mm6 */
+			psraw mm5, 15   	/* fill mm5 words with word sign bit */
+			psraw mm6, 15   	/* fill mm6 words with word sign bit */
+			pxor mm3, mm5   	/* take 1's complement of only neg words */
+			pxor mm4, mm6   	/* take 1's complement of only neg words */
+			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
+			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1031    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* GCC__ build (i386 and x86_64): same algorithm expressed with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0, mm1, mm2, mm3;	/* this mm3 is only a scratch for the setup; the loop declares its own mm3 */
+
+	int i;
+	/* Duplicate (Nmax-Nmin)/(Cmax-Cmin) in 4 words of MM0 */
+	unsigned short a = Nmax - Nmin;	/* truncated to 16 bits */
+	unsigned short b = Cmax - Cmin;	/* truncated to 16 bits */
+	if (b == 0) {
+	    a = 255;	/* if div by zero, assume result max byte value */
+	} else {
+	    a /= b;	/* integer division, remainder discarded */
+	}
+	i = (a<<16)|a;
+	mm0 = _m_from_int(i);
+	mm1 = _m_from_int(i);
+	mm0 = _m_punpckldq(mm0, mm1);			/* fill higher words of MM0 with the factor */
+	/* Duplicate Cmin in 4 words of MM1 */
+	i = (Cmin<<16)|(short)Cmin;
+	mm1 = _m_from_int(i);
+	mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher words of MM1 with Cmin */
+	/* Duplicate Nmin in 4 words of MM2 */
+	i = (Nmin<<16)|(short)Nmin;
+	mm2 = _m_from_int(i);
+	mm3 = _m_from_int(i);
+	mm2 = _m_punpckldq(mm2, mm3);			/* fill higher words of MM2 with Nmin */
+	__m64 mm7 = _m_from_int(0);			/* zero mm7 register */
+	for (i = 0; i < SrcLength/8; i++) {		/* 8 bytes per iteration; a trailing SrcLength % 8 bytes are not touched */
+		__m64 mm3, mm4, mm5, mm6;		/* mm3 here shadows the function-level mm3 */
+		mm3 = _m_punpcklbw(*mSrc1, mm7);	/* unpack low  bytes of Src1 into words */
+		mm4 = _m_punpckhbw(*mSrc1, mm7);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_psubusb(mm3, mm1);		/* S-Cmin, low	bytes */
+		mm4 = _m_psubusb(mm4, mm1);		/* S-Cmin, high bytes */
+		mm3 = _m_pmullw(mm3, mm0);		/* MM0*(S-Cmin), low  bytes */
+		mm4 = _m_pmullw(mm4, mm0);		/* MM0*(S-Cmin), high bytes */
+		mm3 = _m_paddusb(mm3, mm2);		/* MM0*(S-Cmin)+Nmin, low  bytes */
+		mm4 = _m_paddusb(mm4, mm2);		/* MM0*(S-Cmin)+Nmin, high bytes */
+		/* Take abs value of the signed words */
+		mm5 = _m_psrawi(mm3, 15);		/* fill mm5 words with word sign bit */
+		mm6 = _m_psrawi(mm4, 15);		/* fill mm6 words with word sign bit */
+		mm3 = _m_pxor(mm3, mm5);		/* take 1's complement of only neg. words */
+		mm4 = _m_pxor(mm4, mm6);		/* take 1's complement of only neg. words */
+		mm3 = _m_psubsw(mm3, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
+		mm4 = _m_psubsw(mm4, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
+		*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);	/* MMX path executed */
+#else
+	return (-1);	/* built without USE_MMX: no MMX implementation available */
+#endif
+}
+
+/*!
+\brief Filter using NormalizeLinear: D = saturation0and255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)
+
+The scale factor is the integer quotient (Nmax - Nmin)/(Cmax - Cmin). If Cmax == Cmin a factor of 255
+is used instead, which is the value the MMX routine substitutes for a zero divisor.
+
+When MMX is available the bulk of the array is processed in groups of 8 bytes by the MMX routine;
+the remaining bytes, or the whole array otherwise, are processed by the C loop.
+
+\param Src Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param Cmin Normalization constant (subtracted from each source byte).
+\param Cmax Normalization constant (Cmax - Cmin is the divisor of the scale factor).
+\param Nmin Normalization constant (added to each scaled value).
+\param Nmax Normalization constant (Nmax - Nmin is the dividend of the scale factor).
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
+								   int Nmax)
+{
+	unsigned int i, istart;
+	int dN, dC, factor;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Unless the MMX routine succeeds, the C loop processes the whole image */
+	istart = 0;
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+		/*
+		 * Skip the bytes handled by the MMX routine only when it reports success. It can
+		 * return -1 without touching Dest, and those bytes must then be processed here.
+		 */
+		if (SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax) == 0) {
+			/* No unaligned bytes - we are done */
+			if ((length & 7) == 0)
+				return (0);
+
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+		}
+	}
+
+	/* C routine to process image */
+	dC = Cmax - Cmin;
+	dN = Nmax - Nmin;
+	/* A zero divisor must not be divided by; use the MMX routine's stand-in factor */
+	factor = (dC == 0) ? 255 : (dN / dC);
+	for (i = istart; i < length; i++) {
+		result = factor * ((int) Src[i] - Cmin) + Nmin;
+		/* Saturate at both ends: a negative value would wrap when narrowed to a byte */
+		if (result > 255)
+			result = 255;
+		else if (result < 0)
+			result = 0;
+		Dest[i] = (unsigned char) result;
+	}
+
+	return (0);
+}
+
+/* ------------------------------------------------------------------------------------ */
+
+/*!
+\brief Filter using ConvolveKernel3x3Divide: Dij = saturation0and255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >2.
+\param columns Number of columns in source/destination array. Must be >2.
+\param Kernel The 2D convolution kernel of size 3x3, read as 3 rows of 4 words (4th word of each row is 0).
+\param Divisor The divisor of the convolution sum. Must be >0.
+
+Note: Non-MMX implementation not available for this function. Only the MMX assembly (built when
+USE_MMX and i386 are defined) applies the filter; it skips the first and last row and column.
+
+\return Returns 0 if filter was applied, -1 otherwise.
+*/
+int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+										   signed short *Kernel, unsigned char Divisor)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 3) || (rows < 3) || (Divisor == 0))
+		return (-1);
+
+	/* The filter exists only as x86 MMX assembly; if that is compiled out, report an error */
+#if defined(USE_MMX) && defined(i386)
+	if ((SDL_imageFilterMMXdetect())) {
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, Divisor   	/* load Divisor into BL */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				movq mm5, [edx]   	/* MM5 = {0,K2,K1,K0} */
+			add edx, 8   	/* second row              |K0 K1 K2 0| */
+				movq mm6, [edx]   	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
+			add edx, 8   	/* third row               |K6 K7 K8 0| */
+				movq mm7, [edx]   	/* MM7 = {0,K8,K7,K6} */
+			/* ---, */
+			mov eax, columns   	/* load columns into EAX */
+				mov esi, Src   	/* ESI = Src row 0 address */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, eax   	/* EDI = EDI + columns */
+				inc              edi    	/* 1 byte offset from the left edge */
+				mov edx, rows   	/* initialize ROWS counter */
+				sub edx, 2   	/* do not use first and last row */
+				/* ---, */
+L10320:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 2   	/* do not use first and last column */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10322:
+			/* ---, */
+			movq mm1, [esi]   	/* load 8 bytes of the image first row */
+			add esi, eax   	/* move one row below */
+				movq mm2, [esi]   	/* load 8 bytes of the image second row */
+			add esi, eax   	/* move one row below */
+				movq mm3, [esi]   	/* load 8 bytes of the image third row */
+			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
+				punpcklbw mm2, mm0   	/* unpack first 4 bytes into words */
+				punpcklbw mm3, mm0   	/* unpack first 4 bytes into words */
+				pmullw mm1, mm5   	/* multiply words first row  image*Kernel */
+				pmullw mm2, mm6   	/* multiply words second row image*Kernel */
+				pmullw mm3, mm7   	/* multiply words third row  image*Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the first and second rows */
+				paddsw mm1, mm3   	/* add 4 words of the third row and result */
+				movq mm2, mm1   	/* copy MM1 into MM2 */
+				psrlq mm1, 32   	/* shift 2 left words to the right */
+				paddsw mm1, mm2   	/* add 2 left and 2 right result words */
+				movq mm3, mm1   	/* copy MM1 into MM3 */
+				psrlq mm1, 16   	/* shift 1 left word to the right */
+				paddsw mm1, mm3   	/* add 1 left and 1 right result words */
+				/* --, */
+				movd mm2, eax   	/* save EAX in MM2 */
+				movd mm3, edx   	/* save EDX in MM3 */
+				movd eax, mm1   	/* copy MM1 into EAX */
+				psraw mm1, 15   	/* spread sign bit of the result */
+				movd edx, mm1   	/* fill EDX with a sign bit */
+				idiv bx    	/* IDIV - VERY EXPENSIVE */
+				movd mm1, eax   	/* move result of division into MM1 */
+				packuswb mm1, mm0   	/* pack division result with saturation */
+				movd eax, mm1   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd edx, mm3   	/* restore saved EDX */
+				movd eax, mm2   	/* restore saved EAX */
+				/* --, */
+				sub esi, eax   	/* move two rows up */
+				sub esi, eax   	/* */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10322    	/* check loop termination, proceed if required */
+				add esi, 2   	/* move to the next row in Src */
+				add edi, 2   	/* move to the next row in Dest */
+				dec              edx    	/* decrease loop counter ROWS */
+				jnz            L10320    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"movq    (%%edx), %%mm5 \n\t"	/* MM5 = {0,K2,K1,K0} */
+			"add          $8, %%edx \n\t"	/* second row              |K0 K1 K2 0| */
+			"movq    (%%edx), %%mm6 \n\t"	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
+			"add          $8, %%edx \n\t"	/* third row               |K6 K7 K8 0| */
+			"movq    (%%edx), %%mm7 \n\t"	/* MM7 = {0,K8,K7,K6} */
+			/* --- */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
+			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
+			"mov          %2, %%edx \n\t"	/* initialize ROWS counter */
+			"sub          $2, %%edx \n\t"	/* do not use first and last row */
+			/* --- */
+			".L10320:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $2, %%ecx \n\t"	/* do not use first and last column */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10322:               \n\t"
+			/* --- */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the image first row */
+			"add       %%eax, %%esi \n\t"	/* move one row below */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes of the image second row */
+			"add       %%eax, %%esi \n\t"	/* move one row below */
+			"movq    (%%esi), %%mm3 \n\t"	/* load 8 bytes of the image third row */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack first 4 bytes into words */
+			"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack first 4 bytes into words */
+			"pmullw    %%mm5, %%mm1 \n\t"	/* multiply words first row  image*Kernel */
+			"pmullw    %%mm6, %%mm2 \n\t"	/* multiply words second row image*Kernel */
+			"pmullw    %%mm7, %%mm3 \n\t"	/* multiply words third row  image*Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the first and second rows */
+			"paddsw    %%mm3, %%mm1 \n\t"	/* add 4 words of the third row and result */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"psrlq       $32, %%mm1 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm1, %%mm3 \n\t"	/* copy MM1 into MM3 */
+			"psrlq       $16, %%mm1 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm3, %%mm1 \n\t"	/* add 1 left and 1 right result words */
+			/* -- */
+			"movd      %%eax, %%mm2 \n\t"	/* save EAX in MM2 */
+			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
+			"movd      %%mm1, %%eax \n\t"	/* copy MM1 into EAX */
+			"psraw       $15, %%mm1 \n\t"	/* spread sign bit of the result */
+			"movd      %%mm1, %%edx \n\t"	/* fill EDX with a sign bit */
+			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
+			"movd      %%eax, %%mm1 \n\t"	/* move result of division into MM1 */
+			"packuswb  %%mm0, %%mm1 \n\t"	/* pack division result with saturation */
+			"movd      %%mm1, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
+			"movd      %%mm2, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"sub       %%eax, %%esi \n\t"	/* move two rows up */
+			"sub       %%eax, %%esi \n\t"	/* */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10322 \n\t"	/* check loop termination, proceed if required */
+			"add          $2, %%esi \n\t"	/* move to the next row in Src */
+			"add          $2, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%edx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10320 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(Divisor)		/* %5 */
+			);
+#endif
+		return (0);
+	}
+#endif
+	/* No non-MMX implementation yet: nothing was written to Dest */
+	return (-1);
+}
+
+/*!
+\brief Filter using ConvolveKernel5x5Divide: Dij = saturation0and255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >4.
+\param columns Number of columns in source/destination array. Must be >4.
+\param Kernel The 2D convolution kernel of size 5x5, read as 5 rows of 8 words (16 bytes per row).
+\param Divisor The divisor of the convolution sum. Must be >0.
+
+Note: Non-MMX implementation not available for this function. Only the MMX assembly (built when
+USE_MMX and i386 are defined) applies the filter; it skips the first 2 and last 2 rows and columns.
+
+\return Returns 0 if filter was applied, -1 otherwise.
+*/
+int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+										   signed short *Kernel, unsigned char Divisor)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 5) || (rows < 5) || (Divisor == 0))
+		return (-1);
+
+	/* The filter exists only as x86 MMX assembly; if that is compiled out, report an error */
+#if defined(USE_MMX) && defined(i386)
+	if ((SDL_imageFilterMMXdetect())) {
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, Divisor   	/* load Divisor into BL */
+				movd mm5, ebx   	/* copy Divisor into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 2   	/* 2 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				shl eax, 1   	/* EAX = columns * 2 */
+				add edi, eax   	/* 2 row offset from the top edge */
+				shr eax, 1   	/* EAX = columns */
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 4   	/* do not use first 2 and last 2 rows */
+				/* ---, */
+L10330:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 4   	/* do not use first 2 and last 2 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10332:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				/* ---, */
+				movd mm1, eax   	/* save EAX in MM1 */
+				movd mm2, ebx   	/* save EBX in MM2 */
+				movd mm3, edx   	/* save EDX in MM3 */
+				movd eax, mm7   	/* load summation result into EAX */
+				psraw mm7, 15   	/* spread sign bit of the result */
+				movd ebx, mm5   	/* load Divisor into EBX */
+				movd edx, mm7   	/* fill EDX with a sign bit */
+				idiv bx    	/* IDIV - VERY EXPENSIVE */
+				movd mm7, eax   	/* move result of division into MM7 */
+				packuswb mm7, mm0   	/* pack division result with saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd edx, mm3   	/* restore saved EDX */
+				movd ebx, mm2   	/* restore saved EBX */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 72   	/* EDX = Kernel address */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10332    	/* check loop termination, proceed if required */
+				add esi, 4   	/* move to the next row in Src */
+				add edi, 4   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10330    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $2, %%edi \n\t"	/* 2 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"shl          $1, %%eax \n\t"	/* EAX = columns * 2 */
+			"add       %%eax, %%edi \n\t"	/* 2 row offset from the top edge */
+			"shr          $1, %%eax \n\t"	/* EAX = columns */
+			"mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $4, %%ebx \n\t"	/* do not use first 2 and last 2 rows */
+			/* --- */
+			".L10330:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $4, %%ecx \n\t"	/* do not use first 2 and last 2 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10332:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			/* --- */
+			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
+			"movd      %%ebx, %%mm2 \n\t"	/* save EBX in MM2 */
+			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
+			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
+			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
+			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
+			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
+			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
+			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
+			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub         $72, %%edx \n\t"	/* EDX = Kernel address */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10332 \n\t"	/* check loop termination, proceed if required */
+			"add          $4, %%esi \n\t"	/* move to the next row in Src */
+			"add          $4, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10330 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(Divisor)		/* %5 */
+			);
+#endif
+		return (0);
+	}
+#endif
+	/* No non-MMX implementation yet: nothing was written to Dest */
+	return (-1);
+}
+
+/*!
+\brief Filter using ConvolveKernel7x7Divide: Dij = saturation0and255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >6.
+\param columns Number of columns in source/destination array. Must be >6.
+\param Kernel The 2D convolution kernel of size 7x7.
+\param Divisor The divisor of the convolution sum. Must be >0.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 1 if filter was applied, 0 otherwise.
+*/
+int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+										   signed short *Kernel, unsigned char Divisor)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 7) || (rows < 7) || (Divisor == 0))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, Divisor   	/* load Divisor into BL */
+				movd mm5, ebx   	/* copy Divisor into MM5 */
+				mov edx, Kernel  	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 3   	/* 3 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				add edi, eax   	/* 3 row offset from the top edge */
+				add edi, eax
+				add edi, eax
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 6   	/* do not use first 3 and last 3 rows */
+				/* ---, */
+L10340:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 6   	/* do not use first 3 and last 3 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10342:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 6 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				/* ---, */
+				movd mm1, eax   	/* save EDX in MM1 */
+				movd mm2, ebx   	/* save EDX in MM2 */
+				movd mm3, edx   	/* save EDX in MM3 */
+				movd eax, mm7   	/* load summation result into EAX */
+				psraw mm7, 15   	/* spread sign bit of the result */
+				movd ebx, mm5   	/* load Divisor into EBX */
+				movd edx, mm7   	/* fill EDX with a sign bit */
+				idiv bx    	/* IDIV - VERY EXPENSIVE */
+				movd mm7, eax   	/* move result of division into MM7 */
+				packuswb mm7, mm0   	/* pack division result with saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd edx, mm3   	/* restore saved EDX */
+				movd ebx, mm2   	/* restore saved EBX */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 104   	/* EDX = Kernel address */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10342    	/* check loop termination, proceed if required */
+				add esi, 6   	/* move to the next row in Src */
+				add edi, 6   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10340    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $3, %%edi \n\t"	/* 3 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"add       %%eax, %%edi \n\t"	/* 3 row offset from the top edge */
+			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $6, %%ebx \n\t"	/* do not use first 3 and last 3 rows */
+			/* --- */
+			".L10340:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $6, %%ecx \n\t"	/* do not use first 3 and last 3 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10342:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 6 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			/* --- */
+			"movd      %%eax, %%mm1 \n\t"	/* save EDX in MM1 */
+			"movd      %%ebx, %%mm2 \n\t"	/* save EDX in MM2 */
+			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
+			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
+			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
+			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
+			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
+			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
+			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
+			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub        $104, %%edx \n\t"	/* EDX = Kernel address */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10342 \n\t"	/* check loop termination, proceed if required */
+			"add          $6, %%esi \n\t"	/* move to the next row in Src */
+			"add          $6, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10340 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(Divisor)		/* %5 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using ConvolveKernel9x9Divide: Dij = saturation0and255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >8.
+\param columns Number of columns in source/destination array. Must be >8.
+\param Kernel The 2D convolution kernel of size 9x9.
+\param Divisor The divisor of the convolution sum. Must be >0.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 0 for success or -1 for error (e.g. NULL pointers, rows/columns < 9, or Divisor == 0).
+*/
+int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+										   signed short *Kernel, unsigned char Divisor)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 9) || (rows < 9) || (Divisor == 0))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, Divisor   	/* load Divisor into BL */
+				movd mm5, ebx   	/* copy Divisor into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 4   	/* 4 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				add edi, eax   	/* 4 row offset from the top edge */
+				add edi, eax
+				add edi, eax
+				add edi, eax
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 8   	/* do not use first 4 and last 4 rows */
+				/* ---, */
+L10350:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 8   	/* do not use first 4 and last 4 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10352:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 6 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 8 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 9 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm3, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				/* ---, */
+				movd mm1, eax   	/* save EDX in MM1 */
+				movd mm2, ebx   	/* save EDX in MM2 */
+				movd mm3, edx   	/* save EDX in MM3 */
+				movd eax, mm7   	/* load summation result into EAX */
+				psraw mm7, 15   	/* spread sign bit of the result */
+				movd ebx, mm5   	/* load Divisor into EBX */
+				movd edx, mm7   	/* fill EDX with a sign bit */
+				idiv bx    	/* IDIV - VERY EXPENSIVE */
+				movd mm7, eax   	/* move result of division into MM7 */
+				packuswb mm7, mm0   	/* pack division result with saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd edx, mm3   	/* restore saved EDX */
+				movd ebx, mm2   	/* restore saved EBX */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 208   	/* EDX = Kernel address */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10352    	/* check loop termination, proceed if required */
+				add esi, 8   	/* move to the next row in Src */
+				add edi, 8   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10350    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $4, %%edi \n\t"	/* 4 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"add       %%eax, %%edi \n\t"	/* 4 row offset from the top edge */
+			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $8, %%ebx \n\t"	/* do not use first 4 and last 4 rows */
+			/* --- */
+			".L10350:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $8, %%ecx \n\t"	/* do not use first 4 and last 4 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10352:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 6 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 8 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 9 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			/* --- */
+			"movd      %%eax, %%mm1 \n\t"	/* save EDX in MM1 */
+			"movd      %%ebx, %%mm2 \n\t"	/* save EDX in MM2 */
+			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
+			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
+			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
+			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
+			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
+			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
+			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
+			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub        $208, %%edx \n\t"	/* EDX = Kernel address */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10352 \n\t"	/* check loop termination, proceed if required */
+			"add          $8, %%esi \n\t"	/* move to the next row in Src */
+			"add          $8, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10350 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(Divisor)		/* %5 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using ConvolveKernel3x3ShiftRight: Dij = saturation0and255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >2.
+\param columns Number of columns in source/destination array. Must be >2.
+\param Kernel The 2D convolution kernel of size 3x3, read as 3 rows of 4 signed 16-bit words each (8-byte row stride); the 4th word of every row also enters the sum, so it should be 0.
+\param NRightShift The number of right bit shifts applied to each source pixel before it is multiplied with the kernel. Must be <=7.
+
+Note: Non-MMX implementation not available for this function. The first/last row and first/last column of Dest are not written.
+
+\return Returns 0 if filter was applied, -1 otherwise (invalid parameters or MMX not detected).
+*/
+int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+											   signed short *Kernel, unsigned char NRightShift)
+{
+	/* Validate input parameters: all three pointers must be non-NULL */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 3) || (rows < 3) || (NRightShift > 7))	/* image smaller than the 3x3 kernel, or shift count above 7 */
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm4, ebx   	/* copy NRightShift into MM4 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				movq mm5, [edx]   	/* MM5 = {0,K2,K1,K0} */
+			add edx, 8   	/* second row              |K0 K1 K2 0| */
+				movq mm6, [edx]   	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
+			add edx, 8   	/* third row               |K6 K7 K8 0| */
+				movq mm7, [edx]   	/* MM7 = {0,K8,K7,K6} */
+			/* ---, */
+			mov eax, columns   	/* load columns into EAX */
+				mov esi, Src   	/* ESI = Src row 0 address */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, eax   	/* EDI = EDI + columns (skip the first row of Dest) */
+				inc              edi    	/* 1 byte offset from the left edge */
+				mov edx, rows   	/* initialize ROWS counter */
+				sub edx, 2   	/* do not use first and last row */
+				/* ---, */
+L10360:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 2   	/* do not use first and last column */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10362:
+			/* ---, */
+			movq mm1, [esi]   	/* load 8 bytes of the image first row */
+			add esi, eax   	/* move one row below */
+				movq mm2, [esi]   	/* load 8 bytes of the image second row */
+			add esi, eax   	/* move one row below */
+				movq mm3, [esi]   	/* load 8 bytes of the image third row */
+			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
+				punpcklbw mm2, mm0   	/* unpack first 4 bytes into words */
+				punpcklbw mm3, mm0   	/* unpack first 4 bytes into words */
+				psrlw mm1, mm4   	/* shift right each pixel word by NRightShift bits */
+				psrlw mm2, mm4   	/* shift right each pixel word by NRightShift bits */
+				psrlw mm3, mm4   	/* shift right each pixel word by NRightShift bits */
+				pmullw mm1, mm5   	/* multiply words first row  image*Kernel */
+				pmullw mm2, mm6   	/* multiply words second row image*Kernel */
+				pmullw mm3, mm7   	/* multiply words third row  image*Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the first and second rows */
+				paddsw mm1, mm3   	/* add 4 words of the third row and result */
+				movq mm2, mm1   	/* copy MM1 into MM2 */
+				psrlq mm1, 32   	/* shift 2 left words to the right */
+				paddsw mm1, mm2   	/* add 2 left and 2 right result words */
+				movq mm3, mm1   	/* copy MM1 into MM3 */
+				psrlq mm1, 16   	/* shift 1 left word to the right */
+				paddsw mm1, mm3   	/* add 1 left and 1 right result words; low word now holds the full sum */
+				packuswb mm1, mm0   	/* pack summed result into bytes with unsigned saturation */
+				movd ebx, mm1   	/* copy saturated result into EBX */
+				mov [edi], bl   	/* copy a byte result into Dest */
+				/* --, */
+				sub esi, eax   	/* move two rows up */
+				sub esi, eax
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10362    	/* check loop termination, proceed if required */
+				add esi, 2   	/* move to the next row in Src */
+				add edi, 2   	/* move to the next row in Dest */
+				dec              edx    	/* decrease loop counter ROWS */
+				jnz            L10360    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm4 \n\t"	/* copy NRightShift into MM4 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"movq    (%%edx), %%mm5 \n\t"	/* MM5 = {0,K2,K1,K0} */
+			"add          $8, %%edx \n\t"	/* second row              |K0 K1 K2 0| */
+			"movq    (%%edx), %%mm6 \n\t"	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
+			"add          $8, %%edx \n\t"	/* third row               |K6 K7 K8 0| */
+			"movq    (%%edx), %%mm7 \n\t"	/* MM7 = {0,K8,K7,K6} */
+			/* --- */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns (skip the first row of Dest) */
+			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
+			"mov          %2, %%edx \n\t"	/* initialize ROWS counter */
+			"sub          $2, %%edx \n\t"	/* do not use first and last row */
+			/* --- */
+			".L10360:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $2, %%ecx \n\t"	/* do not use first and last column */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10362:               \n\t"
+			/* --- */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the image first row */
+			"add       %%eax, %%esi \n\t"	/* move one row below */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes of the image second row */
+			"add       %%eax, %%esi \n\t"	/* move one row below */
+			"movq    (%%esi), %%mm3 \n\t"	/* load 8 bytes of the image third row */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack first 4 bytes into words */
+			"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack first 4 bytes into words */
+			"psrlw     %%mm4, %%mm1 \n\t"	/* shift right each pixel word by NRightShift bits */
+			"psrlw     %%mm4, %%mm2 \n\t"	/* shift right each pixel word by NRightShift bits */
+			"psrlw     %%mm4, %%mm3 \n\t"	/* shift right each pixel word by NRightShift bits */
+			"pmullw    %%mm5, %%mm1 \n\t"	/* multiply words first row  image*Kernel */
+			"pmullw    %%mm6, %%mm2 \n\t"	/* multiply words second row image*Kernel */
+			"pmullw    %%mm7, %%mm3 \n\t"	/* multiply words third row  image*Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the first and second rows */
+			"paddsw    %%mm3, %%mm1 \n\t"	/* add 4 words of the third row and result */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"psrlq       $32, %%mm1 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm1, %%mm3 \n\t"	/* copy MM1 into MM3 */
+			"psrlq       $16, %%mm1 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm3, %%mm1 \n\t"	/* add 1 left and 1 right result words; low word now holds the full sum */
+			"packuswb  %%mm0, %%mm1 \n\t"	/* pack summed result into bytes with unsigned saturation */
+			"movd      %%mm1, %%ebx \n\t"	/* copy saturated result into EBX */
+			"mov      %%bl, (%%edi) \n\t"	/* copy a byte result into Dest */
+			/* -- */
+			"sub       %%eax, %%esi \n\t"	/* move two rows up */
+			"sub       %%eax, %%esi \n\t" "inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10362 \n\t"	/* check loop termination, proceed if required */
+			"add          $2, %%esi \n\t"	/* move to the next row in Src */
+			"add          $2, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%edx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10360 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(NRightShift)	/* %5 */
+			);
+#endif
+#endif
+		return (0);	/* NOTE: also reached, with Dest untouched, when the assembly above is compiled out (USE_MMX or i386 undefined) */
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using ConvolveKernel5x5ShiftRight: Dij = saturation0and255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >4.
+\param columns Number of columns in source/destination array. Must be >4.
+\param Kernel The 2D convolution kernel of size 5x5.
+\param NRightShift The number of right bit shifts to apply to the convolution sum. Must be <=7.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 1 if filter was applied, 0 otherwise.
+*/
+int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+											   signed short *Kernel, unsigned char NRightShift)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 5) || (rows < 5) || (NRightShift > 7))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm5, ebx   	/* copy NRightShift into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 2   	/* 2 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				shl eax, 1   	/* EAX = columns * 2 */
+				add edi, eax   	/* 2 row offset from the top edge */
+				shr eax, 1   	/* EAX = columns */
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 4   	/* do not use first 2 and last 2 rows */
+				/* ---, */
+L10370:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 4   	/* do not use first 2 and last 2 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10372:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				movd mm1, eax   	/* save EAX in MM1 */
+				packuswb mm7, mm0   	/* pack division result with saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 72   	/* EDX = Kernel address */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10372    	/* check loop termination, proceed if required */
+				add esi, 4   	/* move to the next row in Src */
+				add edi, 4   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10370    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $2, %%edi \n\t"	/* 2 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"shl          $1, %%eax \n\t"	/* EAX = columns * 2 */
+			"add       %%eax, %%edi \n\t"	/* 2 row offset from the top edge */
+			"shr          $1, %%eax \n\t"	/* EAX = columns */
+			"mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $4, %%ebx \n\t"	/* do not use first 2 and last 2 rows */
+			/* --- */
+			".L10370:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $4, %%ecx \n\t"	/* do not use first 2 and last 2 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10372:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub         $72, %%edx \n\t"	/* EDX = Kernel address */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10372 \n\t"	/* check loop termination, proceed if required */
+			"add          $4, %%esi \n\t"	/* move to the next row in Src */
+			"add          $4, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10370 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(NRightShift)	/* %5 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using ConvolveKernel7x7ShiftRight: Dij = saturation0and255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >6.
+\param columns Number of columns in source/destination array. Must be >6.
+\param Kernel The 7x7 convolution kernel, read as 7 rows of 8 signed words (16-byte row stride); the 8th word of each row should be 0.
+\param NRightShift The number of right bit shifts applied to each source pixel before it is multiplied by the kernel. Must be <=7.
+
+Note: Non-MMX implementation not available for this function. Only the interior of Dest (3-pixel border excluded) is written.
+Note(review): every row load reads 8 source bytes, so the last window appears to read 1 byte past rows*columns - confirm Src padding.
+\return Returns 0 for success or -1 for error (invalid parameters, or no MMX implementation usable in this build/CPU).
+*/
+int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+											   signed short *Kernel, unsigned char NRightShift)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 7) || (rows < 7) || (NRightShift > 7))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm5, ebx   	/* copy NRightShift into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 3   	/* 3 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				add edi, eax   	/* 3 row offset from the top edge */
+				add edi, eax
+				add edi, eax
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 6   	/* do not use first 3 and last 3 rows */
+				/* ---, */
+L10380:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 6   	/* do not use first 3 and last 3 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10382:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- source/kernel row 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- source/kernel row 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- source/kernel row 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- source/kernel row 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- source/kernel row 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- source/kernel row 6 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- source/kernel row 7 (last row: Src pointer is not advanced) */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				movd mm1, eax   	/* save EAX in MM1 */
+				packuswb mm7, mm0   	/* pack result words into bytes with unsigned saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 104   	/* rewind EDX to the Kernel start (13 steps of 8 bytes) */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10382    	/* check loop termination, proceed if required */
+				add esi, 6   	/* move to the next row in Src */
+				add edi, 6   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10380    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $3, %%edi \n\t"	/* 3 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"add       %%eax, %%edi \n\t"	/* 3 row offset from the top edge */
+			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $6, %%ebx \n\t"	/* do not use first 3 and last 3 rows */
+			/* --- */
+			".L10380:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $6, %%ecx \n\t"	/* do not use first 3 and last 3 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10382:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- source/kernel row 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- source/kernel row 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- source/kernel row 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- source/kernel row 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- source/kernel row 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- source/kernel row 6 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- source/kernel row 7 (last row: Src pointer is not advanced) */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack result words into bytes with unsigned saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub        $104, %%edx \n\t"	/* rewind EDX to the Kernel start (13 steps of 8 bytes) */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10382 \n\t"	/* check loop termination, proceed if required */
+			"add          $6, %%esi \n\t"	/* move to the next row in Src */
+			"add          $6, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10380 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(NRightShift)	/* %5 */
+			);
+#endif
+		return (0);	/* the MMX code was compiled in and has run */
+#endif
+	}
+
+	/* No non-MMX implementation yet, or the MMX code was not compiled into this build */
+	return (-1);
+}
+
+/*!
+\brief Filter using ConvolveKernel9x9ShiftRight: Dij = saturation255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >8.
+\param columns Number of columns in source/destination array. Must be >8.
+\param Kernel The 2D convolution kernel of size 9x9.
+\param NRightShift The number of right bit shifts applied to each source pixel before it is multiplied by the kernel. Must be <=7.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+											   signed short *Kernel, unsigned char NRightShift)
+{
+	/* Validate input parameters: all three buffers are required */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+	/* The image must hold at least one full 9x9 window; shift counts above 7 are rejected */
+	if ((columns < 9) || (rows < 9) || (NRightShift > 7))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+		/* MMX code is compiled only when USE_MMX and i386 are both defined. NOTE(review): it reads 12 kernel words (24 bytes) per row, 216 bytes in all - confirm callers lay out Kernel accordingly. */
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm5, ebx   	/* copy NRightShift into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 4   	/* 4 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				add edi, eax   	/* 4 row offset from the top edge */
+				add edi, eax
+				add edi, eax
+				add edi, eax
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 8   	/* do not use first 4 and last 4 rows */
+				/* ---, */
+L10390:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 8   	/* do not use first 4 and last 4 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10392:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 6 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 8 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 9 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* advance Src pointer by 1 byte */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm3, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				movd mm1, eax   	/* save EAX in MM1 */
+				packuswb mm7, mm0   	/* pack division result with saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 208   	/* EDX = Kernel address */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10392    	/* check loop termination, proceed if required */
+				add esi, 8   	/* move to the next row in Src */
+				add edi, 8   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10390    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $4, %%edi \n\t"	/* 4 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"add       %%eax, %%edi \n\t"	/* 4 row offset from the top edge */
+			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $8, %%ebx \n\t"	/* do not use first 4 and last 4 rows */
+			/* --- */
+			".L10390:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $8, %%ecx \n\t"	/* do not use first 4 and last 4 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10392:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 6 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 8 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 9 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* advance Src pointer by 1 byte */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub        $208, %%edx \n\t"	/* EDX = Kernel address */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10392 \n\t"	/* check loop termination, proceed if required */
+			"add          $8, %%esi \n\t"	/* move to the next row in Src */
+			"add          $8, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10390 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(NRightShift)	/* %5 */
+			);
+#endif
+		return (0);
+#endif
+	}
+	/* Reached when MMX is unavailable or the MMX code above was not compiled in: */
+	/* Dest is untouched, so report failure (there is no non-MMX implementation yet). */
+	return (-1);
+}
+
+/* ------------------------------------------------------------------------------------ */
+
+/*!
+\brief Filter using SobelX: Dij = saturation255( ... ) 
+
+\param Src The source 2D byte array to sobel-filter. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >2.
+\param columns Number of columns in source/destination array. Must be >7.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 0 if the MMX code path was taken, -1 on invalid parameters or if MMX is not detected.
+*/
+int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL))
+		return(-1);
+
+	if ((columns < 8) || (rows < 3))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				mov eax, columns   	/* load columns into EAX */
+				/* ---, */
+				mov esi, Src   	/* ESI = Src row 0 address */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, eax   	/* EDI = EDI + columns */
+				inc              edi    	/* 1 byte offset from the left edge */
+				mov edx, rows   	/* initialize ROWS counter */
+				sub edx, 2   	/* do not use first and last rows */
+				/* ---, */
+L10400:
+			mov ecx, eax   	/* initialize COLUMS counter */
+				shr ecx, 3   	/* ECX/8 (MMX loads 8 bytes at a time) */
+				mov ebx, esi   	/* save ESI in EBX */
+				movd mm1, edi   	/* save EDI in MM1 */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10402:
+			/* ---, */
+			movq mm4, [esi]   	/* load 8 bytes from Src */
+			movq mm5, mm4   	/* save MM4 in MM5 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm4, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm5, mm0   	/* unpack 4 high bytes into words */
+				movq mm6, [esi]   	/* load 8 bytes from Src */
+			movq mm7, mm6   	/* save MM6 in MM7 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm6, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm7, mm0   	/* unpack 4 high bytes into words */
+				add esi, eax   	/* move to the next row of Src */
+				movq mm2, [esi]   	/* load 8 bytes from Src */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				paddw mm4, mm2   	/* add 4 low  bytes to accumolator MM4 */
+				paddw mm5, mm3   	/* add 4 high bytes to accumolator MM5 */
+				paddw mm4, mm2   	/* add 4 low  bytes to accumolator MM4 */
+				paddw mm5, mm3   	/* add 4 high bytes to accumolator MM5 */
+				movq mm2, [esi]   	/* load 8 bytes from Src */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				paddw mm6, mm2   	/* add 4 low  bytes to accumolator MM6 */
+				paddw mm7, mm3   	/* add 4 high bytes to accumolator MM7 */
+				paddw mm6, mm2   	/* add 4 low  bytes to accumolator MM6 */
+				paddw mm7, mm3   	/* add 4 high bytes to accumolator MM7 */
+				add esi, eax   	/* move to the next row of Src */
+				movq mm2, [esi]   	/* load 8 bytes from Src */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				paddw mm4, mm2   	/* add 4 low  bytes to accumolator MM4 */
+				paddw mm5, mm3   	/* add 4 high bytes to accumolator MM5 */
+				movq mm2, [esi]   	/* load 8 bytes from Src */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				paddw mm6, mm2   	/* add 4 low  bytes to accumolator MM6 */
+				paddw mm7, mm3   	/* add 4 high bytes to accumolator MM7 */
+				/* ---, */
+				movq mm2, mm4   	/* copy MM4 into MM2 */
+				psrlq mm4, 32   	/* shift 2 left words to the right */
+				psubw mm4, mm2   	/* MM4 = MM4 - MM2 */
+				movq mm3, mm6   	/* copy MM6 into MM3 */
+				psrlq mm6, 32   	/* shift 2 left words to the right */
+				psubw mm6, mm3   	/* MM6 = MM6 - MM3 */
+				punpckldq mm4, mm6   	/* combine 2 words of MM6 and 2 words of MM4 */
+				movq mm2, mm5   	/* copy MM5 into MM2 */
+				psrlq mm5, 32   	/* shift 2 left words to the right */
+				psubw mm5, mm2   	/* MM5 = MM5 - MM2 */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				psubw mm7, mm3   	/* MM7 = MM7 - MM3 */
+				punpckldq mm5, mm7   	/* combine 2 words of MM7 and 2 words of MM5 */
+				/* Take abs values of MM4 and MM5 */
+				movq mm6, mm4   	/* copy MM4 into MM6 */
+				movq mm7, mm5   	/* copy MM5 into MM7 */
+				psraw mm6, 15   	/* fill MM6 words with word sign bit */
+				psraw mm7, 15   	/* fill MM7 words with word sign bit */
+				pxor mm4, mm6   	/* take 1's compliment of only neg words */
+				pxor mm5, mm7   	/* take 1's compliment of only neg words */
+				psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+				psubsw mm5, mm7   	/* add 1 to only neg words, W-(-1) or W-0 */
+				packuswb mm4, mm5   	/* combine and pack/saturate MM5 and MM4 */
+				movq [edi], mm4   	/* store result in Dest */
+				/* ---, */
+				sub esi, eax   	/* move to the current top row in Src */
+				sub esi, eax
+				add esi, 8   	/* move Src  pointer to the next 8 pixels */
+				add edi, 8   	/* move Dest pointer to the next 8 pixels */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10402    	/* check loop termination, proceed if required */
+				mov esi, ebx   	/* restore most left current row Src  address */
+				movd edi, mm1   	/* restore most left current row Dest address */
+				add esi, eax   	/* move to the next row in Src */
+				add edi, eax   	/* move to the next row in Dest */
+				dec              edx    	/* decrease loop counter ROWS */
+				jnz            L10400    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			/* --- */
+			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
+			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
+			"mov          %2, %%edx \n\t"	/* initialize ROWS counter */
+			"sub          $2, %%edx \n\t"	/* do not use first and last rows */
+			/* --- */
+			".L10400:                \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMS counter */
+			"shr          $3, %%ecx \n\t"	/* ECX/8 (MMX loads 8 bytes at a time) */
+			"mov       %%esi, %%ebx \n\t"	/* save ESI in EBX */
+			"movd      %%edi, %%mm1 \n\t"	/* save EDI in MM1 */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10402:               \n\t"
+			/* --- */
+			"movq    (%%esi), %%mm4 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm4, %%mm5 \n\t"	/* save MM4 in MM5 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm4 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm5 \n\t"	/* unpack 4 high bytes into words */
+			"movq    (%%esi), %%mm6 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm6, %%mm7 \n\t"	/* save MM6 in MM7 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm6 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm7 \n\t"	/* unpack 4 high bytes into words */
+			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumolator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumolator MM5 */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumolator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumolator MM5 */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumolator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumolator MM7 */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumolator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumolator MM7 */
+			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumolator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumolator MM5 */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumolator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumolator MM7 */
+			/* --- */
+			"movq      %%mm4, %%mm2 \n\t"	/* copy MM4 into MM2 */
+			"psrlq       $32, %%mm4 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm2, %%mm4 \n\t"	/* MM4 = MM4 - MM2 */
+			"movq      %%mm6, %%mm3 \n\t"	/* copy MM6 into MM3 */
+			"psrlq       $32, %%mm6 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm3, %%mm6 \n\t"	/* MM6 = MM6 - MM3 */
+			"punpckldq %%mm6, %%mm4 \n\t"	/* combine 2 words of MM6 and 2 words of MM4 */
+			"movq      %%mm5, %%mm2 \n\t"	/* copy MM5 into MM2 */
+			"psrlq       $32, %%mm5 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm2, %%mm5 \n\t"	/* MM5 = MM5 - MM2 */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm3, %%mm7 \n\t"	/* MM7 = MM7 - MM3 */
+			"punpckldq %%mm7, %%mm5 \n\t"	/* combine 2 words of MM7 and 2 words of MM5 */
+			/* Take abs values of MM4 and MM5 */
+			"movq      %%mm4, %%mm6 \n\t"	/* copy MM4 into MM6 */
+			"movq      %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
+			"psraw       $15, %%mm6 \n\t"	/* fill MM6 words with word sign bit */
+			"psraw       $15, %%mm7 \n\t"	/* fill MM7 words with word sign bit */
+			"pxor      %%mm6, %%mm4 \n\t"	/* take 1's compliment of only neg. words */
+			"pxor      %%mm7, %%mm5 \n\t"	/* take 1's compliment of only neg. words */
+			"psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
+			"psubsw    %%mm7, %%mm5 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
+			"packuswb  %%mm5, %%mm4 \n\t"	/* combine and pack/saturate MM5 and MM4 */
+			"movq    %%mm4, (%%edi) \n\t"	/* store result in Dest */
+			/* --- */
+			"sub       %%eax, %%esi \n\t"	/* move to the current top row in Src */
+			"sub       %%eax, %%esi \n\t" "add $8,          %%esi \n\t"	/* move Src  pointer to the next 8 pixels */
+			"add $8,          %%edi \n\t"	/* move Dest pointer to the next 8 pixels */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10402 \n\t"	/* check loop termination, proceed if required */
+			"mov       %%ebx, %%esi \n\t"	/* restore most left current row Src  address */
+			"movd      %%mm1, %%edi \n\t"	/* restore most left current row Dest address */
+			"add       %%eax, %%esi \n\t"	/* move to the next row in Src */
+			"add       %%eax, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%edx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10400 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns)		/* %3 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using SobelXShiftRight: Dij = saturation255( ... ) 
+
+\param Src The source 2D byte array to sobel-filter. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >2.
+\param columns Number of columns in source/destination array. Must be >7.
+\param NRightShift The number of right bit shifts to apply to the filter sum. Must be <8.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 0 if the MMX code path was taken, -1 on invalid parameters or if MMX is not detected.
+*/
+int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+									unsigned char NRightShift)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL))
+		return(-1);
+	if ((columns < 8) || (rows < 3) || (NRightShift > 7))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				mov eax, columns   	/* load columns into EAX */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm1, ebx   	/* copy NRightShift into MM1 */
+				/* ---, */
+				mov esi, Src   	/* ESI = Src row 0 address */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, eax   	/* EDI = EDI + columns */
+				inc              edi    	/* 1 byte offset from the left edge */
+				/* initialize ROWS counter */
+				sub rows, 2   	/* do not use first and last rows */
+				/* ---, */
+L10410:
+			mov ecx, eax   	/* initialize COLUMS counter */
+				shr ecx, 3   	/* ECX/8 (MMX loads 8 bytes at a time) */
+				mov ebx, esi   	/* save ESI in EBX */
+				mov edx, edi   	/* save EDI in EDX */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10412:
+			/* ---, */
+			movq mm4, [esi]   	/* load 8 bytes from Src */
+			movq mm5, mm4   	/* save MM4 in MM5 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm4, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm5, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm4, mm1   	/* shift right each pixel NshiftRight times */
+				psrlw mm5, mm1   	/* shift right each pixel NshiftRight times */
+				movq mm6, [esi]   	/* load 8 bytes from Src */
+			movq mm7, mm6   	/* save MM6 in MM7 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm6, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm7, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm6, mm1   	/* shift right each pixel NshiftRight times */
+				psrlw mm7, mm1   	/* shift right each pixel NshiftRight times */
+				add esi, eax   	/* move to the next row of Src */
+				movq mm2, [esi]   	/* load 8 bytes from Src */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm2, mm1   	/* shift right each pixel NshiftRight times */
+				psrlw mm3, mm1   	/* shift right each pixel NshiftRight times */
+				paddw mm4, mm2   	/* add 4 low  bytes to accumolator MM4 */
+				paddw mm5, mm3   	/* add 4 high bytes to accumolator MM5 */
+				paddw mm4, mm2   	/* add 4 low  bytes to accumolator MM4 */
+				paddw mm5, mm3   	/* add 4 high bytes to accumolator MM5 */
+				movq mm2, [esi]   	/* load 8 bytes from Src */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm2, mm1   	/* shift right each pixel NshiftRight times */
+				psrlw mm3, mm1   	/* shift right each pixel NshiftRight times */
+				paddw mm6, mm2   	/* add 4 low  bytes to accumolator MM6 */
+				paddw mm7, mm3   	/* add 4 high bytes to accumolator MM7 */
+				paddw mm6, mm2   	/* add 4 low  bytes to accumolator MM6 */
+				paddw mm7, mm3   	/* add 4 high bytes to accumolator MM7 */
+				add esi, eax   	/* move to the next row of Src */
+				movq mm2, [esi]   	/* load 8 bytes from Src */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm2, mm1   	/* shift right each pixel NshiftRight times */
+				psrlw mm3, mm1   	/* shift right each pixel NshiftRight times */
+				paddw mm4, mm2   	/* add 4 low  bytes to accumolator MM4 */
+				paddw mm5, mm3   	/* add 4 high bytes to accumolator MM5 */
+				movq mm2, [esi]   	/* load 8 bytes from Src */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm2, mm1   	/* shift right each pixel NshiftRight times */
+				psrlw mm3, mm1   	/* shift right each pixel NshiftRight times */
+				paddw mm6, mm2   	/* add 4 low  bytes to accumolator MM6 */
+				paddw mm7, mm3   	/* add 4 high bytes to accumolator MM7 */
+				/* ---, */
+				movq mm2, mm4   	/* copy MM4 into MM2 */
+				psrlq mm4, 32   	/* shift 2 left words to the right */
+				psubw mm4, mm2   	/* MM4 = MM4 - MM2 */
+				movq mm3, mm6   	/* copy MM6 into MM3 */
+				psrlq mm6, 32   	/* shift 2 left words to the right */
+				psubw mm6, mm3   	/* MM6 = MM6 - MM3 */
+				punpckldq mm4, mm6   	/* combine 2 words of MM6 and 2 words of MM4 */
+				movq mm2, mm5   	/* copy MM5 into MM2 */
+				psrlq mm5, 32   	/* shift 2 left words to the right */
+				psubw mm5, mm2   	/* MM5 = MM5 - MM2 */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				psubw mm7, mm3   	/* MM7 = MM7 - MM3 */
+				punpckldq mm5, mm7   	/* combine 2 words of MM7 and 2 words of MM5 */
+				/* Take abs values of MM4 and MM5 */
+				movq mm6, mm4   	/* copy MM4 into MM6 */
+				movq mm7, mm5   	/* copy MM5 into MM7 */
+				psraw mm6, 15   	/* fill MM6 words with word sign bit */
+				psraw mm7, 15   	/* fill MM7 words with word sign bit */
+				pxor mm4, mm6   	/* take 1's compliment of only neg words */
+				pxor mm5, mm7   	/* take 1's compliment of only neg words */
+				psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+				psubsw mm5, mm7   	/* add 1 to only neg words, W-(-1) or W-0 */
+				packuswb mm4, mm5   	/* combine and pack/saturate MM5 and MM4 */
+				movq [edi], mm4   	/* store result in Dest */
+				/* ---, */
+				sub esi, eax   	/* move to the current top row in Src */
+				sub esi, eax
+				add esi, 8   	/* move Src  pointer to the next 8 pixels */
+				add edi, 8   	/* move Dest pointer to the next 8 pixels */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10412    	/* check loop termination, proceed if required */
+				mov esi, ebx   	/* restore most left current row Src  address */
+				mov edi, edx   	/* restore most left current row Dest address */
+				add esi, eax   	/* move to the next row in Src */
+				add edi, eax   	/* move to the next row in Dest */
+				dec rows    	/* decrease loop counter ROWS */
+				jnz            L10410    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %4, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm1 \n\t"	/* copy NRightShift into MM1 */
+			/* --- */
+			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
+			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
+			/* initialize ROWS counter */
+			"subl            $2, %2 \n\t"	/* do not use first and last rows */
+			/* --- */
+			".L10410:                \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMS counter */
+			"shr          $3, %%ecx \n\t"	/* ECX/8 (MMX loads 8 bytes at a time) */
+			"mov       %%esi, %%ebx \n\t"	/* save ESI in EBX */
+			"mov       %%edi, %%edx \n\t"	/* save EDI in EDX */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10412:               \n\t"
+			/* --- */
+			"movq    (%%esi), %%mm4 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm4, %%mm5 \n\t"	/* save MM4 in MM5 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm4 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm5 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm4 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm1, %%mm5 \n\t"	/* shift right each pixel NshiftRight times */
+			"movq    (%%esi), %%mm6 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm6, %%mm7 \n\t"	/* save MM6 in MM7 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm6 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm7 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm6 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm1, %%mm7 \n\t"	/* shift right each pixel NshiftRight times */
+			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NshiftRight times */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumolator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumolator MM5 */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumolator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumolator MM5 */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NshiftRight times */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumolator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumolator MM7 */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumolator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumolator MM7 */
+			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NshiftRight times */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumolator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumolator MM5 */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NshiftRight times */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumolator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumolator MM7 */
+			/* --- */
+			"movq      %%mm4, %%mm2 \n\t"	/* copy MM4 into MM2 */
+			"psrlq       $32, %%mm4 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm2, %%mm4 \n\t"	/* MM4 = MM4 - MM2 */
+			"movq      %%mm6, %%mm3 \n\t"	/* copy MM6 into MM3 */
+			"psrlq       $32, %%mm6 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm3, %%mm6 \n\t"	/* MM6 = MM6 - MM3 */
+			"punpckldq %%mm6, %%mm4 \n\t"	/* combine 2 words of MM6 and 2 words of MM4 */
+			"movq      %%mm5, %%mm2 \n\t"	/* copy MM5 into MM2 */
+			"psrlq       $32, %%mm5 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm2, %%mm5 \n\t"	/* MM5 = MM5 - MM2 */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm3, %%mm7 \n\t"	/* MM7 = MM7 - MM3 */
+			"punpckldq %%mm7, %%mm5 \n\t"	/* combine 2 words of MM7 and 2 words of MM5 */
+			/* Take abs values of MM4 and MM5 */
+			"movq      %%mm4, %%mm6 \n\t"	/* copy MM4 into MM6 */
+			"movq      %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
+			"psraw       $15, %%mm6 \n\t"	/* fill MM6 words with word sign bit */
+			"psraw       $15, %%mm7 \n\t"	/* fill MM7 words with word sign bit */
+			"pxor      %%mm6, %%mm4 \n\t"	/* take 1's compliment of only neg. words */
+			"pxor      %%mm7, %%mm5 \n\t"	/* take 1's compliment of only neg. words */
+			"psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
+			"psubsw    %%mm7, %%mm5 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
+			"packuswb  %%mm5, %%mm4 \n\t"	/* combine and pack/saturate MM5 and MM4 */
+			"movq    %%mm4, (%%edi) \n\t"	/* store result in Dest */
+			/* --- */
+			"sub       %%eax, %%esi \n\t"	/* move to the current top row in Src */
+			"sub       %%eax, %%esi \n\t" "add $8,          %%esi \n\t"	/* move Src  pointer to the next 8 pixels */
+			"add $8,          %%edi \n\t"	/* move Dest pointer to the next 8 pixels */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10412 \n\t"	/* check loop termination, proceed if required */
+			"mov       %%ebx, %%esi \n\t"	/* restore most left current row Src  address */
+			"mov       %%edx, %%edi \n\t"	/* restore most left current row Dest address */
+			"add       %%eax, %%esi \n\t"	/* move to the next row in Src */
+			"add       %%eax, %%edi \n\t"	/* move to the next row in Dest */
+			"decl                %2 \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10410 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(NRightShift)	/* %4 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Align stack to 32 byte boundary.
+*/
+void SDL_imageFilterAlignStack(void)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{				/* --- stack alignment --- */
+		mov ebx, esp   	/* load ESP into EBX */
+			sub ebx, 4   	/* reserve space on stack for old value of ESP */
+			and ebx, -32   	/* align EBX along a 32 byte boundary */
+			mov [ebx], esp   	/* save old value of ESP in stack, behind the bndry */
+			mov esp, ebx   	/* align ESP along a 32 byte boundary */
+	}
+#else
+	asm volatile
+		(				/* --- stack alignment --- */
+		"mov       %%esp, %%ebx \n\t"	/* load ESP into EBX */
+		"sub          $4, %%ebx \n\t"	/* reserve space on stack for old value of ESP */
+		"and        $-32, %%ebx \n\t"	/* align EBX along a 32 byte boundary */
+		"mov     %%esp, (%%ebx) \n\t"	/* save old value of ESP in stack, behind the bndry */
+		"mov       %%ebx, %%esp \n\t"	/* align ESP along a 32 byte boundary */
+		::);
+#endif
+#endif
+}
+
+/*!
+\brief Restore previously aligned stack.
+*/
+void SDL_imageFilterRestoreStack(void)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{				/* --- restoring old stack --- */
+		mov ebx, [esp]   	/* load old value of ESP */
+		mov esp, ebx   	/* restore old value of ESP */
+	}
+#else
+	asm volatile
+		(				/* --- restoring old stack --- */
+		"mov     (%%esp), %%ebx \n\t"	/* load old value of ESP */
+		"mov       %%ebx, %%esp \n\t"	/* restore old value of ESP */
+		::);
+#endif
+#endif
+}
diff --git a/lib/sdl2_gfx/src/SDL2_rotozoom.c b/lib/sdl2_gfx/src/SDL2_rotozoom.c
new file mode 100644
index 0000000..1a6ba15
--- /dev/null
+++ b/lib/sdl2_gfx/src/SDL2_rotozoom.c
@@ -0,0 +1,1663 @@
+/*  
+
+SDL2_rotozoom.c: rotozoomer, zoomer and shrinker for 32bit or 8bit surfaces
+
+Copyright (C) 2012-2014  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifdef WIN32
+#include <windows.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "SDL2_rotozoom.h"
+
+/* ---- Internally used structures */
+
+/*!
+\brief A 32 bit RGBA pixel, accessed as four consecutive 8 bit components.
+*/
+typedef struct tColorRGBA {
+	Uint8 r;	/* 1st byte of the pixel in memory */
+	Uint8 g;	/* 2nd byte of the pixel in memory */
+	Uint8 b;	/* 3rd byte of the pixel in memory */
+	Uint8 a;	/* 4th byte of the pixel in memory */
+} tColorRGBA;
+
+/*!
+\brief An 8 bit Y/palette pixel (a single byte per pixel).
+*/
+typedef struct tColorY {
+	Uint8 y;	/* the pixel value (brightness or palette index) */
+} tColorY;
+
+/*! 
+\brief Returns maximum of two numbers a and b (macro: an argument may be evaluated twice, so avoid side effects).
+*/
+#define MAX(a,b)    (((a) > (b)) ? (a) : (b))
+
+/*! 
+\brief Number of guard rows added to destination surfaces.
+
+This is a simple but effective workaround for observed issues.
+The guard rows are allocated as extra memory and are then hidden from the surface.
+They are appended to the end of destination surfaces when those are allocated.
+This absorbs potential overflows which seem to happen with
+just the right src image dimensions and scale/rotation and which could
+otherwise lead to a segfault.
+*/
+#define GUARD_ROWS (2)
+
+/*!
+\brief Lower limit of the absolute zoom factor, and of the absolute rotation in degrees.
+*/
+#define VALUE_LIMIT	0.001
+
+/*!
+\brief Returns colorkey info for a surface (the result of SDL_GetColorKey is not checked; key starts out as 0).
+*/
+Uint32 _colorkey(SDL_Surface *src)
+{
+	Uint32 key = 0; 
+	SDL_GetColorKey(src, &key);
+	return key;
+}
+
+
+/*! 
+\brief Internal 32 bit integer-factor averaging Shrinker.
+
+Shrinks 32 bit RGBA/ABGR 'src' surface to 'dst' surface.
+Averages the color and alpha values of each factorx by factory box of src pixels to calculate one dst pixel.
+Assumes src and dst surfaces are of 32 bit depth.
+Assumes dst surface was allocated with the correct dimensions (no bounds checks are made on src reads).
+
+\param src The surface to shrink (input).
+\param dst The shrunken surface (output).
+\param factorx The horizontal shrinking ratio (source columns averaged per destination pixel).
+\param factory The vertical shrinking ratio (source rows averaged per destination pixel).
+
+\return Always 0; this routine has no error path.
+*/
+int _shrinkSurfaceRGBA(SDL_Surface * src, SDL_Surface * dst, int factorx, int factory)
+{
+	int x, y, dx, dy, dgap, ra, ga, ba, aa;
+	int n_average;
+	tColorRGBA *sp, *osp, *oosp;
+	tColorRGBA *dp;
+
+	/*
+	* Averaging integer shrink
+	*/
+
+	/* Precalculate division factor (number of source pixels per box; used as divisor below) */
+	n_average = factorx*factory;
+
+	/*
+	* Scan destination
+	*/
+	sp = (tColorRGBA *) src->pixels;
+	
+	dp = (tColorRGBA *) dst->pixels;
+	dgap = dst->pitch - dst->w * 4;	/* padding bytes at the end of each dst row */
+
+	for (y = 0; y < dst->h; y++) {
+
+		osp=sp;	/* remember start of this row of boxes */
+		for (x = 0; x < dst->w; x++) {
+
+			/* Trace out source box and accumulate */
+			oosp=sp;	/* remember top-left pixel of this box */
+			ra=ga=ba=aa=0;
+			for (dy=0; dy < factory; dy++) {
+				for (dx=0; dx < factorx; dx++) {
+					ra += sp->r;
+					ga += sp->g;
+					ba += sp->b;
+					aa += sp->a;
+
+					sp++;
+				} 
+				/* src dx loop done: move to the first pixel of the box in the next source row */
+				sp = (tColorRGBA *)((Uint8*)sp + (src->pitch - 4*factorx)); // next y
+			}
+			/* src dy loop */
+
+			/* next box-x: step factorx pixels right from the box's top-left pixel */
+			sp = (tColorRGBA *)((Uint8*)oosp + 4*factorx);
+
+			/* Store result in destination */
+			dp->r = ra/n_average;
+			dp->g = ga/n_average;
+			dp->b = ba/n_average;
+			dp->a = aa/n_average;
+
+			/*
+			* Advance destination pointer 
+			*/
+			dp++;
+		} 
+		/* dst x loop */
+
+		/* next box-y: step factory source rows down from the start of this row of boxes */
+		sp = (tColorRGBA *)((Uint8*)osp + src->pitch*factory);
+
+		/*
+		* Advance destination pointers 
+		*/
+		dp = (tColorRGBA *) ((Uint8 *) dp + dgap);
+	} 
+	/* dst y loop */
+
+	return (0);
+}
+
+/*! 
+\brief Internal 8 bit integer-factor averaging shrinker.
+
+Shrinks 8bit Y 'src' surface to 'dst' surface.
+Averages the color (brightness) values of each factorx by factory box of src pixels to calculate one dst pixel.
+Assumes src and dst surfaces are of 8 bit depth.
+Assumes dst surface was allocated with the correct dimensions (no bounds checks are made on src reads).
+
+\param src The surface to shrink (input).
+\param dst The shrunken surface (output).
+\param factorx The horizontal shrinking ratio (source columns averaged per destination pixel).
+\param factory The vertical shrinking ratio (source rows averaged per destination pixel).
+
+\return Always 0; this routine has no error path.
+*/
+int _shrinkSurfaceY(SDL_Surface * src, SDL_Surface * dst, int factorx, int factory)
+{
+	int x, y, dx, dy, dgap, a;
+	int n_average;
+	Uint8 *sp, *osp, *oosp;
+	Uint8 *dp;
+
+	/*
+	* Averaging integer shrink
+	*/
+
+	/* Precalculate division factor (number of source pixels per box; used as divisor below) */
+	n_average = factorx*factory;
+
+	/*
+	* Scan destination
+	*/
+	sp = (Uint8 *) src->pixels;
+
+	dp = (Uint8 *) dst->pixels;
+	dgap = dst->pitch - dst->w;	/* padding bytes at the end of each dst row */
+
+	for (y = 0; y < dst->h; y++) {    
+
+		osp=sp;	/* remember start of this row of boxes */
+		for (x = 0; x < dst->w; x++) {
+
+			/* Trace out source box and accumulate */
+			oosp=sp;	/* remember top-left pixel of this box */
+			a=0;
+			for (dy=0; dy < factory; dy++) {
+				for (dx=0; dx < factorx; dx++) {
+					a += (*sp);
+					/* next x */           
+					sp++;
+				} 
+				/* end src dx loop */         
+				/* next y: first pixel of the box in the next source row */
+				sp = (Uint8 *)((Uint8*)sp + (src->pitch - factorx)); 
+			} 
+			/* end src dy loop */
+
+			/* next box-x: step factorx pixels right from the box's top-left pixel */
+			sp = (Uint8 *)((Uint8*)oosp + factorx);
+
+			/* Store result in destination */
+			*dp = a/n_average;
+
+			/*
+			* Advance destination pointer 
+			*/
+			dp++;
+		} 
+		/* end dst x loop */
+
+		/* next box-y: step factory source rows down from the start of this row of boxes */
+		sp = (Uint8 *)((Uint8*)osp + src->pitch*factory);
+
+		/*
+		* Advance destination pointers 
+		*/
+		dp = (Uint8 *)((Uint8 *)dp + dgap);
+	} 
+	/* end dst y loop */
+
+	return (0);
+}
+
+/*! 
+\brief Internal 32 bit Zoomer with optional anti-aliasing by bilinear interpolation.
+
+Zooms 32 bit RGBA/ABGR 'src' surface to 'dst' surface.
+Assumes src and dst surfaces are of 32 bit depth.
+Assumes dst surface was allocated with the correct dimensions (a 1 pixel wide or high dst is handled).
+
+\param src The surface to zoom (input).
+\param dst The zoomed surface (output).
+\param flipx Flag indicating if the image should be horizontally flipped.
+\param flipy Flag indicating if the image should be vertically flipped.
+\param smooth Antialiasing flag; set to SMOOTHING_ON to enable.
+
+\return 0 for success or -1 if the temporary increment tables could not be allocated.
+*/
+int _zoomSurfaceRGBA(SDL_Surface * src, SDL_Surface * dst, int flipx, int flipy, int smooth)
+{
+	int x, y, sx, sy, ssx, ssy, *sax, *say, *csax, *csay, *salast, csx, csy, ex, ey, cx, cy, sstep, sstepx, sstepy;
+	tColorRGBA *c00, *c01, *c10, *c11;
+	tColorRGBA *sp, *csp, *dp;
+	int spixelgap, spixelw, spixelh, dgap, t1, t2;
+
+	/*
+	* Allocate memory for row/column increments (one extra entry each, read when stepping past the last pixel)
+	*/
+	if ((sax = (int *) malloc((dst->w + 1) * sizeof(Uint32))) == NULL) {
+		return (-1);
+	}
+	if ((say = (int *) malloc((dst->h + 1) * sizeof(Uint32))) == NULL) {
+		free(sax);
+		return (-1);
+	}
+
+	/*
+	* Precalculate row increments (16.16 fixed point source step per destination pixel)
+	*/
+	spixelw = (src->w - 1);
+	spixelh = (src->h - 1);
+	if (smooth) {
+		sx = (int) (65536.0 * (float) spixelw / (float) MAX(dst->w - 1, 1));	/* MAX avoids dividing by zero for a 1 pixel wide dst */
+		sy = (int) (65536.0 * (float) spixelh / (float) MAX(dst->h - 1, 1));	/* MAX avoids dividing by zero for a 1 pixel high dst */
+	} else {
+		sx = (int) (65536.0 * (float) (src->w) / (float) (dst->w));
+		sy = (int) (65536.0 * (float) (src->h) / (float) (dst->h));
+	}
+
+	/* Maximum scaled source size */
+	ssx = (src->w << 16) - 1;
+	ssy = (src->h << 16) - 1;
+
+	/* Precalculate horizontal row increments */
+	csx = 0;
+	csax = sax;
+	for (x = 0; x <= dst->w; x++) {
+		*csax = csx;
+		csax++;
+		csx += sx;
+
+		/* Guard from overflows */
+		if (csx > ssx) { 
+			csx = ssx; 
+		}
+	}
+
+	/* Precalculate vertical row increments */
+	csy = 0;
+	csay = say;
+	for (y = 0; y <= dst->h; y++) {
+		*csay = csy;
+		csay++;
+		csy += sy;
+
+		/* Guard from overflows */
+		if (csy > ssy) {
+			csy = ssy;
+		}
+	}
+
+	sp = (tColorRGBA *) src->pixels;
+	dp = (tColorRGBA *) dst->pixels;
+	dgap = dst->pitch - dst->w * 4;
+	spixelgap = src->pitch/4;
+
+	if (flipx) sp += spixelw;	/* start at the last source column when mirroring horizontally */
+	if (flipy) sp += (spixelgap * spixelh);	/* start at the last source row when mirroring vertically */
+
+	/*
+	* Switch between interpolating and non-interpolating code 
+	*/
+	if (smooth) {
+
+		/*
+		* Interpolating Zoom 
+		*/
+		csay = say;
+		for (y = 0; y < dst->h; y++) {
+			csp = sp;
+			csax = sax;
+			for (x = 0; x < dst->w; x++) {
+				/*
+				* Setup color source pointers 
+				*/
+				ex = (*csax & 0xffff);
+				ey = (*csay & 0xffff);
+				cx = (*csax >> 16);
+				cy = (*csay >> 16);
+				sstepx = cx < spixelw;	/* only look at the neighbouring column if one exists */
+				sstepy = cy < spixelh;	/* only look at the neighbouring row if one exists */
+				c00 = sp;
+				c01 = sp;
+				c10 = sp;
+				if (sstepy) {
+					if (flipy) {
+						c10 -= spixelgap;
+					} else {
+						c10 += spixelgap;
+					}
+				}
+				c11 = c10;
+				if (sstepx) {
+					if (flipx) {
+						c01--;
+						c11--;
+					} else {
+						c01++;
+						c11++;
+					}
+				}
+
+				/*
+				* Draw and interpolate colors 
+				*/
+				t1 = ((((c01->r - c00->r) * ex) >> 16) + c00->r) & 0xff;
+				t2 = ((((c11->r - c10->r) * ex) >> 16) + c10->r) & 0xff;
+				dp->r = (((t2 - t1) * ey) >> 16) + t1;
+				t1 = ((((c01->g - c00->g) * ex) >> 16) + c00->g) & 0xff;
+				t2 = ((((c11->g - c10->g) * ex) >> 16) + c10->g) & 0xff;
+				dp->g = (((t2 - t1) * ey) >> 16) + t1;
+				t1 = ((((c01->b - c00->b) * ex) >> 16) + c00->b) & 0xff;
+				t2 = ((((c11->b - c10->b) * ex) >> 16) + c10->b) & 0xff;
+				dp->b = (((t2 - t1) * ey) >> 16) + t1;
+				t1 = ((((c01->a - c00->a) * ex) >> 16) + c00->a) & 0xff;
+				t2 = ((((c11->a - c10->a) * ex) >> 16) + c10->a) & 0xff;
+				dp->a = (((t2 - t1) * ey) >> 16) + t1;				
+				/*
+				* Advance source pointer x
+				*/
+				salast = csax;
+				csax++;				
+				sstep = (*csax >> 16) - (*salast >> 16);
+				if (flipx) {
+					sp -= sstep;
+				} else {
+					sp += sstep;
+				}
+
+				/*
+				* Advance destination pointer x
+				*/
+				dp++;
+			}
+			/*
+			* Advance source pointer y
+			*/
+			salast = csay;
+			csay++;
+			sstep = (*csay >> 16) - (*salast >> 16);
+			sstep *= spixelgap;
+			if (flipy) { 
+				sp = csp - sstep;
+			} else {
+				sp = csp + sstep;
+			}
+
+			/*
+			* Advance destination pointer y
+			*/
+			dp = (tColorRGBA *) ((Uint8 *) dp + dgap);
+		}
+	} else {
+		/*
+		* Non-Interpolating Zoom 
+		*/		
+		csay = say;
+		for (y = 0; y < dst->h; y++) {
+			csp = sp;
+			csax = sax;
+			for (x = 0; x < dst->w; x++) {
+				/*
+				* Draw 
+				*/
+				*dp = *sp;
+
+				/*
+				* Advance source pointer x
+				*/
+				salast = csax;
+				csax++;				
+				sstep = (*csax >> 16) - (*salast >> 16);
+				if (flipx) sstep = -sstep;
+				sp += sstep;
+
+				/*
+				* Advance destination pointer x
+				*/
+				dp++;
+			}
+			/*
+			* Advance source pointer y
+			*/
+			salast = csay;
+			csay++;
+			sstep = (*csay >> 16) - (*salast >> 16);
+			sstep *= spixelgap;
+			if (flipy) sstep = -sstep;			
+			sp = csp + sstep;
+
+			/*
+			* Advance destination pointer y
+			*/
+			dp = (tColorRGBA *) ((Uint8 *) dp + dgap);
+		}
+	}
+
+	/*
+	* Remove temp arrays 
+	*/
+	free(sax);
+	free(say);
+
+	return (0);
+}
+
+/*! 
+
+\brief Internal 8 bit Zoomer without smoothing.
+
+Zooms 8bit palette/Y 'src' surface to 'dst' surface.
+Assumes src and dst surfaces are of 8 bit depth.
+Assumes dst surface was allocated with the correct dimensions.
+
+\param src The surface to zoom (input).
+\param dst The zoomed surface (output).
+\param flipx Flag indicating if the image should be horizontally flipped.
+\param flipy Flag indicating if the image should be vertically flipped.
+
+\return 0 for success or -1 if the temporary increment tables could not be allocated.
+*/
+int _zoomSurfaceY(SDL_Surface * src, SDL_Surface * dst, int flipx, int flipy)
+{
+	int x, y;
+	int *sax, *say, *csax, *csay;	/* signed: the per-pixel steps are negative when flipping */
+	int csx, csy;
+	Uint8 *sp, *dp, *csp;
+	int dgap;
+
+	/*
+	* Allocate memory for row increments 
+	*/
+	if ((sax = (int *) malloc((dst->w + 1) * sizeof(int))) == NULL) {
+		return (-1);
+	}
+	if ((say = (int *) malloc((dst->h + 1) * sizeof(int))) == NULL) {
+		free(sax);
+		return (-1);
+	}
+
+	/*
+	* Pointer setup 
+	*/
+	sp = csp = (Uint8 *) src->pixels;
+	dp = (Uint8 *) dst->pixels;
+	dgap = dst->pitch - dst->w;
+
+	if (flipx) csp += (src->w-1);	/* start at the last source column when mirroring horizontally */
+	if (flipy) csp  = ( (Uint8*)csp + src->pitch*(src->h-1) );	/* start at the last source row when mirroring vertically */
+
+	/*
+	* Precalculate row increments (whole source pixels to step per destination pixel)
+	*/
+	csx = 0;
+	csax = sax;
+	for (x = 0; x < dst->w; x++) {
+		csx += src->w;
+		*csax = 0;
+		while (csx >= dst->w) {
+			csx -= dst->w;
+			(*csax)++;
+		}
+		(*csax) = (*csax) * (flipx ? -1 : 1);	/* walk backwards through the source row when flipped */
+		csax++;
+	}
+	csy = 0;
+	csay = say;
+	for (y = 0; y < dst->h; y++) {
+		csy += src->h;
+		*csay = 0;
+		while (csy >= dst->h) {
+			csy -= dst->h;
+			(*csay)++;
+		}
+		(*csay) = (*csay) * (flipy ? -1 : 1);	/* walk backwards through the source rows when flipped */
+		csay++;
+	}
+
+	/*
+	* Draw 
+	*/
+	csay = say;
+	for (y = 0; y < dst->h; y++) {
+		csax = sax;
+		sp = csp;
+		for (x = 0; x < dst->w; x++) {
+			/*
+			* Draw 
+			*/
+			*dp = *sp;
+			/*
+			* Advance source pointers (signed step, so a flip moves the pointer backwards)
+			*/
+			sp += (*csax);
+			csax++;
+			/*
+			* Advance destination pointer 
+			*/
+			dp++;
+		}
+		/*
+		* Advance source pointer (for row); signed step times pitch 
+		*/
+		csp += ((*csay) * src->pitch);
+		csay++;
+
+		/*
+		* Advance destination pointers 
+		*/
+		dp += dgap;
+	}
+
+	/*
+	* Remove temp arrays 
+	*/
+	free(sax);
+	free(say);
+
+	return (0);
+}
+
+/*! 
+\brief Internal 32 bit rotozoomer with optional anti-aliasing.
+
+Rotates and zooms 32 bit RGBA/ABGR 'src' surface to 'dst' surface based on the control 
+parameters by scanning the destination surface and optionally applying anti-aliasing
+by bilinear interpolation. Destination pixels that map outside the source are not written.
+Assumes src and dst surfaces are of 32 bit depth.
+Assumes dst surface was allocated with the correct dimensions.
+
+\param src Source surface.
+\param dst Destination surface.
+\param cx Horizontal center coordinate.
+\param cy Vertical center coordinate.
+\param isin Integer version of sine of angle (used as a 16.16 fixed point step).
+\param icos Integer version of cosine of angle (used as a 16.16 fixed point step).
+\param flipx Flag indicating horizontal mirroring should be applied.
+\param flipy Flag indicating vertical mirroring should be applied.
+\param smooth Flag indicating anti-aliasing should be used.
+*/
+void _transformSurfaceRGBA(SDL_Surface * src, SDL_Surface * dst, int cx, int cy, int isin, int icos, int flipx, int flipy, int smooth)
+{
+	int x, y, t1, t2, dx, dy, xd, yd, sdx, sdy, ax, ay, ex, ey, sw, sh;
+	tColorRGBA c00, c01, c10, c11, cswap;
+	tColorRGBA *pc, *sp;
+	int gap;
+
+	/*
+	* Variable setup 
+	*/
+	xd = ((src->w - dst->w) << 15);	/* half the size difference, in 16.16 fixed point */
+	yd = ((src->h - dst->h) << 15);
+	ax = (cx << 16) - (icos * cx);
+	ay = (cy << 16) - (isin * cx);
+	sw = src->w - 1;
+	sh = src->h - 1;
+	pc = (tColorRGBA*) dst->pixels;
+	gap = dst->pitch - dst->w * 4;	/* padding bytes at the end of each dst row */
+
+	/*
+	* Switch between interpolating and non-interpolating code 
+	*/
+	if (smooth) {
+		for (y = 0; y < dst->h; y++) {
+			dy = cy - y;
+			sdx = (ax + (isin * dy)) + xd;	/* fixed point source position of the first pixel in this dst row */
+			sdy = (ay - (icos * dy)) + yd;
+			for (x = 0; x < dst->w; x++) {
+				dx = (sdx >> 16);
+				dy = (sdy >> 16);
+				if (flipx) dx = sw - dx;
+				if (flipy) dy = sh - dy;
+				if ((dx > -1) && (dy > -1) && (dx < (src->w-1)) && (dy < (src->h-1))) {	/* need a full 2x2 source neighbourhood */
+					sp = (tColorRGBA *)src->pixels;;
+					sp += ((src->pitch/4) * dy);
+					sp += dx;
+					c00 = *sp;
+					sp += 1;
+					c01 = *sp;
+					sp += (src->pitch/4);
+					c11 = *sp;
+					sp -= 1;
+					c10 = *sp;
+					if (flipx) {
+						cswap = c00; c00=c01; c01=cswap;
+						cswap = c10; c10=c11; c11=cswap;
+					}
+					if (flipy) {
+						cswap = c00; c00=c10; c10=cswap;
+						cswap = c01; c01=c11; c11=cswap;
+					}
+					/*
+					* Interpolate colors (ex/ey are the fractional parts of the source position)
+					*/
+					ex = (sdx & 0xffff);
+					ey = (sdy & 0xffff);
+					t1 = ((((c01.r - c00.r) * ex) >> 16) + c00.r) & 0xff;
+					t2 = ((((c11.r - c10.r) * ex) >> 16) + c10.r) & 0xff;
+					pc->r = (((t2 - t1) * ey) >> 16) + t1;
+					t1 = ((((c01.g - c00.g) * ex) >> 16) + c00.g) & 0xff;
+					t2 = ((((c11.g - c10.g) * ex) >> 16) + c10.g) & 0xff;
+					pc->g = (((t2 - t1) * ey) >> 16) + t1;
+					t1 = ((((c01.b - c00.b) * ex) >> 16) + c00.b) & 0xff;
+					t2 = ((((c11.b - c10.b) * ex) >> 16) + c10.b) & 0xff;
+					pc->b = (((t2 - t1) * ey) >> 16) + t1;
+					t1 = ((((c01.a - c00.a) * ex) >> 16) + c00.a) & 0xff;
+					t2 = ((((c11.a - c10.a) * ex) >> 16) + c10.a) & 0xff;
+					pc->a = (((t2 - t1) * ey) >> 16) + t1;
+				}
+				sdx += icos;
+				sdy += isin;
+				pc++;
+			}
+			pc = (tColorRGBA *) ((Uint8 *) pc + gap);
+		}
+	} else {
+		for (y = 0; y < dst->h; y++) {
+			dy = cy - y;
+			sdx = (ax + (isin * dy)) + xd;	/* fixed point source position of the first pixel in this dst row */
+			sdy = (ay - (icos * dy)) + yd;
+			for (x = 0; x < dst->w; x++) {
+				dx = (short) (sdx >> 16);	/* NOTE(review): the cast truncates the coordinate to the range of short */
+				dy = (short) (sdy >> 16);
+				if (flipx) dx = (src->w-1)-dx;
+				if (flipy) dy = (src->h-1)-dy;
+				if ((dx >= 0) && (dy >= 0) && (dx < src->w) && (dy < src->h)) {
+					sp = (tColorRGBA *) ((Uint8 *) src->pixels + src->pitch * dy);
+					sp += dx;
+					*pc = *sp;
+				}
+				sdx += icos;
+				sdy += isin;
+				pc++;
+			}
+			pc = (tColorRGBA *) ((Uint8 *) pc + gap);
+		}
+	}
+}
+
+/*!
+
+\brief Rotates and zooms 8 bit palette/Y 'src' surface to 'dst' surface without smoothing.
+
+Rotates and zooms 8 bit palette/Y 'src' surface to 'dst' surface based on the control 
+parameters by scanning the destination surface. The destination is first filled with the low byte of the source colorkey.
+Assumes src and dst surfaces are of 8 bit depth.
+Assumes dst surface was allocated with the correct dimensions.
+
+\param src Source surface.
+\param dst Destination surface.
+\param cx Horizontal center coordinate.
+\param cy Vertical center coordinate.
+\param isin Integer version of sine of angle (used as a 16.16 fixed point step).
+\param icos Integer version of cosine of angle (used as a 16.16 fixed point step).
+\param flipx Flag indicating horizontal mirroring should be applied.
+\param flipy Flag indicating vertical mirroring should be applied.
+*/
+void transformSurfaceY(SDL_Surface * src, SDL_Surface * dst, int cx, int cy, int isin, int icos, int flipx, int flipy)
+{
+	int x, y, dx, dy, xd, yd, sdx, sdy, ax, ay;
+	tColorY *pc, *sp;
+	int gap;
+
+	/*
+	* Variable setup 
+	*/
+	xd = ((src->w - dst->w) << 15);	/* half the size difference, in 16.16 fixed point */
+	yd = ((src->h - dst->h) << 15);
+	ax = (cx << 16) - (icos * cx);
+	ay = (cy << 16) - (isin * cx);
+	pc = (tColorY*) dst->pixels;
+	gap = dst->pitch - dst->w;	/* padding bytes at the end of each dst row */
+	/*
+	* Clear surface to colorkey (only the low 8 bits of the key are used as fill byte)
+	*/ 	
+	memset(pc, (int)(_colorkey(src) & 0xff), dst->pitch * dst->h);
+	/*
+	* Iterate through destination surface 
+	*/
+	for (y = 0; y < dst->h; y++) {
+		dy = cy - y;
+		sdx = (ax + (isin * dy)) + xd;	/* fixed point source position of the first pixel in this dst row */
+		sdy = (ay - (icos * dy)) + yd;
+		for (x = 0; x < dst->w; x++) {
+			dx = (short) (sdx >> 16);	/* NOTE(review): the cast truncates the coordinate to the range of short */
+			dy = (short) (sdy >> 16);
+			if (flipx) dx = (src->w-1)-dx;
+			if (flipy) dy = (src->h-1)-dy;
+			if ((dx >= 0) && (dy >= 0) && (dx < src->w) && (dy < src->h)) {	/* pixels outside the source keep the fill value */
+				sp = (tColorY *) (src->pixels);
+				sp += (src->pitch * dy + dx);
+				*pc = *sp;
+			}
+			sdx += icos;
+			sdy += isin;
+			pc++;
+		}
+		pc += gap;
+	}
+}
+
+/*!
+\brief Rotates a 8/16/24/32 bit surface in increments of 90 degrees.
+
+Specialized 90 degree rotator which rotates a 'src' surface in 90 degree 
+increments clockwise returning a new surface. Faster than rotozoomer since
+no scanning or interpolation takes place. Input surface must be 8/16/24/32 bit.
+(code contributed by J. Schiller, improved by C. Allport and A. Schiffler)
+
+\param src Source surface to rotate.
+\param numClockwiseTurns Number of clockwise 90 degree turns to apply to the source (negative values turn counter-clockwise).
+
+\returns The new, rotated surface; or NULL for surfaces with incorrect input format or if the new surface cannot be created.
+*/
+SDL_Surface* rotateSurface90Degrees(SDL_Surface* src, int numClockwiseTurns) 
+{
+	int row, col, newWidth, newHeight;
+	int bpp, bpr;
+	SDL_Surface* dst;
+	Uint8* srcBuf;
+	Uint8* dstBuf;
+	int normalizedClockwiseTurns;
+
+	/* Has to be a valid surface pointer and be a Nbit surface where n is divisible by 8 */
+	if (!src || 
+	    !src->format) {
+		SDL_SetError("NULL source surface or source surface format");
+	    return NULL; 
+	}
+
+	if ((src->format->BitsPerPixel % 8) != 0) {
+		SDL_SetError("Invalid source surface bit depth");
+	    return NULL; 
+	}
+
+	/* normalize numClockwiseTurns into the range 0..3 */
+	normalizedClockwiseTurns = (numClockwiseTurns % 4);
+	if (normalizedClockwiseTurns < 0) {
+		normalizedClockwiseTurns += 4;
+	}
+
+	/* If turns are even, our new width/height will be the same as the source surface */
+	if (normalizedClockwiseTurns % 2) {
+		newWidth = src->h;
+		newHeight = src->w;
+	} else {
+		newWidth = src->w;
+		newHeight = src->h;
+	}
+
+	dst = SDL_CreateRGBSurface( src->flags, newWidth, newHeight, src->format->BitsPerPixel,
+		src->format->Rmask,
+		src->format->Gmask, 
+		src->format->Bmask, 
+		src->format->Amask);
+	if(!dst) {
+		SDL_SetError("Could not create destination surface"); 
+		return NULL;
+	}
+
+	if (SDL_MUSTLOCK(src)) {
+		SDL_LockSurface(src);
+	}
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_LockSurface(dst);
+	}
+
+	/* Calculate byte-per-pixel */
+	bpp = src->format->BitsPerPixel / 8;
+
+	switch(normalizedClockwiseTurns) {
+	case 0: /* Make a copy of the surface */
+		{
+			/* Unfortunately SDL_BlitSurface cannot be used to make a copy of the surface
+			since it does not preserve alpha. */
+
+			if (src->pitch == dst->pitch) {
+				/* If the pitch is the same for both surfaces, the memory can be copied all at once. */
+				memcpy(dst->pixels, src->pixels, (src->h * src->pitch));
+			}
+			else
+			{
+				/* If the pitch differs, copy each row separately */
+				srcBuf = (Uint8*)(src->pixels);
+				dstBuf = (Uint8*)(dst->pixels);
+				bpr = src->w * bpp;
+				for (row = 0; row < src->h; row++) {
+					memcpy(dstBuf, srcBuf, bpr);
+					srcBuf += src->pitch;
+					dstBuf += dst->pitch;
+				}
+			}
+		}
+		break;
+
+		/* rotate clockwise */
+	case 1: /* rotated 90 degrees clockwise: src row r becomes dst column (dst->w - 1 - r), read top to bottom */
+		{
+			for (row = 0; row < src->h; ++row) {
+				srcBuf = (Uint8*)(src->pixels) + (row * src->pitch);
+				dstBuf = (Uint8*)(dst->pixels) + (dst->w - row - 1) * bpp;
+				for (col = 0; col < src->w; ++col) {
+					memcpy (dstBuf, srcBuf, bpp);
+					srcBuf += bpp;
+					dstBuf += dst->pitch;
+				} 
+			} 
+		}
+		break;
+
+	case 2: /* rotated 180 degrees clockwise: src row r becomes dst row (dst->h - 1 - r), written right to left */
+		{
+			for (row = 0; row < src->h; ++row) {
+				srcBuf = (Uint8*)(src->pixels) + (row * src->pitch);
+				dstBuf = (Uint8*)(dst->pixels) + ((dst->h - row - 1) * dst->pitch) + (dst->w - 1) * bpp;
+				for (col = 0; col < src->w; ++col) {
+					memcpy (dstBuf, srcBuf, bpp);
+					srcBuf += bpp;
+					dstBuf -= bpp;
+				} 
+			} 
+		}
+		break;
+
+	case 3: /* rotated 270 degrees clockwise: src row r becomes dst column r, written bottom to top */
+		{
+			for (row = 0; row < src->h; ++row) {
+				srcBuf = (Uint8*)(src->pixels) + (row * src->pitch);
+				dstBuf = (Uint8*)(dst->pixels) + (row * bpp) + ((dst->h - 1) * dst->pitch);	/* start on the LAST dst row, not one past it */
+				for (col = 0; col < src->w; ++col) {
+					memcpy (dstBuf, srcBuf, bpp);
+					srcBuf += bpp;
+					dstBuf -= dst->pitch;
+				} 
+			} 
+		}
+		break;
+	} 
+	/* end switch */
+
+	if (SDL_MUSTLOCK(src)) {
+		SDL_UnlockSurface(src);
+	}
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return dst;
+}
+
+
+/*!
+\brief Internal target surface sizing function for rotozooms with trig result return. 
+
+\param width The source surface width.
+\param height The source surface height.
+\param angle The angle to rotate in degrees.
+\param zoomx The horizontal scaling factor.
+\param zoomy The vertical scaling factor.
+\param dstwidth The calculated width of the destination surface (always even and at least 2).
+\param dstheight The calculated height of the destination surface (always even and at least 2).
+\param canglezoom The cosine of the angle multiplied by zoomy.
+\param sanglezoom The sine of the angle multiplied by zoomx.
+
+*/
+void _rotozoomSurfaceSizeTrig(int width, int height, double angle, double zoomx, double zoomy, 
+	int *dstwidth, int *dstheight, 
+	double *canglezoom, double *sanglezoom)
+{
+	double x, y, cx, cy, sx, sy;
+	double radangle;
+	int dstwidthhalf, dstheighthalf;
+
+	/*
+	* Determine destination width and height by rotating a centered source box 
+	*/
+	radangle = angle * (M_PI / 180.0);	/* degrees to radians */
+	*sanglezoom = sin(radangle);
+	*canglezoom = cos(radangle);
+	*sanglezoom *= zoomx;
+	*canglezoom *= zoomy;
+	x = (double)(width / 2);	/* integer division: half extents are truncated */
+	y = (double)(height / 2);
+	cx = *canglezoom * x;
+	cy = *canglezoom * y;
+	sx = *sanglezoom * x;
+	sy = *sanglezoom * y;
+
+	dstwidthhalf = MAX((int)
+		ceil(MAX(MAX(MAX(fabs(cx + sy), fabs(cx - sy)), fabs(-cx + sy)), fabs(-cx - sy))), 1);
+	dstheighthalf = MAX((int)
+		ceil(MAX(MAX(MAX(fabs(sx + cy), fabs(sx - cy)), fabs(-sx + cy)), fabs(-sx - cy))), 1);
+	*dstwidth = 2 * dstwidthhalf;
+	*dstheight = 2 * dstheighthalf;
+}
+
+/*! 
+\brief Returns the size of the resulting target surface for a rotozoomSurfaceXY() call. 
+
+\param width The source surface width.
+\param height The source surface height.
+\param angle The angle to rotate in degrees.
+\param zoomx The horizontal scaling factor.
+\param zoomy The vertical scaling factor.
+\param dstwidth The calculated width of the rotozoomed destination surface.
+\param dstheight The calculated height of the rotozoomed destination surface.
+*/
+void rotozoomSurfaceSizeXY(int width, int height, double angle, double zoomx, double zoomy, int *dstwidth, int *dstheight)
+{
+	double dummy_sanglezoom, dummy_canglezoom;	/* receive the trig outputs, which are discarded here */
+
+	_rotozoomSurfaceSizeTrig(width, height, angle, zoomx, zoomy, dstwidth, dstheight, &dummy_sanglezoom, &dummy_canglezoom);
+}
+
+/*! 
+\brief Returns the size of the resulting target surface for a rotozoomSurface() call. 
+
+\param width The source surface width.
+\param height The source surface height.
+\param angle The angle to rotate in degrees.
+\param zoom The scaling factor (applied to both axes).
+\param dstwidth The calculated width of the rotozoomed destination surface.
+\param dstheight The calculated height of the rotozoomed destination surface.
+*/
+void rotozoomSurfaceSize(int width, int height, double angle, double zoom, int *dstwidth, int *dstheight)
+{
+	double dummy_sanglezoom, dummy_canglezoom;	/* receive the trig outputs, which are discarded here */
+
+	_rotozoomSurfaceSizeTrig(width, height, angle, zoom, zoom, dstwidth, dstheight, &dummy_sanglezoom, &dummy_canglezoom);
+}
+
+/*!
+\brief Rotates and zooms a surface with optional anti-aliasing. 
+
+Rotates and zooms a 32bit or 8bit 'src' surface to newly created 'dst' surface.
+'angle' is the rotation in degrees and 'zoom' a scaling factor. If 'smooth' is set
+then the destination 32bit surface is anti-aliased. If the surface is not 8bit
+or 32bit RGBA/ABGR it will be converted into a 32bit RGBA format on the fly.
+
+\param src The surface to rotozoom.
+\param angle The angle to rotate in degrees.
+\param zoom The scaling factor (passed as both the horizontal and vertical factor to rotozoomSurfaceXY()).
+\param smooth Antialiasing flag; set to SMOOTHING_ON to enable.
+
+\return The new rotozoomed surface.
+*/
+SDL_Surface *rotozoomSurface(SDL_Surface * src, double angle, double zoom, int smooth)
+{
+	return rotozoomSurfaceXY(src, angle, zoom, zoom, smooth);
+}
+
+/*!
+\brief Rotates and zooms a surface with different horizontal and vertical scaling factors and optional anti-aliasing. 
+
+Rotates and zooms a 32bit or 8bit 'src' surface to newly created 'dst' surface.
+'angle' is the rotation in degrees, 'zoomx' and 'zoomy' scaling factors. If 'smooth' is set
+then the destination 32bit surface is anti-aliased. If the surface is not 8bit
+or 32bit RGBA/ABGR it will be converted into a 32bit RGBA format on the fly.
+
+\param src The surface to rotozoom.
+\param angle The angle to rotate in degrees.
+\param zoomx The horizontal scaling factor (a negative value mirrors horizontally).
+\param zoomy The vertical scaling factor (a negative value mirrors vertically).
+\param smooth Antialiasing flag; set to SMOOTHING_ON to enable.
+
+\return The new rotozoomed surface, or NULL if src is NULL or a required surface could not be allocated.
+*/
+SDL_Surface *rotozoomSurfaceXY(SDL_Surface * src, double angle, double zoomx, double zoomy, int smooth)
+{
+	SDL_Surface *rz_src;
+	SDL_Surface *rz_dst;
+	double zoominv;
+	double sanglezoom, canglezoom, sanglezoominv, canglezoominv;
+	int dstwidthhalf, dstwidth, dstheighthalf, dstheight;
+	int is32bit;
+	int i, src_converted;
+	int flipx,flipy;
+
+	/*
+	* Sanity check 
+	*/
+	if (src == NULL) {
+		return (NULL);
+	}
+
+	/*
+	* Determine if source surface is 32bit or 8bit 
+	*/
+	is32bit = (src->format->BitsPerPixel == 32);
+	if ((is32bit) || (src->format->BitsPerPixel == 8)) {
+		/*
+		* Use source surface 'as is' 
+		*/
+		rz_src = src;
+		src_converted = 0;
+	} else {
+		/*
+		* New source surface is 32bit with a defined RGBA ordering 
+		*/
+		rz_src =
+			SDL_CreateRGBSurface(SDL_SWSURFACE, src->w, src->h, 32, 
+#if SDL_BYTEORDER == SDL_LIL_ENDIAN
+			0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000
+#else
+			0xff000000,  0x00ff0000, 0x0000ff00, 0x000000ff
+#endif
+			);
+		if (rz_src == NULL) return (NULL);	/* the temporary 32bit copy could not be allocated */
+		SDL_BlitSurface(src, NULL, rz_src, NULL);
+
+		src_converted = 1;
+		is32bit = 1;
+	}
+
+	/*
+	* Sanity check zoom factor: negative factors request a flip, tiny factors are clamped
+	*/
+	flipx = (zoomx<0.0);
+	if (flipx) zoomx=-zoomx;
+	flipy = (zoomy<0.0);
+	if (flipy) zoomy=-zoomy;
+	if (zoomx < VALUE_LIMIT) zoomx = VALUE_LIMIT;
+	if (zoomy < VALUE_LIMIT) zoomy = VALUE_LIMIT;
+	zoominv = 65536.0 / (zoomx * zoomx);
+
+	/*
+	* Check if we have a rotozoom or just a zoom 
+	*/
+	if (fabs(angle) > VALUE_LIMIT) {
+
+		/*
+		* Angle!=0: full rotozoom 
+		*/
+		/*
+		* ----------------------- 
+		*/
+
+		/* Determine target size */
+		_rotozoomSurfaceSizeTrig(rz_src->w, rz_src->h, angle, zoomx, zoomy, &dstwidth, &dstheight, &canglezoom, &sanglezoom);
+
+		/*
+		* Calculate target factors from sin/cos and zoom 
+		*/
+		sanglezoominv = sanglezoom;
+		canglezoominv = canglezoom;
+		sanglezoominv *= zoominv;
+		canglezoominv *= zoominv;
+
+		/* Calculate half size */
+		dstwidthhalf = dstwidth / 2;
+		dstheighthalf = dstheight / 2;
+
+		/*
+		* Alloc space to completely contain the rotated surface 
+		*/
+		rz_dst = NULL;
+		if (is32bit) {
+			/*
+			* Target surface is 32bit with source RGBA/ABGR ordering 
+			*/
+			rz_dst =
+				SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 32,
+				rz_src->format->Rmask, rz_src->format->Gmask,
+				rz_src->format->Bmask, rz_src->format->Amask);
+		} else {
+			/*
+			* Target surface is 8bit 
+			*/
+			rz_dst = SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 8, 0, 0, 0, 0);
+		}
+
+		/* Check target; on failure release the temporary converted source so it does not leak */
+		if (rz_dst == NULL) {
+			if (src_converted) SDL_FreeSurface(rz_src);
+			return NULL;
+		}
+		rz_dst->h = dstheight;	/* Adjust for guard rows: hide them from the surface */
+
+		/*
+		* Lock source surface 
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_LockSurface(rz_src);
+		}
+
+		/*
+		* Check which kind of surface we have 
+		*/
+		if (is32bit) {
+			/*
+			* Call the 32bit transformation routine to do the rotation (using alpha) 
+			*/
+			_transformSurfaceRGBA(rz_src, rz_dst, dstwidthhalf, dstheighthalf,
+				(int) (sanglezoominv), (int) (canglezoominv), 
+				flipx, flipy,
+				smooth);
+		} else {
+			/*
+			* Copy palette and colorkey info 
+			*/
+			for (i = 0; i < rz_src->format->palette->ncolors; i++) {
+				rz_dst->format->palette->colors[i] = rz_src->format->palette->colors[i];
+			}
+			rz_dst->format->palette->ncolors = rz_src->format->palette->ncolors;
+			/*
+			* Call the 8bit transformation routine to do the rotation 
+			*/
+			transformSurfaceY(rz_src, rz_dst, dstwidthhalf, dstheighthalf,
+				(int) (sanglezoominv), (int) (canglezoominv),
+				flipx, flipy);
+		}
+		/*
+		* Unlock source surface 
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_UnlockSurface(rz_src);
+		}
+
+	} else {
+
+		/*
+		* Angle=0: Just a zoom 
+		*/
+		/*
+		* -------------------- 
+		*/
+
+		/*
+		* Calculate target size
+		*/
+		zoomSurfaceSize(rz_src->w, rz_src->h, zoomx, zoomy, &dstwidth, &dstheight);
+
+		/*
+		* Alloc space to completely contain the zoomed surface 
+		*/
+		rz_dst = NULL;
+		if (is32bit) {
+			/*
+			* Target surface is 32bit with source RGBA/ABGR ordering 
+			*/
+			rz_dst =
+				SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 32,
+				rz_src->format->Rmask, rz_src->format->Gmask,
+				rz_src->format->Bmask, rz_src->format->Amask);
+		} else {
+			/*
+			* Target surface is 8bit 
+			*/
+			rz_dst = SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 8, 0, 0, 0, 0);
+		}
+
+		/* Check target; on failure release the temporary converted source so it does not leak */
+		if (rz_dst == NULL) {
+			if (src_converted) SDL_FreeSurface(rz_src);
+			return NULL;
+		}
+		rz_dst->h = dstheight;	/* Adjust for guard rows: hide them from the surface */
+
+		/*
+		* Lock source surface 
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_LockSurface(rz_src);
+		}
+
+		/*
+		* Check which kind of surface we have 
+		*/
+		if (is32bit) {
+			/*
+			* Call the 32bit transformation routine to do the zooming (using alpha) 
+			*/
+			_zoomSurfaceRGBA(rz_src, rz_dst, flipx, flipy, smooth);
+
+		} else {
+			/*
+			* Copy palette and colorkey info 
+			*/
+			for (i = 0; i < rz_src->format->palette->ncolors; i++) {
+				rz_dst->format->palette->colors[i] = rz_src->format->palette->colors[i];
+			}
+			rz_dst->format->palette->ncolors = rz_src->format->palette->ncolors;
+
+			/*
+			* Call the 8bit transformation routine to do the zooming 
+			*/
+			_zoomSurfaceY(rz_src, rz_dst, flipx, flipy);
+		}
+
+		/*
+		* Unlock source surface 
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_UnlockSurface(rz_src);
+		}
+	}
+
+	/*
+	* Cleanup temp surface 
+	*/
+	if (src_converted) {
+		SDL_FreeSurface(rz_src);
+	}
+
+	/*
+	* Return destination surface 
+	*/
+	return (rz_dst);
+}
+
+/*!
+\brief Calculates the size of the target surface for a zoomSurface() call.
+
+The minimum size of the target surface is 1. The input factors can be positive or negative.
+
+\param width The width of the source surface to zoom.
+\param height The height of the source surface to zoom.
+\param zoomx The horizontal zoom factor.
+\param zoomy The vertical zoom factor.
+\param dstwidth Pointer to an integer to store the calculated width of the zoomed target surface.
+\param dstheight Pointer to an integer to store the calculated height of the zoomed target surface.
+*/
+void zoomSurfaceSize(int width, int height, double zoomx, double zoomy, int *dstwidth, int *dstheight)
+{
+	int w, h;
+
+	/*
+	* A negative factor only requests a flip, so the
+	* target size depends on the magnitude alone 
+	*/
+	if (zoomx < 0.0) {
+		zoomx = -zoomx;
+	}
+	if (zoomy < 0.0) {
+		zoomy = -zoomy;
+	}
+
+	/*
+	* Factors below VALUE_LIMIT are raised to VALUE_LIMIT 
+	*/
+	zoomx = (zoomx < VALUE_LIMIT) ? VALUE_LIMIT : zoomx;
+	zoomy = (zoomy < VALUE_LIMIT) ? VALUE_LIMIT : zoomy;
+
+	/*
+	* Scale each dimension and round to the nearest integer 
+	*/
+	w = (int) floor(((double) width * zoomx) + 0.5);
+	h = (int) floor(((double) height * zoomy) + 0.5);
+
+	/*
+	* Report at least one pixel in each direction 
+	*/
+	*dstwidth = (w < 1) ? 1 : w;
+	*dstheight = (h < 1) ? 1 : h;
+}
+
+/*! 
+\brief Zoom a surface by independent horizontal and vertical factors with optional smoothing.
+
+Zooms a 32bit or 8bit 'src' surface to newly created 'dst' surface.
+'zoomx' and 'zoomy' are scaling factors for width and height. If 'smooth' is on
+then the destination 32bit surface is anti-aliased. If the surface is not 8bit
+or 32bit RGBA/ABGR it will be converted into a 32bit RGBA format on the fly.
+If zoom factors are negative, the image is flipped on the axes.
+
+\param src The surface to zoom.
+\param zoomx The horizontal zoom factor.
+\param zoomy The vertical zoom factor.
+\param smooth Antialiasing flag; set to SMOOTHING_ON to enable.
+
+\return The new, zoomed surface.
+*/
+SDL_Surface *zoomSurface(SDL_Surface * src, double zoomx, double zoomy, int smooth)
+{
+	SDL_Surface *rz_src;
+	SDL_Surface *rz_dst;
+	int dstwidth, dstheight;
+	int is32bit;
+	int i, src_converted;
+	int flipx, flipy;
+
+	/*
+	* Sanity check 
+	*/
+	if (src == NULL)
+		return (NULL);
+
+	/*
+	* Determine if source surface is 32bit or 8bit 
+	*/
+	is32bit = (src->format->BitsPerPixel == 32);
+	if ((is32bit) || (src->format->BitsPerPixel == 8)) {
+		/*
+		* Use source surface 'as is' 
+		*/
+		rz_src = src;
+		src_converted = 0;
+	} else {
+		/*
+		* New source surface is 32bit with a defined RGBA ordering 
+		*/
+		rz_src =
+			SDL_CreateRGBSurface(SDL_SWSURFACE, src->w, src->h, 32, 
+#if SDL_BYTEORDER == SDL_LIL_ENDIAN
+			0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000
+#else
+			0xff000000,  0x00ff0000, 0x0000ff00, 0x000000ff
+#endif
+			);
+		if (rz_src == NULL) {
+			return NULL;
+		}
+		SDL_BlitSurface(src, NULL, rz_src, NULL);
+		src_converted = 1;
+		is32bit = 1;
+	}
+
+	flipx = (zoomx<0.0);
+	if (flipx) zoomx = -zoomx;
+	flipy = (zoomy<0.0);
+	if (flipy) zoomy = -zoomy;
+
+	/* Get size if target */
+	zoomSurfaceSize(rz_src->w, rz_src->h, zoomx, zoomy, &dstwidth, &dstheight);
+
+	/*
+	* Alloc space to completely contain the zoomed surface 
+	*/
+	rz_dst = NULL;
+	if (is32bit) {
+		/*
+		* Target surface is 32bit with source RGBA/ABGR ordering 
+		*/
+		rz_dst =
+			SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 32,
+			rz_src->format->Rmask, rz_src->format->Gmask,
+			rz_src->format->Bmask, rz_src->format->Amask);
+	} else {
+		/*
+		* Target surface is 8bit 
+		*/
+		rz_dst = SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 8, 0, 0, 0, 0);
+	}
+
+	/* Check target */
+	if (rz_dst == NULL) {
+		/*
+		* Cleanup temp surface 
+		*/
+		if (src_converted) {
+			SDL_FreeSurface(rz_src);
+		}		
+		return NULL;
+	}
+
+	/* Adjust for guard rows */
+	rz_dst->h = dstheight;
+
+	/*
+	* Lock source surface 
+	*/
+	if (SDL_MUSTLOCK(rz_src)) {
+		SDL_LockSurface(rz_src);
+	}
+
+	/*
+	* Check which kind of surface we have 
+	*/
+	if (is32bit) {
+		/*
+		* Call the 32bit transformation routine to do the zooming (using alpha) 
+		*/
+		_zoomSurfaceRGBA(rz_src, rz_dst, flipx, flipy, smooth);
+	} else {
+		/*
+		* Copy palette and colorkey info 
+		*/
+		for (i = 0; i < rz_src->format->palette->ncolors; i++) {
+			rz_dst->format->palette->colors[i] = rz_src->format->palette->colors[i];
+		}
+		rz_dst->format->palette->ncolors = rz_src->format->palette->ncolors;
+		/*
+		* Call the 8bit transformation routine to do the zooming 
+		*/
+		_zoomSurfaceY(rz_src, rz_dst, flipx, flipy);
+	}
+	/*
+	* Unlock source surface 
+	*/
+	if (SDL_MUSTLOCK(rz_src)) {
+		SDL_UnlockSurface(rz_src);
+	}
+
+	/*
+	* Cleanup temp surface 
+	*/
+	if (src_converted) {
+		SDL_FreeSurface(rz_src);
+	}
+
+	/*
+	* Return destination surface 
+	*/
+	return (rz_dst);
+}
+
+/*! 
+\brief Shrink a surface by an integer ratio using averaging.
+
+Shrinks a 32bit or 8bit 'src' surface to a newly created 'dst' surface.
+'factorx' and 'factory' are the shrinking ratios (i.e. 2=1/2 the size,
+3=1/3 the size, etc.) The destination surface is antialiased by averaging
+the source box RGBA or Y information. If the surface is not 8bit
+or 32bit RGBA/ABGR it will be converted into a 32bit RGBA format on the fly.
+The input surface is not modified. The output surface is newly allocated.
+
+\param src The surface to shrink.
+\param factorx The horizontal shrinking ratio.
+\param factory The vertical shrinking ratio.
+
+\return The new, shrunken surface.
+*/
+/*@null@*/ 
+SDL_Surface *shrinkSurface(SDL_Surface *src, int factorx, int factory)
+{
+	int result;
+	SDL_Surface *rz_src;
+	SDL_Surface *rz_dst = NULL;
+	int dstwidth, dstheight;
+	int is32bit;
+	int i, src_converted;
+	int haveError = 0;
+
+	/*
+	* Sanity check 
+	*/
+	if (src == NULL) {
+		return (NULL);
+	}
+
+	/*
+	* Determine if source surface is 32bit or 8bit 
+	*/
+	is32bit = (src->format->BitsPerPixel == 32);
+	if ((is32bit) || (src->format->BitsPerPixel == 8)) {
+		/*
+		* Use source surface 'as is' 
+		*/
+		rz_src = src;
+		src_converted = 0;
+	} else {
+		/*
+		* New source surface is 32bit with a defined RGBA ordering 
+		*/
+		rz_src = SDL_CreateRGBSurface(SDL_SWSURFACE, src->w, src->h, 32, 
+#if SDL_BYTEORDER == SDL_LIL_ENDIAN
+			0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000
+#else
+			0xff000000,  0x00ff0000, 0x0000ff00, 0x000000ff
+#endif
+			);
+		if (rz_src==NULL) {
+			haveError = 1;
+			goto exitShrinkSurface;
+		}
+
+		SDL_BlitSurface(src, NULL, rz_src, NULL);
+		src_converted = 1;
+		is32bit = 1;
+	}
+
+	/*
+	* Lock the surface 
+	*/
+	if (SDL_MUSTLOCK(rz_src)) {
+		if (SDL_LockSurface(rz_src) < 0) {
+			haveError = 1;
+			goto exitShrinkSurface;
+		}
+	}
+
+	/* Get size for target */
+	dstwidth=rz_src->w/factorx;
+	while (dstwidth*factorx>rz_src->w) { dstwidth--; }
+	dstheight=rz_src->h/factory;
+	while (dstheight*factory>rz_src->h) { dstheight--; }
+
+	/*
+	* Alloc space to completely contain the shrunken surface
+	* (with added guard rows)
+	*/
+	if (is32bit==1) {
+		/*
+		* Target surface is 32bit with source RGBA/ABGR ordering 
+		*/
+		rz_dst =
+			SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 32,
+			rz_src->format->Rmask, rz_src->format->Gmask,
+			rz_src->format->Bmask, rz_src->format->Amask);
+	} else {
+		/*
+		* Target surface is 8bit 
+		*/
+		rz_dst = SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 8, 0, 0, 0, 0);
+	}
+
+	/* Check target */
+	if (rz_dst == NULL) {
+		haveError = 1;
+		goto exitShrinkSurface;
+	}
+
+	/* Adjust for guard rows */
+	rz_dst->h = dstheight;
+
+	/*
+	* Check which kind of surface we have 
+	*/
+	if (is32bit==1) {
+		/*
+		* Call the 32bit transformation routine to do the shrinking (using alpha) 
+		*/
+		result = _shrinkSurfaceRGBA(rz_src, rz_dst, factorx, factory);		
+		if ((result!=0) || (rz_dst==NULL)) {
+			haveError = 1;
+			goto exitShrinkSurface;
+		}
+	} else {
+		/*
+		* Copy palette and colorkey info 
+		*/
+		for (i = 0; i < rz_src->format->palette->ncolors; i++) {
+			rz_dst->format->palette->colors[i] = rz_src->format->palette->colors[i];
+		}
+		rz_dst->format->palette->ncolors = rz_src->format->palette->ncolors;
+		/*
+		* Call the 8bit transformation routine to do the shrinking 
+		*/
+		result = _shrinkSurfaceY(rz_src, rz_dst, factorx, factory);
+		if (result!=0) {
+			haveError = 1;
+			goto exitShrinkSurface;
+		}
+	}
+
+exitShrinkSurface:
+	if (rz_src!=NULL) {
+		/*
+		* Unlock source surface 
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_UnlockSurface(rz_src);
+		}
+
+		/*
+		* Cleanup temp surface 
+		*/
+		if (src_converted==1) {
+			SDL_FreeSurface(rz_src);
+		}
+	}
+
+	/* Check error state; maybe need to cleanup destination */
+	if (haveError==1) {
+		if (rz_dst!=NULL) {
+			SDL_FreeSurface(rz_dst);
+		}
+		rz_dst=NULL;
+	} 
+
+	/*
+	* Return destination surface 
+	*/
+	return (rz_dst);
+}
diff --git a/src/SDLU.cpp b/src/SDLU.cpp
new file mode 100644
index 0000000..fa19952
--- /dev/null
+++ b/src/SDLU.cpp
@@ -0,0 +1,18 @@
+#include "SDLU.hpp"
+
+#include <SDL.h>
+#include "Util.hpp"
+
+SDLU_BEGIN
+
+int Initialize() {
+	return SDL_Init(SDL_INIT_EVERYTHING);
+}
+
+void Quit() {
+	SDL_Quit();
+}
+
+SDL_CommonEvent c;
+
+SDLU_END
\ No newline at end of file
diff --git a/src/graphics/RenderTarget.cpp b/src/graphics/RenderTarget.cpp
new file mode 100644
index 0000000..105c20c
--- /dev/null
+++ b/src/graphics/RenderTarget.cpp
@@ -0,0 +1,79 @@
+#include "graphics/RenderTarget.hpp"
+
+#include <SDL.h>
+#include <Util.hpp>
+
+SDLU_BEGIN
+RenderTarget::~RenderTarget()
+{
+    RETURN_IF_NULLPTR(renderer);
+
+    SDL_DestroyRenderer(renderer);
+}
+
+void RenderTarget::Clear(const Color& color)
+{
+    RETURN_IF_NULLPTR(renderer);
+
+    SDL_SetRenderDrawColor(renderer, color.r, color.g, color.b, color.a);
+    SDL_RenderClear(renderer);
+}
+
+void RenderTarget::Draw(const Drawable& drawable)
+{
+    RETURN_IF_NULLPTR(renderer);
+
+    drawable.Draw(renderer);
+}
+
+void RenderTarget::Display()
+{
+    RETURN_IF_NULLPTR(renderer);
+
+    SDL_RenderPresent(renderer);
+
+    if (m_oFramerate != 0)
+    {
+        Uint64 diff = std::chrono::duration_cast<std::chrono::milliseconds>(
+            std::chrono::steady_clock::now() - m_oTimeSinceLastDisplay).count();
+
+        if (diff < 1000 / m_oFramerate)
+        {
+            SDL_Delay(static_cast<Uint32>(1000 / m_oFramerate - diff));
+        }
+    }
+
+    m_oTimeSinceLastDisplay = std::chrono::steady_clock::now();
+}
+
+void RenderTarget::SetMaxFramerate(Uint32 max)
+{
+    m_oFramerate = max;
+}
+
+RenderTarget::RenderTarget(SDL_Window* target) :
+    renderer(nullptr), m_oFramerate(0)
+{
+    RETURN_IF_NOT_NULLPTR(renderer);
+    renderer = SDL_CreateRenderer(target, -1, SDL_RENDERER_ACCELERATED);
+        
+    THROW_IF(IS_NULLPTR(renderer),
+        std::runtime_error("Failed to create SDL_Renderer* from SDL_Window*: " + std::string(SDL_GetError())));
+
+    m_oTimeSinceLastDisplay = std::chrono::steady_clock::now();
+}
+
+RenderTarget::RenderTarget(SDL_Surface* target) : 
+    renderer(nullptr), m_oFramerate(0)
+{
+    m_oFramerate = 0;
+
+    RETURN_IF_NOT_NULLPTR(renderer);
+    renderer = SDL_CreateSoftwareRenderer(target);
+
+    THROW_IF(IS_NULLPTR(renderer),
+        std::runtime_error("Failed to create SDL_Renderer* from SDL_Surface*: " + std::string(SDL_GetError())));
+
+    m_oTimeSinceLastDisplay = std::chrono::steady_clock::now();
+}
+SDLU_END
\ No newline at end of file
diff --git a/src/graphics/RenderWindow.cpp b/src/graphics/RenderWindow.cpp
new file mode 100644
index 0000000..9e29638
--- /dev/null
+++ b/src/graphics/RenderWindow.cpp
@@ -0,0 +1,37 @@
+#include "graphics/RenderWindow.hpp"
+
+#include <cstring>
+#include <Util.hpp>
+
+SDLU_BEGIN
+RenderWindow::RenderWindow() :
+    Window(), RenderTarget(window)
+{
+    // Empty
+}
+
+RenderWindow::RenderWindow(Vector2u dimension, const std::string& title,
+    Uint32 windowFlags) :
+    Window(dimension, title, windowFlags), RenderTarget(window)
+{
+    // Empty
+}
+
+RenderWindow::~RenderWindow()
+{
+    // Empty
+}
+
+void RenderWindow::OnCreate()
+{
+}
+
+bool RenderWindow::OnResize()
+{
+    return false;
+}
+
+void RenderWindow::OnClose()
+{
+}
+SDLU_END
\ No newline at end of file
diff --git a/src/graphics/drawable/Transformable.cpp b/src/graphics/drawable/Transformable.cpp
new file mode 100644
index 0000000..70a40ea
--- /dev/null
+++ b/src/graphics/drawable/Transformable.cpp
@@ -0,0 +1,94 @@
+#include "graphics/drawable/Transformable.hpp"
+
+SDLU_BEGIN
+Transformable::Transformable() :
+    position(0, 0), origin(0, 0),
+    scale(1.f, 1.f), rotation(0.f)
+{
+    // Empty
+}
+
+Transformable::~Transformable()
+{
+    // Empty
+}
+
+Vector2f Transformable::GetPosition()
+{
+    return position;
+}
+
+void Transformable::SetPosition(const Vector2f& position)
+{
+    this->position = position;
+}
+
+void Transformable::SetPosition(float x, float y)
+{
+    position = Vector2f(x, y);
+}
+
+void Transformable::Move(const Vector2f& position)
+{
+    this->position += position;
+}
+
+void Transformable::Move(float x, float y)
+{
+    position += Vector2f(x, y);
+}
+Vector2f Transformable::GetOrigin()
+{
+    return origin;
+}
+
+void Transformable::SetOrigin(const Vector2f& origin)
+{
+    this->origin = origin;
+}
+
+void Transformable::SetOrigin(float x, float y)
+{
+    origin = Vector2f(x, y);
+}
+
+Vector2f Transformable::GetScale()
+{
+    return scale;
+}
+
+void Transformable::SetScale(const Vector2f& scale)
+{
+    this->scale = scale;
+}
+
+void Transformable::SetScale(float x, float y)
+{
+    scale = Vector2f(x, y);
+}
+
+void Transformable::Scale(const Vector2f& scale)
+{
+    this->scale += scale;
+}
+
+void Transformable::Scale(float x, float y)
+{
+    scale += Vector2f(x, y);
+}
+
+float Transformable::GetRotation()
+{
+    return rotation;
+}
+
+void Transformable::SetRotation(float angle)
+{
+    rotation = angle;
+}
+
+void Transformable::Rotate(float angle)
+{
+    rotation += angle;
+}
+SDLU_END
\ No newline at end of file
diff --git a/SDLU/graphics/drawable/shapes/Rectangle.cpp b/src/graphics/drawable/shapes/Rectangle.cpp
similarity index 92%
rename from SDLU/graphics/drawable/shapes/Rectangle.cpp
rename to src/graphics/drawable/shapes/Rectangle.cpp
index 8c3354f..0ff2de3 100644
--- a/SDLU/graphics/drawable/shapes/Rectangle.cpp
+++ b/src/graphics/drawable/shapes/Rectangle.cpp
@@ -1,7 +1,8 @@
-#include "Rectangle.hpp"
-
+#include <graphics/drawable/shapes/Rectangle.hpp>
 #include <graphics/RenderTarget.hpp>
 
+#include <SDL.h>
+
 namespace sdlu
 {
     Rectangle::Rectangle() :
diff --git a/SDLU/graphics/drawable/shapes/Shape.cpp b/src/graphics/drawable/shapes/Shape.cpp
similarity index 85%
rename from SDLU/graphics/drawable/shapes/Shape.cpp
rename to src/graphics/drawable/shapes/Shape.cpp
index 5cb6a50..32edb8f 100644
--- a/SDLU/graphics/drawable/shapes/Shape.cpp
+++ b/src/graphics/drawable/shapes/Shape.cpp
@@ -1,4 +1,4 @@
-#include "Shape.hpp"
+#include <graphics/drawable/shapes/Shape.hpp>
 
 namespace sdlu
 {
diff --git a/src/structures/Color.cpp b/src/structures/Color.cpp
new file mode 100644
index 0000000..00d182f
--- /dev/null
+++ b/src/structures/Color.cpp
@@ -0,0 +1,162 @@
+#include "structures/Color.hpp"
+
+#include <math.h>
+#include <cmath>
+
+SDLU_BEGIN
+const Color Color::Black = Color(0, 0, 0);
+const Color Color::Red = Color(255, 0, 0);
+const Color Color::Green = Color(0, 255, 0);
+const Color Color::Blue = Color(0, 0, 255);
+const Color Color::Yellow = Color(255, 255, 0);
+const Color Color::Magenta = Color(255, 0, 255);
+const Color Color::Cyan = Color(0, 255, 255);
+const Color Color::White = Color(255, 255, 255);
+
+const Color Color::Transparent = Color(0, 0, 0, 0);
+
+Color::Color() :
+    r(0), g(0), b(0), a(0)
+{
+    // Empty
+}
+
+Color::Color(Uint8 r, Uint8 g, Uint8 b, Uint8 a) :
+    r(r), g(g), b(b), a(a)
+{
+    // Empty
+}
+
+Color::Color(Uint32 color) : 
+    r((color & 0xFF000000) >> 24),
+    g((color & 0x00FF0000) >> 16),
+    b((color & 0x0000FF00) >> 8),
+    a((color & 0x000000FF))
+{
+    // Empty
+}
+
+Uint32 Color::ToInt()
+{
+    // Packs the channels as 0xRRGGBBAA. Each Uint8 is widened first because it
+    // would otherwise promote to signed int, and r >= 128 shifted by 24 overflows.
+    return (static_cast<Uint32>(r) << 24) |
+           (static_cast<Uint32>(g) << 16) |
+           (static_cast<Uint32>(b) << 8) |
+           static_cast<Uint32>(a);
+}
+
+Color Color::FromHSV(Uint16 h, Uint8 s, Uint8 v)
+{
+    // Normalize parameters
+    // H : [0, 360)
+    // S : [0, 1]
+    // V : [0, 1]
+    h -= std::floor(h / 360) * 360;
+    s = (s > 1) ? 1 : s;
+    v = (v > 1) ? 1 : v;
+
+    // Convert to RGBA
+    Uint16 H = std::floor(h / 60.f);
+    float f = (h / 60.f) - H;
+
+    Uint8 p = static_cast<Uint8>((v * (1 - s)) * 255);
+    Uint8 q = static_cast<Uint8>((v * (1 - s * f)) * 255);
+    Uint8 t = static_cast<Uint8>((v * (1 - s * (1 - f))) * 255);
+    v *= 255;
+
+    Color output;
+    switch (H)
+    {
+    case 0:
+    case 6:
+        output = Color(v, t, p);
+        break;
+    case 1:
+        output = Color(q, v, p);
+        break;
+    case 2:
+        output = Color(p, v, t);
+        break;
+    case 3:
+        output = Color(p, q, v);
+        break;
+    case 4:
+        output = Color(t, p, v);
+        break;
+    case 5:
+        output = Color(v, p, q);
+        break;
+    default:
+        break;
+    }
+
+    return output;
+}
+
+Color operator+(const Color& left, const Color& right)
+{
+    return Color((UINT8_MAX - left.r) < right.r ? 255 : left.r + right.r,
+        (UINT8_MAX - left.g) < right.g ? 255 : left.g + right.g,
+        (UINT8_MAX - left.b) < right.b ? 255 : left.b + right.b,
+        (UINT8_MAX - left.a) < right.a ? 255 : left.a + right.a);
+}
+
+Color operator-(const Color& left, const Color& right)
+{
+    return Color(left.r < right.r ? 0 : left.r - right.r,
+        left.g < right.g ? 0 : left.g - right.g,
+        left.b < right.b ? 0 : left.b - right.b,
+        left.a < right.a ? 0 : left.a - right.a);
+}
+
+Color operator*(const Color& left, const Color& right)
+{   // Saturating per-channel product. The int-promoted product is compared directly; dividing by a 0 channel is undefined.
+    return Color(left.r * right.r > UINT8_MAX ? 255 : left.r * right.r,
+        left.g * right.g > UINT8_MAX ? 255 : left.g * right.g,
+        left.b * right.b > UINT8_MAX ? 255 : left.b * right.b,
+        left.a * right.a > UINT8_MAX ? 255 : left.a * right.a);
+}
+
+Color operator/(const Color& left, const Color& right)
+{   // Per-channel quotient; a zero divisor saturates to 255 because integer division by zero is undefined.
+    return Color(right.r == 0 ? 255 : left.r / right.r,
+        right.g == 0 ? 255 : left.g / right.g,
+        right.b == 0 ? 255 : left.b / right.b,
+        right.a == 0 ? 255 : left.a / right.a);
+}
+
+Color& operator+=(Color& left, const Color& right)
+{
+    left = left + right;
+    return left;
+}
+
+Color& operator-=(Color& left, const Color& right)
+{
+    left = left - right;
+    return left;
+}
+
+Color& operator*=(Color& left, const Color& right)
+{
+    left = left * right;
+    return left;
+}
+
+Color& operator/=(Color& left, const Color& right)
+{
+    left = left / right;
+    return left;
+}
+
+bool operator==(const Color& left, const Color& right)
+{
+    return ((left.r == right.r) && (left.g == right.g) && (left.b == right.b) && (left.a == right.a));
+}
+
+bool operator!=(const Color& left, const Color& right)
+{
+    return !(left == right);
+}
+SDLU_END
\ No newline at end of file
diff --git a/src/structures/Mouse.cpp b/src/structures/Mouse.cpp
new file mode 100644
index 0000000..206ca3a
--- /dev/null
+++ b/src/structures/Mouse.cpp
@@ -0,0 +1,37 @@
+#include <structures/Mouse.hpp>
+
+#include <SDL_mouse.h>
+
+SDLU_BEGIN
+Uint32 Mouse::GetButtonState()
+{
+    return SDL_GetMouseState(NULL, NULL);
+}
+
+bool Mouse::IsButtonDown(Button button)
+{
+    return (GetButtonState() & SDL_BUTTON(static_cast<int>(button))) != 0; // true when the button's bit is set
+}
+
+Vector2i Mouse::GetPosition()
+{
+    int x = 0, y = 0;
+    SDL_GetGlobalMouseState(&x, &y);
+    return Vector2i(x, y);
+}
+
+Vector2i Mouse::GetPosition(const RenderWindow& relativeTo)
+{
+    return GetPosition() - relativeTo.GetPosition();
+}
+
+void Mouse::SetPosition(const Vector2i& position)
+{
+    SDL_WarpMouseGlobal(position.x, position.y);
+}
+
+void Mouse::SetPosition(const Vector2i& position, const RenderWindow& relativeTo)
+{
+    SDL_WarpMouseInWindow(relativeTo.GetWindow(), position.x, position.y);
+}
+SDLU_END
\ No newline at end of file
diff --git a/src/structures/Window.cpp b/src/structures/Window.cpp
new file mode 100644
index 0000000..6591d02
--- /dev/null
+++ b/src/structures/Window.cpp
@@ -0,0 +1,245 @@
+#include "structures/Window.hpp"
+
+#include <SDL.h>
+#include <cstring>
+
+SDLU_BEGIN
+Window::Window() :
+    window(nullptr)
+{
+    // Empty
+}
+
+Window::Window(Vector2u dimension, const std::string& title, Uint32 windowFlags) :
+    Window()
+{
+    Create(dimension, title, windowFlags);
+}
+
+Window::~Window()
+{
+    Close();
+}
+
+void Window::Create(Vector2u dimension, const std::string& title, Uint32 windowFlags)
+{
+    // Don't create a window when it already exists
+    RETURN_IF_NOT_NULLPTR(window);
+
+    window = SDL_CreateWindow(title.c_str(),
+        SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED,
+        dimension.x, dimension.y,
+        windowFlags);
+
+    THROW_IF(IS_NULLPTR(window),
+        std::runtime_error("Failed to create SDL_Window. \nSDL_GetError(): " + std::string(SDL_GetError())));
+
+    OnCreate();
+}
+
+void Window::Close()
+{
+    // Don't destroy a window that doesn't exist
+    RETURN_IF_NULLPTR(window);
+
+    SDL_DestroyWindow(window);
+    window = nullptr;
+
+    OnClose();
+}
+
+bool Window::IsOpen() const
+{
+    RETURN_IF_NULLPTR(window, false);
+    return SDL_GetWindowID(window) != 0; // an ID of 0 is reported as "not open"
+}
+
+bool Window::PollEvent(SDL_Event* event)
+{
+    RETURN_IF_NULLPTR(window, false);
+    // Handle events before the user in case a derived
+    // class decides to block the event.
+    while (SDL_PollEvent(event))
+    {
+        // `window.event` is only meaningful for SDL_WINDOWEVENT; for any other
+        // event type those bytes belong to a different union member.
+        const bool resized = (event->type == SDL_WINDOWEVENT) &&
+            (event->window.event == SDL_WINDOWEVENT_RESIZED);
+        // A resize is withheld from the caller only when OnResize() returns true.
+        if (!resized || !OnResize()) return true;
+    }
+
+    return false; // queue drained without an event being delivered to the caller
+}
+
+bool Window::WaitEvent(SDL_Event* event)
+{
+    while (!PollEvent(event)) continue;
+    return true;
+}
+
+Vector2i Window::GetPosition() const
+{
+    RETURN_IF_NULLPTR(window, Vector2i());
+
+    int x = 0, y = 0;
+    SDL_GetWindowPosition(window, &x, &y);
+    return Vector2i(x, y);
+}
+
+void Window::SetPosition(Vector2i position)
+{
+    RETURN_IF_NULLPTR(window);
+
+    SDL_SetWindowPosition(window, position.x, position.y);
+}
+
+void Window::SetPosition(int x, int y)
+{
+    RETURN_IF_NULLPTR(window);
+
+    SDL_SetWindowPosition(window, x, y);
+}
+
+Vector2u Window::GetSize() const
+{
+    RETURN_IF_NULLPTR(window, Vector2u());
+
+    int x = 0, y = 0;
+    SDL_GetWindowSize(window, &x, &y);
+    return Vector2u(x, y);
+}
+
+void Window::SetSize(Vector2u size)
+{
+    RETURN_IF_NULLPTR(window);
+
+    SDL_SetWindowSize(window, size.x, size.y);
+}
+
+void Window::SetSize(unsigned int width, unsigned int height)
+{
+    RETURN_IF_NULLPTR(window);
+
+    SDL_SetWindowSize(window, width, height);
+}
+
+std::string Window::GetTitle() const
+{
+    RETURN_IF_NULLPTR(window, "");
+
+    return SDL_GetWindowTitle(window);
+}
+
+void Window::SetTitle(std::string title)
+{
+    RETURN_IF_NULLPTR(window);
+
+    SDL_SetWindowTitle(window, title.c_str());
+}
+
+SDL_Window* const Window::GetWindow() const
+{
+    return window;
+}
+
+void Window::SetVisible(bool visible)
+{
+    RETURN_IF_NULLPTR(window);
+    if (visible)
+        SDL_ShowWindow(window);
+    else
+        SDL_HideWindow(window);
+}
+
+void Window::SetVsync(bool vsync)
+{
+    // SDL actually doesn't allow you to change the VSync
+    // flag of a Renderer after it's been created. This
+    // Changes it globally for all other windows
+    SDL_GL_SetSwapInterval(vsync);
+}
+
+void Window::SetMouseCursorVisible(bool visible)
+{
+    SDL_ShowCursor(visible);
+}
+
+void Window::SetMouseCursorGrabbed(bool grabbed)
+{
+    SDL_SetWindowGrab(window, grabbed ? SDL_TRUE : SDL_FALSE);
+}
+
+void Window::SetIcon(Uint32 width, Uint32 height, const Uint8* pixels)
+{
+    // Each RGBA8888 pixel is 4 bytes, so a row (the pitch) is 4 * width bytes;
+    // a larger pitch would make SDL read past the end of the buffer.
+    SDL_Surface* surface = SDL_CreateRGBSurfaceWithFormatFrom(
+        const_cast<Uint8*>(pixels), width, height, 32, 4 * width,
+        SDL_PIXELFORMAT_RGBA8888);
+    RETURN_IF_NULLPTR(surface);
+    SDL_SetWindowIcon(window, surface); // SDL keeps its own copy of the icon
+    SDL_FreeSurface(surface);           // frees the wrapper only, never `pixels`
+}
+
+void Window::SetIcon(Uint32 width, Uint32 height, const Uint32* pixels)
+{
+    // Wrap the caller's buffer directly (one 32-bit RGBA8888 value per pixel,
+    // 4 * width bytes per row) instead of leaking a malloc'd copy on every call.
+    SDL_Surface* surface = SDL_CreateRGBSurfaceWithFormatFrom(
+        const_cast<Uint32*>(pixels), width, height, 32, 4 * width,
+        SDL_PIXELFORMAT_RGBA8888);
+    RETURN_IF_NULLPTR(surface);
+    SDL_SetWindowIcon(window, surface); // SDL keeps its own copy of the icon
+    SDL_FreeSurface(surface);           // frees the wrapper only, never `pixels`
+}
+
+void Window::SetIcon(SDL_Surface* icon)
+{
+    SDL_SetWindowIcon(window, icon);
+}
+
+void Window::SetMouseCursor(SDL_Cursor* cursor)
+{
+    SDL_SetCursor(cursor);
+}
+
+void Window::SetMouseCursor(SDL_Surface* surface, Vector2u clickspot)
+{
+    SDL_Cursor* _cursor = SDL_CreateColorCursor(surface, clickspot.x, clickspot.y);
+    SDL_SetCursor(_cursor);
+}
+
+void Window::SetMouseCursor(const Uint8* pixels, Vector2u size, Vector2u clickspot)
+{
+    size_t _size = static_cast<size_t>(size.x) * static_cast<size_t>(size.y) * 4;
+    void* _pixels = malloc(_size);
+    memcpy(_pixels, pixels, _size);
+    SDL_Surface* surface = SDL_CreateRGBSurfaceWithFormatFrom(_pixels,
+        size.x, size.y, 32, 8 * size.x, SDL_PIXELFORMAT_RGBA8888);
+    this->SetMouseCursor(surface, clickspot);
+}
+
+void Window::SetMouseCursor(const Uint32* pixels, Vector2u size, Vector2u clickspot)
+{
+    size_t _size = static_cast<size_t>(size.x) * static_cast<size_t>(size.y) * 4;
+    void* _pixels = malloc(_size);
+    memcpy(_pixels, pixels, _size);
+    SDL_Surface* surface = SDL_CreateRGBSurfaceWithFormatFrom(_pixels,
+        size.x, size.y, 32, 8 * size.x, SDL_PIXELFORMAT_RGBA32);
+    this->SetMouseCursor(surface, clickspot);
+}
+
+void Window::OnCreate()
+{
+}
+
+bool Window::OnResize()
+{
+    return false;
+}
+
+void Window::OnClose()
+{
+}
+SDLU_END
\ No newline at end of file

From 1ec25d036c210032fae2449b96865c810cb4e2dc Mon Sep 17 00:00:00 2001
From: Robert <robert.trololo@gmail.com>
Date: Fri, 23 Apr 2021 16:29:26 +0200
Subject: [PATCH 3/3] Added cursor support

---
 examples/main.cpp             |  2 +-
 include/SDLU.hpp              |  1 +
 include/structures/Cursor.hpp | 38 +++++++++++++++++++++
 include/structures/Window.hpp | 25 +++-----------
 src/structures/Cursor.cpp     | 62 +++++++++++++++++++++++++++++++++++
 src/structures/Window.cpp     | 27 +++------------
 6 files changed, 110 insertions(+), 45 deletions(-)
 create mode 100644 include/structures/Cursor.hpp
 create mode 100644 src/structures/Cursor.cpp

diff --git a/examples/main.cpp b/examples/main.cpp
index 896b693..dd579da 100644
--- a/examples/main.cpp
+++ b/examples/main.cpp
@@ -27,7 +27,7 @@ int main(int argc, char** argv)
     window.SetTitle("New Title");
 
     window.SetIcon(64, 64, icon_data);
-    // window.SetMouseCursor(SDL_SYSTEM_CURSOR_CROSSHAIR);
+    window.SetMouseCursor(sdlu::Cursor::Type::Crosshair);
     window.SetMaxFramerate(144);
 
     SDL_Event event;
diff --git a/include/SDLU.hpp b/include/SDLU.hpp
index cb049e6..bee07e3 100644
--- a/include/SDLU.hpp
+++ b/include/SDLU.hpp
@@ -2,6 +2,7 @@
 
 #include <graphics/Graphics.hpp>
 #include <structures/Mouse.hpp>
+#include <structures/Cursor.hpp>
 
 namespace sdlu {
 	// TODO: Eventually we should initialize things once the object gets created
diff --git a/include/structures/Cursor.hpp b/include/structures/Cursor.hpp
new file mode 100644
index 0000000..bb55408
--- /dev/null
+++ b/include/structures/Cursor.hpp
@@ -0,0 +1,38 @@
+#pragma once 
+
+#include "Vector2.hpp"
+#include "Util.hpp"
+
+struct SDL_Cursor;
+struct SDL_Surface;
+
+SDLU_BEGIN
+
+class Cursor
+{
+public:
+	enum class Type {
+		Arrow, IBeam, Wait, Crosshair, WaitArrow,
+		SizeNWSE, SizeNESW, SizeWE, SizeNS, SizeAll,
+		No, Hand
+	};
+
+	friend class Window;
+
+public:
+	Cursor();
+	Cursor(Type type);
+	Cursor(const Cursor& other) = delete;
+	Cursor(Cursor&& other) noexcept;
+
+	~Cursor();
+
+	bool LoadFromPixels(const Uint8* pixels, Vector2u size, Vector2u hotspot);
+	bool LoadFromSurface(SDL_Surface* surface, Vector2u hotspot);
+	bool LoadFromSystem(Type type);
+
+private:
+	SDL_Cursor* cursor;
+};
+
+SDLU_END
\ No newline at end of file
diff --git a/include/structures/Window.hpp b/include/structures/Window.hpp
index 9925cc3..76a13e1 100644
--- a/include/structures/Window.hpp
+++ b/include/structures/Window.hpp
@@ -17,6 +17,8 @@ struct SDL_Surface;
 struct SDL_Cursor;
 
 SDLU_BEGIN
+class Cursor;
+
 /**
     * @brief Stores information about a window. You probably want RenderWindow.
     */
@@ -230,28 +232,9 @@ public:
     /**
         * @brief Changes the mouse cursor
         *
-        * @param[in] surface   A pointer to a SDL_Surface containing sprite data
-        * @param[in] clickspot The effective position of the cursor relative to the top left of the sprite
+        * @param[in] cursor   The cursor object holding cursor data
         */
-    void SetMouseCursor(SDL_Surface* surface, Vector2u clickspot);
-
-    /**
-        * @brief Changes the mouse cursor
-        *
-        * @param[in] pixels    An array of color data (RGBA as seperate 8-bit values)
-        * @param[in] size      Size of the cursor
-        * @param[in] clickspot The effective position of the cursor relative to the top left of the sprite
-        */
-    void SetMouseCursor(const Uint8* pixels, Vector2u size, Vector2u clickspot);
-
-    /**
-        * @brief Changes the mouse cursor
-        *
-        * @param[in] pixels    An array of color data (RGBA as one 32-bit value)
-        * @param[in] size      Size of the cursor
-        * @param[in] clickspot The effective position of the cursor relative to the top left of the sprite
-        */
-    void SetMouseCursor(const Uint32* pixels, Vector2u size, Vector2u clickspot);
+    void SetMouseCursor(const Cursor& cursor);
 
 protected:
     SDL_Window* window;
diff --git a/src/structures/Cursor.cpp b/src/structures/Cursor.cpp
new file mode 100644
index 0000000..c7bbfe0
--- /dev/null
+++ b/src/structures/Cursor.cpp
@@ -0,0 +1,62 @@
+#include "structures/Cursor.hpp"
+
+#include <SDL2/SDL_mouse.h>
+
+SDLU_BEGIN
+
+Cursor::Cursor() :
+	Cursor(Type::Arrow)
+{
+	// Default construction delegates to the Type constructor with the arrow cursor.
+}
+
+Cursor::Cursor(Type type) :
+	cursor{ SDL_CreateSystemCursor(static_cast<SDL_SystemCursor>(type)) }
+{ // NOTE(review): the cast assumes Type's enumerators mirror SDL_SystemCursor values — confirm in Cursor.hpp
+}
+
+Cursor::Cursor(Cursor&& other) noexcept :
+	cursor(other.cursor)
+{
+	other.cursor = nullptr; // the source gives up the handle so the destructor frees it only once
+}
+
+Cursor::~Cursor()
+{
+	SDL_FreeCursor(cursor); // cursor is nullptr after a move or a failed load; presumably a no-op then — confirm against SDL_FreeCursor docs
+}
+
+bool Cursor::LoadFromPixels(const Uint8* pixels, Vector2u size, Vector2u hotspot)
+{
+	// SDL_CreateCursor takes 1-bit data + mask bitmaps (a null mask is invalid); wrap RGBA bytes in a surface instead.
+	SDL_Surface* surface = SDL_CreateRGBSurfaceWithFormatFrom(const_cast<Uint8*>(pixels),
+		size.x, size.y, 32, 4 * size.x, SDL_PIXELFORMAT_RGBA32); // pitch: 4 bytes per RGBA pixel
+	if (IS_NULLPTR(surface)) return false; // the currently held cursor is left untouched on failure
+	const bool loaded = LoadFromSurface(surface, hotspot);
+	SDL_FreeSurface(surface); // the surface only wraps `pixels`; the caller keeps ownership of them
+	return loaded;
+}
+
+bool Cursor::LoadFromSurface(SDL_Surface* surface, Vector2u hotspot)
+{
+	// The previously held cursor is released first, even if creation fails below.
+	SDL_FreeCursor(cursor);
+	cursor = SDL_CreateColorCursor(surface, hotspot.x, hotspot.y);
+
+	// Report success only when SDL handed back a cursor.
+	const bool failed = IS_NULLPTR(cursor);
+	return !failed;
+}
+
+bool Cursor::LoadFromSystem(Type type)
+{
+	// Drop the currently held cursor before requesting the system one.
+	SDL_FreeCursor(cursor);
+
+	const SDL_SystemCursor systemId = static_cast<SDL_SystemCursor>(type);
+	cursor = SDL_CreateSystemCursor(systemId);
+
+	return IS_NULLPTR(cursor) ? false : true;
+}
+
+SDLU_END
\ No newline at end of file
diff --git a/src/structures/Window.cpp b/src/structures/Window.cpp
index 6591d02..027a950 100644
--- a/src/structures/Window.cpp
+++ b/src/structures/Window.cpp
@@ -3,6 +3,8 @@
 #include <SDL.h>
 #include <cstring>
 
+#include "structures/Cursor.hpp"
+
 SDLU_BEGIN
 Window::Window() :
     window(nullptr)
@@ -204,30 +206,9 @@ void Window::SetMouseCursor(SDL_Cursor* cursor)
     SDL_SetCursor(cursor);
 }
 
-void Window::SetMouseCursor(SDL_Surface* surface, Vector2u clickspot)
+void Window::SetMouseCursor(const Cursor& cursor)
 {
-    SDL_Cursor* _cursor = SDL_CreateColorCursor(surface, clickspot.x, clickspot.y);
-    SDL_SetCursor(_cursor);
-}
-
-void Window::SetMouseCursor(const Uint8* pixels, Vector2u size, Vector2u clickspot)
-{
-    size_t _size = static_cast<size_t>(size.x) * static_cast<size_t>(size.y) * 4;
-    void* _pixels = malloc(_size);
-    memcpy(_pixels, pixels, _size);
-    SDL_Surface* surface = SDL_CreateRGBSurfaceWithFormatFrom(_pixels,
-        size.x, size.y, 32, 8 * size.x, SDL_PIXELFORMAT_RGBA8888);
-    this->SetMouseCursor(surface, clickspot);
-}
-
-void Window::SetMouseCursor(const Uint32* pixels, Vector2u size, Vector2u clickspot)
-{
-    size_t _size = static_cast<size_t>(size.x) * static_cast<size_t>(size.y) * 4;
-    void* _pixels = malloc(_size);
-    memcpy(_pixels, pixels, _size);
-    SDL_Surface* surface = SDL_CreateRGBSurfaceWithFormatFrom(_pixels,
-        size.x, size.y, 32, 8 * size.x, SDL_PIXELFORMAT_RGBA32);
-    this->SetMouseCursor(surface, clickspot);
+    SDL_SetCursor(cursor.cursor); // NOTE(review): reads Cursor's private SDL_Cursor* — confirm Cursor declares Window a friend
 }
 
 void Window::OnCreate()