#include "imageops.h"

#include <emmintrin.h>
#include <tmmintrin.h>
#include <stdexcept>
#include <cstring>

using namespace Retro;
using namespace std;

static void image565To888(const uint16_t* in, uint8_t* out, size_t w, size_t h, size_t stride);
static void imageX888To888(const uint32_t* in, uint8_t* out, size_t w, size_t h, size_t stride);

const static __m128i maskR16 = _mm_set1_epi16(0xF800);
const static __m128i maskG16 = _mm_set1_epi16(0x07E0);
const static __m128i maskB16 = _mm_set1_epi16(0x001F);

static inline void _convert565To888(const __m128i* in, __m128i* out) {
	/* 00 R0 00 R1 00 R2 00 R3 00 R4 00 R5 00 R6 00 R7 -> R0 00 00 R1 00 00 R2 00 00 R3 00 00 R4 00 00 R5 */
	const static __m128i rblend00 = _mm_set_epi8(0x0A, 0x80, 0x80, 0x08, 0x80, 0x80, 0x06, 0x80, 0x80, 0x04, 0x80, 0x80, 0x02, 0x80, 0x80, 0x00);

	/* 00 R0 00 R1 00 R2 00 R3 00 R4 00 R5 00 R6 00 R7 -> 00 00 R6 00 00 R7 00 00 00 00 00 00 00 00 00 00 */
	const static __m128i rblend10 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0E, 0x80, 0x80, 0x0C, 0x80, 0x80);

	/* 00 R8 00 R9 00 RA 00 RB 00 RC 00 RD 00 RE 00 RF -> 00 00 00 00 00 00 00 00 R8 00 00 R9 00 00 RA 00 */
	const static __m128i rblend11 = _mm_set_epi8(0x80, 0x04, 0x80, 0x80, 0x02, 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);

	/* 00 R8 00 R9 00 RA 00 RB 00 RC 00 RD 00 RE 00 RF -> 00 RB 00 00 RC 00 00 RD 00 00 RE 00 00 RF 00 00 */
	const static __m128i rblend21 = _mm_set_epi8(0x80, 0x80, 0x0E, 0x80, 0x80, 0x0C, 0x80, 0x80, 0x0A, 0x80, 0x80, 0x08, 0x80, 0x80, 0x06, 0x80);

	/* 00 G0 00 G1 00 G2 00 G3 00 G4 00 G5 00 G6 00 G7 -> 00 G0 00 00 G1 00 00 G2 00 00 G3 00 00 G4 00 00 */
	const static __m128i gblend00 = _mm_set_epi8(0x80, 0x80, 0x08, 0x80, 0x80, 0x06, 0x80, 0x80, 0x04, 0x80, 0x80, 0x02, 0x80, 0x80, 0x00, 0x80);

	/* 00 G0 00 G1 00 G2 00 G3 00 G4 00 G5 00 G6 00 G7 -> G5 00 00 G6 00 00 G7 00 00 00 00 00 00 00 00 00 */
	const static __m128i gblend10 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0E, 0x80, 0x80, 0x0C, 0x80, 0x80, 0x0A);

	/* 00 G8 00 G9 00 GA 00 GB 00 GC 00 GD 00 GE 00 GF -> 00 00 00 00 00 00 00 00 00 G8 00 00 G9 00 00 GA */
	const static __m128i gblend11 = _mm_set_epi8(0x04, 0x80, 0x80, 0x02, 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);

	/* 00 G8 00 G9 00 GA 00 GB 00 GC 00 GD 00 GE 00 GF -> 00 00 GB 00 00 GC 00 00 GD 00 00 GE 00 00 GF 00 */
	const static __m128i gblend21 = _mm_set_epi8(0x80, 0x0E, 0x80, 0x80, 0x0C, 0x80, 0x80, 0x0A, 0x80, 0x80, 0x08, 0x80, 0x80, 0x06, 0x80, 0x80);

	/* 00 B0 00 B1 00 B2 00 B3 00 B4 00 B5 00 B6 00 B7 -> 00 00 B0 00 00 B1 00 00 B2 00 00 B3 00 00 B4 00 */
	const static __m128i bblend00 = _mm_set_epi8(0x80, 0x08, 0x80, 0x80, 0x06, 0x80, 0x80, 0x04, 0x80, 0x80, 0x02, 0x80, 0x80, 0x00, 0x80, 0x80);

	/* 00 B0 00 B1 00 B2 00 B3 00 B4 00 B5 00 B6 00 B7 -> 00 B5 00 00 B6 00 00 B7 00 00 00 00 00 00 00 00 */
	const static __m128i bblend10 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0E, 0x80, 0x80, 0x0C, 0x80, 0x80, 0x0A, 0x80);

	/* 00 B8 00 B9 00 BA 00 BB 00 BC 00 BD 00 BE 00 BF -> 00 00 00 00 00 00 00 00 00 00 B8 00 00 B9 00 00 */
	const static __m128i bblend11 = _mm_set_epi8(0x80, 0x80, 0x02, 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);

	/* 00 B8 00 B9 00 BA 00 BB 00 BC 00 BD 00 BE 00 BF -> BA 00 00 BB 00 00 BC 00 00 BD 00 00 BE 00 00 BF */
	const static __m128i bblend21 = _mm_set_epi8(0x0E, 0x80, 0x80, 0x0C, 0x80, 0x80, 0x0A, 0x80, 0x80, 0x08, 0x80, 0x80, 0x06, 0x80, 0x80, 0x04);

	__m128i pix0 = _mm_load_si128(&in[0]);
	__m128i pix1 = _mm_load_si128(&in[1]);

	// Mask out channels
	__m128i r0 = _mm_and_si128(pix0, maskR16);
	__m128i g0 = _mm_and_si128(pix0, maskG16);
	__m128i b0 = _mm_and_si128(pix0, maskB16);
	__m128i r1 = _mm_and_si128(pix1, maskR16);
	__m128i g1 = _mm_and_si128(pix1, maskG16);
	__m128i b1 = _mm_and_si128(pix1, maskB16);

	// Normalize channels to 16-bit per channel
	r0 = _mm_srli_epi16(r0, 8);
	g0 = _mm_srli_epi16(g0, 3);
	b0 = _mm_slli_epi16(b0, 3);
	r1 = _mm_srli_epi16(r1, 8);
	g1 = _mm_srli_epi16(g1, 3);
	b1 = _mm_slli_epi16(b1, 3);

	// Halve channel width and mix to discrete bytes
	__m128i out0 = _mm_shuffle_epi8(r0, rblend00);
	out0 = _mm_or_si128(out0, _mm_shuffle_epi8(g0, gblend00));
	out0 = _mm_or_si128(out0, _mm_shuffle_epi8(b0, bblend00));

	__m128i out1 = _mm_shuffle_epi8(r0, rblend10);
	out1 = _mm_or_si128(out1, _mm_shuffle_epi8(g0, gblend10));
	out1 = _mm_or_si128(out1, _mm_shuffle_epi8(b0, bblend10));
	out1 = _mm_or_si128(out1, _mm_shuffle_epi8(r1, rblend11));
	out1 = _mm_or_si128(out1, _mm_shuffle_epi8(g1, gblend11));
	out1 = _mm_or_si128(out1, _mm_shuffle_epi8(b1, bblend11));

	__m128i out2 = _mm_shuffle_epi8(r1, rblend21);
	out2 = _mm_or_si128(out2, _mm_shuffle_epi8(g1, gblend21));
	out2 = _mm_or_si128(out2, _mm_shuffle_epi8(b1, bblend21));

	_mm_store_si128(&out[0], out0);
	_mm_store_si128(&out[1], out1);
	_mm_store_si128(&out[2], out2);
}

void image565To888(const uint16_t* in, uint8_t* out, size_t w, size_t h, size_t stride) {
	for (size_t y = 0; y < h; ++y) {
		size_t x;
		for (x = 0; x < w; x += 16) {
			_convert565To888(reinterpret_cast<const __m128i*>(&in[x]), reinterpret_cast<__m128i*>(out));
			out += 16 * 3;
		}
		for (; x < w; ++x) {
			uint16_t rgb = in[x];
			out[0] = (rgb & 0xF800) >> 8;
			out[1] = (rgb & 0x07E0) >> 3;
			out[2] = (rgb & 0x001F) << 3;
			out += 3;
		}
		in += stride / 2;
	}
}

void imageX888To888(const uint32_t* in, uint8_t* out, size_t w, size_t h, size_t stride) {
	for (size_t y = 0; y < h; ++y) {
		size_t x;
		for (x = 0; x + 15 < w; x += 16) {
			/* B0 G0 R0 X0 B1 G1 R1 X1 B2 G2 R2 X2 B3 G3 R3 X3 -> R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 00 00 00 00 */
			const static __m128i blend00 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x0C, 0x0D, 0x0E, 0x08, 0x09, 0x0A, 0x04, 0x05, 0x06, 0x00, 0x01, 0x02);

			/* B4 G4 R4 X4 B5 G5 R5 X5 B6 G6 R6 X6 B7 G7 R7 X7 -> 00 00 00 00 00 00 00 00 00 00 00 00 R4 G4 B4 R5 */
			const static __m128i blend01 = _mm_set_epi8(0x06, 0x00, 0x01, 0x02, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);

			/* B4 G4 R4 X4 B5 G5 R5 X5 B6 G6 R6 X6 B7 G7 R7 X7 -> G5 B5 R6 G6 B6 R7 G7 B7 00 00 00 00 00 00 00 00 */
			const static __m128i blend11 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0C, 0x0D, 0x0E, 0x08, 0x09, 0x0A, 0x04, 0x05);

			/* B8 G8 R8 X8 B9 G9 R9 X9 BA GA RA XA BB GB RB XB -> 00 00 00 00 00 00 00 00 R8 G8 B8 R9 G9 B9 RA GA */
			const static __m128i blend12 = _mm_set_epi8(0x09, 0x0A, 0x04, 0x05, 0x06, 0x00, 0x01, 0x02, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);

			/* B8 G8 R8 X8 B9 G9 R9 X9 BA GA RA XA BB GB RB XB -> BA RB GB BB 00 00 00 00 00 00 00 00 00 00 00 00 */
			const static __m128i blend22 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0C, 0x0D, 0x0E, 0x08);

			/* BC GC RC XC BD GD RD XD BE GE RE XE BF GF RF XF -> 00 00 00 00 RC GC BC RD GD BD RE GE BE RF GF DF */
			const static __m128i blend23 = _mm_set_epi8(0x0C, 0x0D, 0x0E, 0x08, 0x09, 0x0A, 0x04, 0x05, 0x06, 0x00, 0x01, 0x02, 0x80, 0x80, 0x80, 0x80);

			__m128i pix0 = _mm_load_si128(reinterpret_cast<const __m128i*>(&in[x]));
			__m128i pix1 = _mm_load_si128(reinterpret_cast<const __m128i*>(&in[x + 4]));
			__m128i pix2 = _mm_load_si128(reinterpret_cast<const __m128i*>(&in[x + 8]));
			__m128i pix3 = _mm_load_si128(reinterpret_cast<const __m128i*>(&in[x + 12]));

			__m128i out0 = _mm_shuffle_epi8(pix0, blend00);
			out0 = _mm_or_si128(out0, _mm_shuffle_epi8(pix1, blend01));

			__m128i out1 = _mm_shuffle_epi8(pix1, blend11);
			out1 = _mm_or_si128(out1, _mm_shuffle_epi8(pix2, blend12));

			__m128i out2 = _mm_shuffle_epi8(pix2, blend22);
			out2 = _mm_or_si128(out2, _mm_shuffle_epi8(pix3, blend23));

			_mm_storeu_si128(reinterpret_cast<__m128i*>(&out[0]), out0);
			_mm_storeu_si128(reinterpret_cast<__m128i*>(&out[16]), out1);
			_mm_storeu_si128(reinterpret_cast<__m128i*>(&out[32]), out2);
			out += 48;
		}
		for (; x < w; ++x) {
			uint32_t xrgb = in[x];
			out[0] = xrgb;
			out[1] = xrgb >> 8;
			out[2] = xrgb >> 16;
			out += 3;
		}
		in += stride / 4;
	}
}

Image::Image(Format format, const void* in, size_t w, size_t h, size_t stride)
	: m_constBuffer(in)
	, m_w(w)
	, m_h(h)
	, m_stride(stride)
	, m_format(format) {
}

Image::Image(Format format, void* in, size_t w, size_t h, size_t stride)
	: m_constBuffer(in)
	, m_buffer(in)
	, m_w(w)
	, m_h(h)
	, m_stride(stride)
	, m_format(format) {
}

void Image::copyTo(Image* other) {
	if (m_w != other->m_w || m_h != other->m_h) {
		throw invalid_argument("Image dimensions don't match");
	}
	switch (m_format) {
	case Image::Format::RGB565:
		switch (other->m_format) {
		case Image::Format::RGB565:
			copyDirectlyTo(other);
			break;
		case Image::Format::RGB888:
			image565To888(static_cast<const uint16_t*>(m_constBuffer), static_cast<uint8_t*>(other->m_buffer), m_w, m_h, m_stride);
			break;
		default:
			throw logic_error("unimplemented conversion");
		}
		break;
	case Image::Format::RGB888:
		switch (other->m_format) {
		case Image::Format::RGB888:
			copyDirectlyTo(other);
		default:
			throw logic_error("unimplemented conversion");
		}
		break;
	case Image::Format::RGBX888:
		switch (other->m_format) {
		case Image::Format::RGBX888:
			copyDirectlyTo(other);
			break;
		case Image::Format::RGB888:
			imageX888To888(static_cast<const uint32_t*>(m_constBuffer), static_cast<uint8_t*>(other->m_buffer), m_w, m_h, m_stride);
			break;
		default:
			throw logic_error("unimplemented conversion");
		}
		break;
	}
}

void Image::copyDirectlyTo(Image* other) {
	size_t depth = 1;
	switch (m_format) {
	case Image::Format::RGB565:
		depth = 2;
		break;
	case Image::Format::RGB888:
		depth = 3;
		break;
	case Image::Format::RGBX888:
		depth = 4;
		break;
	}
	if (m_stride == other->m_stride) {
		memcpy(other->m_buffer, m_constBuffer, m_stride * m_h);
	} else {
		const uint8_t* in = static_cast<const uint8_t*>(m_constBuffer);
		uint8_t* out = static_cast<uint8_t*>(other->m_buffer);
		for (size_t y = 0; y < m_h; ++y) {
			memcpy(&out[other->m_stride * y], &in[m_stride * y], depth * y * m_w);
		}
	}
}