#include "pyramid.h"

#ifdef PYR_DENOISE
#define USE_SSE2
#include "sse_mathfun.h"
#endif

//define _MM_SHUFFLE_PS1 _mm_shuffle_ps
//#define _MM_SHUFFLE_PS1(A,B,C) (_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(A), C)))

/*
#include <cstdarg>
#include <Windows.h>
void p_debug(const char* a, ...) {
	char string[4096];
	va_list args;

	va_start(args, a);
	vsprintf_s(string, a, args);
	va_end(args);

	OutputDebugString(string);
}
*/

/***********************************************************************
* Constructor/destructor
***********************************************************************/
// Convenience constructor: forwards to the full constructor with an origin of
// (0, 0) and with allocation enabled (_no_alloc = false).
Pyramid::Pyramid(int width, int height, int _levels, Pyramid* share, bool _short, int threads)
	: Pyramid(width, height, _levels, 0, 0, false, share, _short, threads)
{
}

// Builds the level list and the per-thread scratch lines.
//
//   width, height  size of level 0
//   _levels        0 = use DefaultNumLevels(); > 0 = exact level count;
//                  < 0 = default count plus |_levels|
//   x, y           origin of level 0; drives the per-level x_shift/y_shift flags
//   _no_alloc      when true (and not sharing) level data pointers are left NULL
//   share          if non-NULL, level data pointers are borrowed from this pyramid
//   _short         int16 samples instead of float
//   threads        <= 0 = use the whole thread pool, otherwise clamped to the pool size
//
// Throws the failed allocation size (size_t) if a level buffer cannot be
// allocated; level buffers allocated so far are released first.
Pyramid::Pyramid(int width, int height, int _levels, int x, int y, bool _no_alloc, Pyramid* share, bool _short, int threads) : shared(share != NULL), no_alloc(_no_alloc), short_data(_short) {
	void* data;

	int n_levels = DefaultNumLevels(width, height);
	if (_levels) {
		if (_levels < 0) n_levels -= _levels; else n_levels = _levels;
	}

	// The int16 code paths read and write level rows through __m256i pointers,
	// which require 32-byte alignment; the float paths use __m128 (16 bytes).
	// In int16 mode the pitch is a multiple of 16 samples (32 bytes), so a
	// 32-byte aligned base keeps every row aligned.
	const size_t data_alignment = short_data ? 32 : 16;

	int b, req_alignment;

	b = 0;
	req_alignment = 2;

	for (int n = 0; n < n_levels; ++n) {
		// a level is "shifted" when its origin is not on the alignment grid for this depth
		bool x_shift = (x - b) & (req_alignment - 1);
		bool y_shift = (y - b) & (req_alignment - 1);
		int pitch = width + x_shift;
		if (short_data) {
			pitch = (pitch + 15) & ~15; // 1x256
		} else {
			pitch = (pitch + 7) & ~7;
		}
		// rows are rounded up to a multiple of four
		size_t bytes = (size_t)pitch * ((height + y_shift + 3) & ~3) * (short_data ? sizeof(int16_t) : sizeof(float)); // for blur transpose only?
		total_bytes += bytes;

		if (shared) {
			data = share->levels[levels.size()].data;
		} else {
			if (!no_alloc) {
				data = _aligned_malloc(bytes, data_alignment);
				if (!data) {
					for (int j = 0; j < n; ++j) _aligned_free(levels[j].data);
					throw(bytes);
				}
			} else {
				data = NULL;
			}
		}

		levels.push_back({ width, height, pitch, bytes, data, x, y, x_shift, y_shift });

		x -= (x_shift << n);
		y -= (y_shift << n);
	
		x -= req_alignment;
		y -= req_alignment;

		b -= req_alignment;

		req_alignment <<= 1;

		width = (width + x_shift + 6) >> 1;
		height = (height + y_shift + 6) >> 1;
	}

	threadpool = Threadpool::GetInstance();
	threads = threads <= 0 ? threadpool->GetNThreads() : (std::min)(threads, threadpool->GetNThreads());

	SetBands(threads);

	// three scratch lines per worker, each as wide as a level-0 row
	for (int i = 0; i < threads; ++i) {
		if (short_data) {
			temp_lines.push_back({
				_aligned_malloc(levels[0].pitch * sizeof(int16_t), 32),
				_aligned_malloc(levels[0].pitch * sizeof(int16_t), 32),
				_aligned_malloc(levels[0].pitch * sizeof(int16_t), 32)
				});
		} else {
			temp_lines.push_back({
				_aligned_malloc(levels[0].pitch * sizeof(float), 16),
				_aligned_malloc(levels[0].pitch * sizeof(float), 16),
				_aligned_malloc(levels[0].pitch * sizeof(float), 16)
				});
		}
	}
}

// Recomputes the horizontal band boundaries used to split work across threads.
// level >= 0 restricts the update to that single level; otherwise every level
// is updated. Each band list starts at 0 and ends at the level height.
void Pyramid::SetBands(int threads, int level) {
	size_t first = 0;
	size_t last = levels.size();

	if (level >= 0) {
		first = (size_t)level;
		last = first + 1;
	}

	for (size_t n = first; n < last; ++n) {
		auto& lev = levels[n];

		lev.bands.clear();
		lev.bands.push_back(0);

		// bands should be at least four pixels high to be safe for shrink;
		// shorter levels get a single band
		const bool splittable = lev.height > threads * 4;
		if (splittable) {
			for (int i = 1; i < threads; ++i) {
				// boundaries are rounded down to a multiple of four rows
				// (for vertical SSE processing, see fastblur)
				const float exact = (float)(lev.height * i) / threads;
				const int boundary = (int)exact & ~3;
				lev.bands.push_back(boundary);
			}
		}

		lev.bands.push_back(lev.height);
	}
}

/*
void Pyramid::SetCols(int level) {
	auto level_p = levels.begin();
	auto level_end = levels.end();

	if (level >= 0) {
		level_p += level;
		level_end = level_p + 1;
	}

	for (; level_p < level_end; ++level_p) {
		level_p->cols.clear();
		level_p->cols.push_back(0);
		if (level_p->height > threadpool->GetNThreads() * 4) { // cols should be at least four pixels wide
			for (int i = 1; i < threadpool->GetNThreads(); ++i) {
				int b = ((int)((float)(level_p->pitch * i) / threadpool->GetNThreads()) & ~3); // &~3 for vertical SSE processing (see fastblur)
				level_p->cols.push_back(b);
			}
		}

		level_p->cols.push_back(level_p->pitch);
	}
}
*/

// Releases the per-thread scratch lines, the level buffers (only when this
// pyramid allocated them itself) and the lookup table.
Pyramid::~Pyramid() {
	for (auto& thread_lines : temp_lines) {
		for (auto& buffer : thread_lines) _aligned_free(buffer);
	}

	// shared pyramids borrow their data; no_alloc pyramids never had any
	const bool owns_level_data = !shared && !no_alloc;
	if (owns_level_data) {
		for (auto& level : levels) _aligned_free(level.data);
	}
	levels.clear();

	free(lut);
}

/***********************************************************************
* copiers
***********************************************************************/
// Prepares the sample -> float lookup table for `bits`-bit input.
// With gamma the table maps i -> i*i, otherwise i -> i.
// The table is reallocated when it is missing, too small, or was built for the
// other gamma setting; it is then (re)filled for the first 1 << bits entries.
// Throws the requested size in bytes (size_t) if the table cannot be allocated,
// matching the constructor's allocation-failure convention.
void Pyramid::SetLUT(int bits, bool gamma) {
	if (lut_bits < bits || lut_gamma != gamma || !lut) {
		const size_t bytes = ((size_t)1 << bits) * sizeof(float);

		free(lut);
		lut = (float*)malloc(bytes);
		if (!lut) throw(bytes); // lut stays NULL, so a later call retries the allocation

		lut_bits = bits;
		lut_gamma = gamma;
	}

	unsigned int l = 1 << bits;
	if (gamma) {
		for (unsigned int i = 0; i < l; ++i) {
			lut[i] = (float)(i * i);
		}
	} else {
		for (unsigned int i = 0; i < l; ++i) {
			lut[i] = (float)i;
		}
	}
}

// Copies an external image into level 0, one thread-pool job per band.
//
//   src_p  first source sample
//   step   distance between consecutive samples of this channel;
//          > 1 selects the interleaved (lookup-table) path, otherwise planar
//   pitch  source row stride, passed through to the worker routines
//   gamma  square the samples while copying (not allowed for int16 pyramids)
//   bits   source bit depth; int16 pyramids accept only 8
//
// Throws a const char* message for unsupported combinations. The depth is
// validated before any lookup table is built or any job is queued.
void Pyramid::Copy(uint8_t* src_p, int step, int pitch, bool gamma, int bits) {
	if (short_data) {
		if (bits != 8) throw("Invalid copy for short data pyramid");
		if (gamma) throw("Gamma is invalid for short data pyramid");
		bits = 7; // internal marker for the int16 copy path
	} else {
		if (bits == 7) throw("Invalid value (bits)");
	}

	const int n_bands = (int)levels[0].bands.size() - 1;

	if (step > 1) {
		// reject unsupported depths before SetLUT sizes a table from `bits`
		switch (bits) {
			case 7: throw("Not yet handled");
			case 8:
			case 16: break;
			default: throw("Invalid copy specification");
		}

		SetLUT(bits, gamma);

		for (int t = 0; t < n_bands; ++t) {
			const int sy = levels[0].bands[t];
			const int ey = levels[0].bands[t + 1];
			if (bits == 8) {
				threadpool->Queue([=] { CopyInterleavedThread_8bit(src_p, step, pitch, sy, ey); });
			} else {
				threadpool->Queue([=] { CopyInterleavedThread_16bit((uint16_t*)src_p, step, pitch, sy, ey); });
			}
		}
		threadpool->Wait();
	} else {
		switch (bits) {
			case 7:
			case 8:
			case 10:
			case 12:
			case 14:
			case 16:
			case 32: break;
			default: throw("Invalid copy specification");
		}

		// planar (only slight improvement with MT, but increases CPU usage)
		for (int t = 0; t < n_bands; ++t) {
			const int sy = levels[0].bands[t];
			const int ey = levels[0].bands[t + 1];
			switch (bits) {
				case 7: threadpool->Queue([=] { CopyPlanarThread_Short(src_p, pitch, sy, ey); }); break;
				case 8: threadpool->Queue([=] { CopyPlanarThread_8bit(src_p, pitch, gamma, sy, ey); }); break;
				case 32: threadpool->Queue([=] { CopyPlanarThread_32bit((__m128*)src_p, pitch, gamma, sy, ey); }); break;
				default: threadpool->Queue([=] { CopyPlanarThread_16bit((uint16_t*)src_p, pitch, gamma, sy, ey); }); break; // 10, 12, 14, 16
			}
		}

		threadpool->Wait();
	}
}

// Worker: converts rows [sy, ey) of an interleaved 8-bit source into float
// level-0 rows through the lookup table, reading every `step`-th sample, then
// fills the padding columns (width .. pitch) with the last written value.
void Pyramid::CopyInterleavedThread_8bit(uint8_t* src_p, int step, int pitch, int sy, int ey) {
	const int width = levels[0].width;
	const int row_pitch = levels[0].pitch;

	uint8_t* src_row = src_p + sy * pitch;
	float* dst_row = (float*)levels[0].data + sy * row_pitch;

	for (int row = sy; row < ey; ++row, src_row += pitch, dst_row += row_pitch) {
		uint8_t* sample = src_row;
		int col = 0;

		while (col < width) {
			dst_row[col] = lut[*sample];
			sample += step;
			++col;
		}

		// replicate the right-most pixel into the padding
		while (col < row_pitch) {
			dst_row[col] = dst_row[col - 1];
			++col;
		}
	}
}

// Worker: 16-bit counterpart of the interleaved 8-bit copy. Rows [sy, ey) are
// converted through the lookup table, reading every `step`-th sample, and the
// padding columns are filled with the last written value.
// Throws for int16 pyramids.
void Pyramid::CopyInterleavedThread_16bit(uint16_t* src_p, int step, int pitch, int sy, int ey) {
	if (short_data) throw("Invalid copy for short data pyramid");

	const int width = levels[0].width;
	const int row_pitch = levels[0].pitch;

	uint16_t* src_row = src_p + sy * pitch;
	float* dst_row = (float*)levels[0].data + sy * row_pitch;

	for (int row = sy; row < ey; ++row, src_row += pitch, dst_row += row_pitch) {
		uint16_t* sample = src_row;
		int col = 0;

		while (col < width) {
			dst_row[col] = lut[*sample];
			sample += step;
			++col;
		}

		// replicate the right-most pixel into the padding
		while (col < row_pitch) {
			dst_row[col] = dst_row[col - 1];
			++col;
		}
	}
}

// Worker: converts rows [sy, ey) of a planar 8-bit source into float level-0
// rows. With gamma each value is squared. Every row is processed in three
// stages (16 pixels per step, then 4, then 1) and the padding columns
// (width .. pitch) are filled with the last converted pixel.
//   src_p  first source row; advanced by `pitch` bytes per row
void Pyramid::CopyPlanarThread_8bit(uint8_t* src_p, int pitch, bool gamma, int sy, int ey) {
	int x, y;
	__m128i pixels;
	__m128 fpixels;
	// byte shuffle that zero-extends source bytes 0..3 into the four 32-bit lanes
	// (a 0x80 selector byte produces zero)
	__m128i shuffle = _mm_set_epi32(0x80808003, 0x80808002, 0x80808001, 0x80808000);
	__m128* rp_p = (__m128*)levels[0].data;
	__m128* p_p;
	__m128i* src_pp_m;
	int* src_pp_i;
	uint8_t* src_pp_b;

	// split the row width into 16-pixel, 4-pixel and single-pixel runs
	int sixteens = levels[0].width >> 4;
	int fours = (levels[0].width & 0xf) >> 2;
	int ones = (levels[0].width & 3);
	int extras = levels[0].pitch - levels[0].width; // padding columns to fill

	src_p += sy * pitch;
	rp_p += sy * levels[0].m128_pitch();

	int g;

	for (y = sy; y < ey; ++y) {
		src_pp_m = (__m128i*)src_p;
		p_p = rp_p;
		if (gamma) {
			for (x = 0; x < sixteens; ++x) {
				// unaligned 16-byte load; each 4-byte group is converted, squared and
				// stored, then the register is shifted right by 4 bytes for the next group
				pixels = _mm_loadu_si128(src_pp_m++);
				fpixels = _mm_cvtepi32_ps(_mm_shuffle_epi8(pixels, shuffle));
				_mm_store_ps((float*)p_p++, _mm_mul_ps(fpixels, fpixels));

				pixels = _mm_srli_si128(pixels, 4);
				fpixels = _mm_cvtepi32_ps(_mm_shuffle_epi8(pixels, shuffle));
				_mm_store_ps((float*)p_p++, _mm_mul_ps(fpixels, fpixels));

				pixels = _mm_srli_si128(pixels, 4);
				fpixels = _mm_cvtepi32_ps(_mm_shuffle_epi8(pixels, shuffle));
				_mm_store_ps((float*)p_p++, _mm_mul_ps(fpixels, fpixels));

				pixels = _mm_srli_si128(pixels, 4);
				fpixels = _mm_cvtepi32_ps(_mm_shuffle_epi8(pixels, shuffle));
				_mm_store_ps((float*)p_p++, _mm_mul_ps(fpixels, fpixels));
			}
		} else {
			for (x = 0; x < sixteens; ++x) {
				// same as above without the squaring
				pixels = _mm_loadu_si128(src_pp_m++);
				_mm_store_ps((float*)p_p++, _mm_cvtepi32_ps(_mm_shuffle_epi8(pixels, shuffle)));

				pixels = _mm_srli_si128(pixels, 4);
				_mm_store_ps((float*)p_p++, _mm_cvtepi32_ps(_mm_shuffle_epi8(pixels, shuffle)));

				pixels = _mm_srli_si128(pixels, 4);
				_mm_store_ps((float*)p_p++, _mm_cvtepi32_ps(_mm_shuffle_epi8(pixels, shuffle)));

				pixels = _mm_srli_si128(pixels, 4);
				_mm_store_ps((float*)p_p++, _mm_cvtepi32_ps(_mm_shuffle_epi8(pixels, shuffle)));
			}
		}

		// remaining groups of four pixels, read as one 32-bit word each
		src_pp_i = (int*)src_pp_m;

		for (x = 0; x < fours; ++x) {
			if (gamma) {
				fpixels = _mm_cvtepi32_ps(_mm_shuffle_epi8(_mm_cvtsi32_si128(*src_pp_i++), shuffle));
				_mm_store_ps((float*)p_p++, _mm_mul_ps(fpixels, fpixels));
			} else {
				_mm_store_ps((float*)p_p++, _mm_cvtepi32_ps(_mm_shuffle_epi8(_mm_cvtsi32_si128(*src_pp_i++), shuffle)));
			}
		}

		// last 0..3 pixels, scalar
		src_pp_b = (uint8_t*)src_pp_i;
		float* fp_p = (float*)p_p;

		for (x = 0; x < ones; ++x) {
			if (gamma) {
				g = *src_pp_b++;
				*fp_p++ = (float)g * g;
			} else {
				*fp_p++ = (float)*src_pp_b++;
			}
		}

		// fill the padding with the last converted pixel
		float extra = fp_p[-1];
		for (x = 0; x < extras; ++x) {
			*fp_p++ = extra;
		}

		src_p += pitch;
		rp_p += levels[0].m128_pitch();
	}
}

// Worker: converts rows [sy, ey) of a planar 16-bit source into float level-0
// rows, eight pixels per vector step plus a scalar tail. With gamma each value
// is squared. Padding columns (width .. pitch) are filled with the last
// converted pixel. Throws for int16 pyramids.
//   src_p  first source row; advanced by `pitch` uint16_t elements per row
void Pyramid::CopyPlanarThread_16bit(uint16_t* src_p, int pitch, bool gamma, int sy, int ey) {
	if (short_data) throw("Invalid copy for short data pyramid");
	int x, y;

	__m128i pixels;
	__m128 fpixels;
	// byte shuffles that zero-extend 16-bit words 0..3 (shuffle1) and 4..7 (shuffle2)
	// into four 32-bit lanes
	__m128i shuffle1 = _mm_set_epi32(0x80800706, 0x80800504, 0x80800302, 0x80800100);
	__m128i shuffle2 = _mm_set_epi32(0x80800f0e, 0x80800d0c, 0x80800b0a, 0x80800908);
	__m128* rp_p = (__m128*)levels[0].data;
	__m128* p_p;
	__m128i* src_pp_m;
	uint16_t* src_pp_w;

	int eights = levels[0].width >> 3;
	int ones = (levels[0].width & 7);
	int extras = levels[0].pitch - levels[0].width; // padding columns to fill

	src_p += sy * pitch;
	rp_p += sy * levels[0].m128_pitch();

	int g;

	for (y = sy; y < ey; ++y) {
		src_pp_m = (__m128i*)src_p;
		p_p = rp_p;
		if (gamma) {
			for (x = 0; x < eights; ++x) {
				// one unaligned 16-byte load yields two output vectors
				pixels = _mm_loadu_si128(src_pp_m++);
				fpixels = _mm_cvtepi32_ps(_mm_shuffle_epi8(pixels, shuffle1));
				_mm_store_ps((float*)p_p++, _mm_mul_ps(fpixels, fpixels));

				fpixels = _mm_cvtepi32_ps(_mm_shuffle_epi8(pixels, shuffle2));
				_mm_store_ps((float*)p_p++, _mm_mul_ps(fpixels, fpixels));
			}
		} else {
			for (x = 0; x < eights; ++x) {
				pixels = _mm_loadu_si128(src_pp_m++);
				_mm_store_ps((float*)p_p++, _mm_cvtepi32_ps(_mm_shuffle_epi8(pixels, shuffle1)));

				_mm_store_ps((float*)p_p++, _mm_cvtepi32_ps(_mm_shuffle_epi8(pixels, shuffle2)));
			}
		}

		// last 0..7 pixels, scalar
		src_pp_w = (uint16_t*)src_pp_m;
		float* fp_p = (float*)p_p;

		for (x = 0; x < ones; ++x) {
			if (gamma) {
				g = *src_pp_w++;
				*fp_p++ = (float)g*g;
			} else {
				*fp_p++ = (float)*src_pp_w++;
			}
		}

		// fill the padding with the last converted pixel
		float extra = fp_p[-1];
		for (x = 0; x < extras; ++x) {
			*fp_p++ = extra;
		}

		src_p += pitch;
		rp_p += levels[0].m128_pitch();
	}
}

// Worker: copies rows [sy, ey) of a planar float source into level 0.
// With gamma every value is squared (aligned vector loads from the source);
// otherwise the row is copied with memcpy. Padding columns (width .. pitch)
// are filled with the last pixel of the row. Throws for int16 pyramids.
//   pitch  source row stride; divided by four to obtain the stride in __m128 units
void Pyramid::CopyPlanarThread_32bit(__m128* src_p, int pitch, bool gamma, int sy, int ey) {
	if (short_data) throw("Invalid copy for short data pyramid");

	const int width = levels[0].width;
	const int padded_width = levels[0].pitch;
	const int dst_stride = levels[0].m128_pitch();
	const int src_stride = pitch >> 2;

	const int vectors = (width + 3) >> 2; // vectors needed to cover `width` floats
	const int row_bytes = width << 2;

	__m128* src_row = src_p + sy * src_stride;
	__m128* dst_row = (__m128*)levels[0].data + sy * dst_stride;

	for (int row = sy; row < ey; ++row) {
		if (gamma) {
			for (int v = 0; v < vectors; ++v) {
				const __m128 px = _mm_load_ps((float*)&src_row[v]);
				_mm_store_ps((float*)&dst_row[v], _mm_mul_ps(px, px));
			}
		} else {
			memcpy(dst_row, src_row, row_bytes);
		}

		// replicate the right-most pixel into the padding
		float* dst_f = (float*)dst_row;
		const float edge = dst_f[width - 1];
		for (int col = width; col < padded_width; ++col) dst_f[col] = edge;

		src_row += src_stride;
		dst_row += dst_stride;
	}
}

#define Z(X) 0x80 | X

// Worker: copies rows [sy, ey) of a planar 8-bit source into the int16 level 0,
// scaling every sample by 8 (<< 3). Padding columns (width .. pitch) are filled
// with the last pixel of the row.
//   src_p  first source row
//   pitch  SOURCE row stride in bytes; destination rows advance by
//          levels[0].pitch, the stride every reader of the level uses
// (A disabled AVX2 fast path used to live here; only the scalar loop was active.)
void Pyramid::CopyPlanarThread_Short(uint8_t* src_p, int pitch, int sy, int ey) {
	const int width = levels[0].width;
	const int dst_pitch = levels[0].pitch;

	// destination and source strides are independent
	uint16_t* data = (uint16_t*)levels[0].data + sy * dst_pitch;
	const uint8_t* input = src_p + sy * pitch;

	for (int y = sy; y < ey; ++y) {
		int x;
		for (x = 0; x < width; ++x) {
			data[x] = input[x] << 3;
		}
		// replicate the right-most pixel into the padding
		for (; x < dst_pitch; ++x) {
			data[x] = data[width - 1];
		}
		data += dst_pitch;
		input += pitch;
	}
}

/***********************************************************************
* subsample
***********************************************************************/
// Fills level 0 of this pyramid with a reduced copy of source's level 0.
//
//   sub_h   non-zero: each output row is a weighted sum of source rows
//           (weights 1,3,3,1 in the interior; 4,3,1 at the first and last
//           output row) and the source advances two rows per output row.
//           zero: source rows are used directly, one per output row.
//   sub_w   2: two horizontal Subsample_Squeeze passes; 1: one pass.
//           Any other value writes nothing (no switch case matches).
//   source  pyramid to read from. Its first scratch line (temp_lines[0][0]) is
//           used as working storage, and when sub_h is zero and sub_w is 2 the
//           first squeeze pass overwrites the source row in place.
//
// Runs on the calling thread; rows are addressed as __m128, i.e. float data.
void Pyramid::Subsample(int sub_w, int sub_h, Pyramid* source) {
	int x, y;
	int p = 0; // unused
	__m128* in = (__m128*)source->levels[0].data;
	__m128* out = (__m128*)levels[0].data;
	__m128* temp_line = (__m128*)source->temp_lines[0][0]; // lines
	__m128* line = temp_line;
	int m128_pitch_in = source->levels[0].m128_pitch();
	int m128_pitch_out = levels[0].m128_pitch();
	// width (in vectors) after the first of two horizontal passes, rounded up to an even count
	int mid_pitch = sub_w == 2 ? ((m128_pitch_in >> 1) + 1)&~1 : m128_pitch_in;
	__m128 three = _mm_set1_ps(3);
	__m128 four = _mm_set1_ps(4);
	// combined normalisation: each 1,3,3,1 pass (vertical or horizontal) sums to 8
	__m128 mul = _mm_set1_ps((sub_h ? 1.0f / 8 : 1.0f)*(sub_w == 2 ? 1.0f / 64 : 1.0f / 8));

	for (y = 0; y < levels[0].height; ++y) {
		if (sub_h) {
			if (y == 0) {
				// first output row: 4*row0 + 3*row1 + row2
				for (x = 0; x < m128_pitch_in; ++x) {
					_mm_store_ps((float*)&temp_line[x], _mm_add_ps(_mm_mul_ps(_mm_load_ps((float*)&in[x]), four), _mm_add_ps(_mm_mul_ps(_mm_load_ps((float*)&in[x + m128_pitch_in]), three), _mm_load_ps((float*)&in[x + (m128_pitch_in << 1)]))));
				}
			} else if (y == levels[0].height - 1) {
				// last output row: mirrored weights, 4*next + 3*current + previous
				for (x = 0; x < m128_pitch_in; ++x) {
					_mm_store_ps((float*)&temp_line[x], _mm_add_ps(_mm_mul_ps(_mm_load_ps((float*)&in[x + m128_pitch_in]), four), _mm_add_ps(_mm_mul_ps(_mm_load_ps((float*)&in[x]), three), _mm_load_ps((float*)&in[x - m128_pitch_in]))));
				}
			} else {
				// interior: previous + 3*current + 3*next + next-but-one
				for (x = 0; x < m128_pitch_in; ++x) {
					_mm_store_ps(
						(float*)&temp_line[x],
						_mm_add_ps(
							_mm_add_ps(_mm_load_ps((float*)&in[x - m128_pitch_in]), _mm_load_ps((float*)&in[x + (m128_pitch_in << 1)])),
							_mm_mul_ps(_mm_add_ps(_mm_load_ps((float*)&in[x]), _mm_load_ps((float*)&in[x + m128_pitch_in])), three)
						)
					);
				}
			}
			in += m128_pitch_in << 1;
		} else {
			line = (__m128*)in;
			in += m128_pitch_in;
		}
		switch (sub_w) {
			// case 2 deliberately falls through: squeeze in place first (unscaled),
			// then squeeze again into the output with the combined scale factor
			case 2: Subsample_Squeeze(line, line, m128_pitch_in, mid_pitch, NULL);
			case 1: Subsample_Squeeze(line, out, mid_pitch, m128_pitch_out, &mul);
		}
		out += m128_pitch_out;
	}
}

// Halves one row horizontally with a 1,3,3,1 filter at stride 2: every output
// vector is built from two input vectors (b, c) plus the last element of the
// vector before (a) and the first element of the vector after (d).
//
//   in, Out          source and destination rows (may be the same buffer; each
//                    output vector is written after the inputs it needs were loaded)
//   m128_pitch_in    number of readable input vectors
//   m128_pitch_out   number of output vectors to produce
//   mul              optional scale applied to every output; NULL = unscaled
//
// Edges: the first input element is replicated to the left; past the end of
// the input the last loaded element is replicated.
void Pyramid::Subsample_Squeeze(__m128* in, __m128* Out, int m128_pitch_in, int m128_pitch_out, __m128* mul) {
	int read = 0;

	int x;
	__m128 a, b, c, d, e, f;
	__m128 three = _mm_set1_ps(3);

	b = _mm_load_ps((float*)&in[read++]);
//	a = _MM_SHUFFLE_PS1(b, b, _MM_SHUFFLE(0, 0, 0, 0));
	a = _mm_permute_ps(b, _MM_SHUFFLE(0, 0, 0, 0)); // left edge: broadcast the first element
	c = _mm_load_ps((float*)&in[read++]);
	d = _mm_load_ps((float*)&in[read++]);

	for (x = 0; x < m128_pitch_out; ++x) {
		// gather the outer taps: e = (a3, b1, b2, c0), f = (b3, c1, c2, d0)
		e = _mm_shuffle_ps(a, c, _MM_SHUFFLE(0, 0, 3, 3));
		f = _mm_shuffle_ps(b, d, _MM_SHUFFLE(0, 0, 3, 3));
		e = _mm_blend_ps(b, e, 9);
		f = _mm_blend_ps(c, f, 9);
//		e = _MM_SHUFFLE_PS1(e, e, _MM_SHUFFLE(3, 1, 2, 0));
//		f = _MM_SHUFFLE_PS1(f, f, _MM_SHUFFLE(3, 1, 2, 0));
		// swap the middle lanes so the horizontal add pairs the right taps
		e = _mm_permute_ps(e, _MM_SHUFFLE(3, 1, 2, 0));
		f = _mm_permute_ps(f, _MM_SHUFFLE(3, 1, 2, 0));
		e = _mm_hadd_ps(e, f); // (a3+b2, b1+c0, b3+c2, c1+d0): outer taps, weight 1

		f = _mm_hadd_ps(b, c); // adjacent pair sums: inner taps, weight 3
		if (mul) {
			_mm_store_ps((float*)&Out[x], _mm_mul_ps(_mm_add_ps(e, _mm_mul_ps(f, three)), *mul));
		} else {
			_mm_store_ps((float*)&Out[x], _mm_add_ps(e, _mm_mul_ps(f, three)));
		}
		// slide the window along by two input vectors
		a = c;
		b = d;
		if (read < m128_pitch_in - 1) {
			c = _mm_load_ps((float*)&in[read++]);
			d = _mm_load_ps((float*)&in[read++]);
		} else {
			if (read < m128_pitch_in) {
				// one vector left: replicate its last element for d
				c = _mm_load_ps((float*)&in[read++]);
				d = _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3));
			} else {
				// input exhausted: replicate the last element of b
				c = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 3, 3));
				d = c;
			}
		}
	}
}

/***********************************************************************
* shrink (gaussian)
***********************************************************************/
// Builds every level from the one above it. For each pair of levels one job is
// queued per band of the lower (smaller) level, and the pool is drained before
// moving on to the next pair.
void Pyramid::Shrink() {
	const int last = (int)levels.size() - 1;

	for (int l = 0; l < last; ++l) {
		auto* hi_level = &levels[l];
		auto* lo_level = &levels[l + 1];

		const int height_odd = (hi_level->height & 1) ^ hi_level->y_shift;
		// first lower-level row that needs the special bottom-edge handling
		const int first_bad_line = lo_level->height - (3 - height_odd); // 2023-02-27 - double-checked; this seems correct

		const int n_bands = (int)lo_level->bands.size() - 1;
		for (int t = 0; t < n_bands; ++t) {
			if (short_data) {
				threadpool->Queue([=] {
					ShrinkThread_Short(
						(int16_t*)temp_lines[t][0],
						(int16_t*)hi_level->data,
						(int16_t*)lo_level->data,
						hi_level->width,
						lo_level->width,
						hi_level->pitch,
						lo_level->pitch,
						first_bad_line,
						height_odd,
						lo_level->bands[t],
						lo_level->bands[t + 1],
						hi_level->x_shift,
						hi_level->y_shift
					);
				});
			} else {
				threadpool->Queue([=] {
					ShrinkThread(
						(__m128*)temp_lines[t][0],
						(__m128*)hi_level->data,
						(__m128*)lo_level->data,
						hi_level->m128_pitch(),
						lo_level->m128_pitch(),
						first_bad_line,
						height_odd,
						lo_level->bands[t],
						lo_level->bands[t + 1],
						hi_level->x_shift,
						hi_level->y_shift
					);
				});
			}
		}

		threadpool->Wait();
	}
}

// Worker: produces rows [sy, ey) of the lower (float) level from the upper one.
// Each output row is a vertical blend of upper-level rows written into `line`,
// which Squeeze then reduces horizontally and scales.
//
//   line            scratch row, m128_pitch_hi vectors
//   hi, lo          first row of the upper / lower level
//   first_bad_line  first lower row that needs the bottom-edge handling
//   height_odd      selects which bottom-edge variant applies
//   sy, ey          band of lower-level rows to produce
//   x_shift/y_shift shift flags of the upper level
//
// Interior rows use vertical weights 1,4,6,4,1 (sum 16) and Squeeze is given
// 1/256, which also covers its horizontal pass. The first and last lower rows
// are copies of an upper row (1/16: horizontal pass only). Rows next to the top
// and bottom edge use shortened weight sets (11,4,1 / 15,1) that still sum to 16.
void Pyramid::ShrinkThread(__m128* line, __m128* hi, __m128* lo, int m128_pitch_hi, int m128_pitch_lo, int first_bad_line, int height_odd, int sy, int ey, const bool x_shift, const bool y_shift) {
	int x, y;

	const __m128 four = _mm_set1_ps(4);
	const __m128 six = _mm_set1_ps(6);
	const __m128 eleven = _mm_set1_ps(11);
	const __m128 fifteen = _mm_set1_ps(15);
	const __m128 _16th = _mm_set1_ps(1.0 / 16);
	const __m128 _256th = _mm_set1_ps(1.0 / 256);

	// line 0
	if (sy == 0) {
		// Squeeze shifts `line` in place when x_shift is set, so work on a copy
		memcpy(line, hi, m128_pitch_hi << 4); // must copy because of x_shift
		Squeeze(line, lo, m128_pitch_lo, m128_pitch_hi, _16th, x_shift);
		lo += m128_pitch_lo;

		if (!y_shift) {
		// line 1
			for (x = 0; x < m128_pitch_hi; ++x) {
				line[x] = _mm_add_ps(_mm_add_ps(_mm_mul_ps(hi[x], eleven), _mm_mul_ps(hi[x + m128_pitch_hi], four)), hi[x + (m128_pitch_hi << 1)]);
			}
			Squeeze(line, lo, m128_pitch_lo, m128_pitch_hi, _256th, x_shift);
			lo += m128_pitch_lo;
			hi += (m128_pitch_hi << 1);
			sy = 2;
		} else {
		// line 1
			for (x = 0; x < m128_pitch_hi; ++x) {
				line[x] = _mm_add_ps(_mm_mul_ps(hi[x], fifteen), hi[x + m128_pitch_hi]);
			}
			Squeeze(line, lo, m128_pitch_lo, m128_pitch_hi, _256th, x_shift);
			lo += m128_pitch_lo;
			hi += m128_pitch_hi;
		// line 2
			// the row two above does not exist here, so the row one above is used twice
			for (x = 0; x < m128_pitch_hi; ++x) {
				_mm_store_ps((float*)&line[x], _mm_add_ps(_mm_add_ps(
					_mm_add_ps(hi[x - m128_pitch_hi], hi[x + (m128_pitch_hi << 1)]),
					_mm_mul_ps(_mm_add_ps(hi[x - m128_pitch_hi], hi[x + m128_pitch_hi]), four)
				),
					_mm_mul_ps(hi[x], six)
				));
			}
			Squeeze(line, lo, m128_pitch_lo, m128_pitch_hi, _256th, x_shift);
			lo += m128_pitch_lo;
			hi += (m128_pitch_hi << 1);
			sy = 3;
		}
	} else {
		// band does not start at the top: jump straight to the matching rows
		lo += m128_pitch_lo * sy;
		hi += m128_pitch_hi * (((sy - 1) << 1) - y_shift); // because hi has a missing line compared to lo, lo's line 0 corresponds to hi's line -2, 1=>0, 2=>2, 3=>4
	}

	// rows from first_bad_line on are handled by the final block below
	ey = (std::min)(first_bad_line, ey);

	// good lines
	for (y = sy; y < ey; ++y) { // was y < first_bad_line
		for (x = 0; x < m128_pitch_hi; ++x) {
			_mm_store_ps((float*)&line[x], _mm_add_ps(_mm_add_ps(
				_mm_add_ps(hi[x - (m128_pitch_hi << 1)], hi[x + (m128_pitch_hi << 1)]),
				_mm_mul_ps(_mm_add_ps(hi[x - m128_pitch_hi], hi[x + m128_pitch_hi]), four)
				),
				_mm_mul_ps(hi[x], six)
				));
		}
		Squeeze(line, lo, m128_pitch_lo, m128_pitch_hi, _256th, x_shift);
		lo += m128_pitch_lo;
		hi += (m128_pitch_hi << 1);
	}

	// only the band that reaches first_bad_line writes the bottom-edge rows
	if (y == first_bad_line) { // final block
		// prepenultimate line
		if (!height_odd) {
			// the row two below does not exist, so the row one below is used twice
			for (x = 0; x < m128_pitch_hi; ++x) {
				line[x] = _mm_add_ps(
					_mm_add_ps(
					_mm_add_ps(hi[x - (m128_pitch_hi << 1)], hi[x + m128_pitch_hi]),
					_mm_mul_ps(_mm_add_ps(hi[x - m128_pitch_hi], hi[x + m128_pitch_hi]), four)
					),
					_mm_mul_ps(hi[x], six)
					);
			}

			Squeeze(line, lo, m128_pitch_lo, m128_pitch_hi, _256th, x_shift);
			++y;
			lo += m128_pitch_lo;
			hi += (m128_pitch_hi << 1);

			// this case moved from block below
			for (x = 0; x < m128_pitch_hi; ++x) {
				line[x] = _mm_add_ps(hi[x - (m128_pitch_hi << 1)], _mm_mul_ps(hi[x - m128_pitch_hi], fifteen));
			}
		} else {
		// penultimate line
			for (x = 0; x < m128_pitch_hi; ++x) {
				line[x] = _mm_add_ps(
					_mm_add_ps(
					_mm_mul_ps(hi[x], eleven),
					_mm_mul_ps(hi[x - m128_pitch_hi], four)
					),
					hi[x - (m128_pitch_hi << 1)]
					);
			}
			// other case removed from here, moved into block above
		}

		// emits the row prepared by whichever branch ran above
		Squeeze(line, lo, m128_pitch_lo, m128_pitch_hi, _256th, x_shift);
		++y;
		lo += m128_pitch_lo;
		hi += (m128_pitch_hi << 1);

		// last line
		// step back to the upper row that is copied as the last lower row
		hi -= m128_pitch_hi*(3 - height_odd);
		memcpy(line, hi, m128_pitch_hi << 4);
		Squeeze(line, lo, m128_pitch_lo, m128_pitch_hi, _16th, x_shift);
		++y;
	}
}

// Halves one float row horizontally with a 1,4,6,4,1 filter at stride 2 and
// multiplies every result by final_mul.
//
//   line           input row, m128_pitch_hi vectors; shifted right by one float
//                  IN PLACE when x_shift is set (element 0 is left unchanged and
//                  so appears twice)
//   lo             output row, m128_pitch_lo vectors are written
//   final_mul      scale applied to every output
//
// Each iteration consumes two input vectors (b, c) and keeps the previous c as
// a. Edges: the first element is replicated to the left; once the input is
// exhausted the last element seen is replicated.
void Pyramid::Squeeze(__m128* line, __m128* lo, int m128_pitch_lo, int m128_pitch_hi, __m128 final_mul, bool x_shift) {
	int hi_x = 0;
	int lo_x = 0;

	// (an older, disabled scalar version of this routine was kept here as a comment)

	__m128 a, b, c, d, e, f, g, h, i, j;

	const __m128 four = _mm_set1_ps(4);
	const __m128 six = _mm_set1_ps(6);

	// shift the whole row right by one float (overlapping copy, hence memmove)
	if (x_shift) memmove(&((float*)line)[1], line, (m128_pitch_hi << 4) - 4);

	while (lo_x < m128_pitch_lo) {
		if (hi_x >= m128_pitch_hi) { // was >= ... + 1
			// past the end of the input: replicate the last element
//			b = _MM_SHUFFLE_PS1(a, a, _MM_SHUFFLE(3, 3, 3, 3));
			b = _mm_permute_ps(a, _MM_SHUFFLE(3, 3, 3, 3));
			c = b;
		} else {
			b = line[hi_x++];
			c = line[hi_x++];
			// first iteration: replicate the first element to the left
//			if (lo_x == 0) a = _MM_SHUFFLE_PS1(b, b, _MM_SHUFFLE(0, 0, 0, 0));
			if (lo_x == 0) a = _mm_permute_ps(b, _MM_SHUFFLE(0, 0, 0, 0));
		}

		// a = EFGH
		// b = IJKL
		// c = MNOP
		// the four outputs are centred on G, I, K and M

		// shuffle to four pairs of outer pixels
		f = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); // EGIK
		g = _mm_shuffle_ps(b, c, _MM_SHUFFLE(2, 0, 2, 0)); // IKMO
//		d = _MM_SHUFFLE_PS1(f, f, _MM_SHUFFLE(3, 1, 2, 0)); // EIGK
//		e = _MM_SHUFFLE_PS1(g, g, _MM_SHUFFLE(3, 1, 2, 0)); // IMKO
		d = _mm_permute_ps(f, _MM_SHUFFLE(3, 1, 2, 0)); // EIGK
		e = _mm_permute_ps(g, _MM_SHUFFLE(3, 1, 2, 0)); // IMKO
		d = _mm_hadd_ps(d, e); // E+I, G+K, I+M, K+O (weight 1)

		// shuffle to four pairs of inner pixels
		h = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 1, 3, 1)); // FHJJ // prev BDFF (should be BDDF)
		i = _mm_shuffle_ps(b, c, _MM_SHUFFLE(1, 1, 3, 1)); // JLNN // prev FHJJ (should be FHHJ)
//		h = _MM_SHUFFLE_PS1(h, h, _MM_SHUFFLE(3, 1, 1, 0)); // FHHJ
//		i = _MM_SHUFFLE_PS1(i, i, _MM_SHUFFLE(3, 1, 1, 0)); // JLLN
		h = _mm_permute_ps(h, _MM_SHUFFLE(3, 1, 1, 0)); // FHHJ
		i = _mm_permute_ps(i, _MM_SHUFFLE(3, 1, 1, 0)); // JLLN
		h = _mm_mul_ps(four, _mm_hadd_ps(h, i)); // 4*(F+H, H+J, J+L, L+N)

		// shuffle to four central pixels
		j = _mm_mul_ps(six, _mm_shuffle_ps(f, g, _MM_SHUFFLE(2, 1, 2, 1))); // GIKM

		// store
		_mm_store_ps((float*)&lo[lo_x++], _mm_mul_ps(_mm_add_ps(d, _mm_add_ps(h, j)), final_mul));
		a = c;
	}
}

// 16-bit fixed point

// Worker: int16 counterpart of ShrinkThread. Produces rows [sy, ey) of the
// lower level from the upper one.
//
// ShrinkBlendLines_epi16 blends five upper rows (a, b, c, d, e with weights
// 1,4,6,4,1) into `line`; Squeeze_Short then reduces that row horizontally.
// At the top and bottom edges rows that do not exist are replaced by passing
// the nearest existing row more than once. The first and last lower rows are
// produced by squeezing an upper row directly, without a vertical blend.
//
//   line                scratch row, pitch_hi samples
//   hi, lo              first row of the upper / lower level
//   width_hi            upper-level width, passed to Squeeze_Short
//   width_lo            not used in this function
//   pitch_hi, pitch_lo  row strides in int16 samples; rows are also addressed as
//                       __m256i, pitch_hi >> 4 vectors per row
//   first_bad_line      first lower row that needs the bottom-edge handling
//   height_odd          selects which bottom-edge variant applies
void Pyramid::ShrinkThread_Short(int16_t* line, int16_t* hi, int16_t* lo, int width_hi, int width_lo, int pitch_hi, int pitch_lo, int first_bad_line, int height_odd, int sy, int ey, const bool x_shift, const bool y_shift) {
	int m256_pitch_hi = pitch_hi >> 4;
	auto four = _mm256_set1_epi16(4); // unused

	if (sy == 0) {
		// lower row 0: upper row 0, horizontal pass only
//		memcpy(line, hi, pitch_hi << 1);
		Squeeze_Short(hi, lo, width_hi, pitch_lo, x_shift);
		lo += pitch_lo;
		++sy;

		if (y_shift) {
			// four copies of row 0 plus row 1
			ShrinkBlendLines_epi16(
				(__m256i*)hi,
				(__m256i*)hi,
				(__m256i*)hi,
				(__m256i*)hi,
				((__m256i*)hi) + m256_pitch_hi,
				(__m256i*)line,
				m256_pitch_hi
			);

			Squeeze_Short(line, lo, width_hi, pitch_lo, x_shift);
			lo += pitch_lo;
			++sy;

			hi += pitch_hi;

//-X|XX
			// centred on row 1; row 0 stands in for the missing row above it
			ShrinkBlendLines_epi16(
				((__m256i*)hi) - m256_pitch_hi,
				((__m256i*)hi) - m256_pitch_hi,
				(__m256i*)hi,
				((__m256i*)hi) + m256_pitch_hi,
				((__m256i*)hi) + (m256_pitch_hi << 1),
				(__m256i*)line,
				m256_pitch_hi
			);

			Squeeze_Short(line, lo, width_hi, pitch_lo, x_shift);
			lo += pitch_lo;
			++sy;

			hi += pitch_hi << 1;
		} else {
			// centred on row 0; row 0 stands in for the two missing rows above it
			ShrinkBlendLines_epi16(
				(__m256i*)hi,
				(__m256i*)hi,
				(__m256i*)hi,
				((__m256i*)hi) + m256_pitch_hi,
				((__m256i*)hi) + (m256_pitch_hi << 1),
				(__m256i*)line,
				m256_pitch_hi
			);

			Squeeze_Short(line, lo, width_hi, pitch_lo, x_shift);
			lo += pitch_lo;
			++sy;

			hi += pitch_hi << 1;
		}
	} else {
		// band does not start at the top: jump straight to the matching rows
		lo += pitch_lo * sy;
		hi += pitch_hi * (((sy - 1) << 1) - y_shift);
	}

	// rows from first_bad_line on are handled by the edge block below
	int o_ey = (std::min)(first_bad_line, ey);

	// interior rows: full five-row window, two upper rows per lower row
	while (sy < o_ey) {
		ShrinkBlendLines_epi16(
			((__m256i*)hi) - (m256_pitch_hi << 1),
			((__m256i*)hi) - m256_pitch_hi,
			(__m256i*)hi,
			((__m256i*)hi) + m256_pitch_hi,
			((__m256i*)hi) + (m256_pitch_hi << 1),
			(__m256i*)line,
			m256_pitch_hi
		);

		Squeeze_Short(line, lo, width_hi, pitch_lo, x_shift);
		lo += pitch_lo;
		hi += pitch_hi << 1;
		++sy;
	}

	// only the band that reaches first_bad_line writes the bottom-edge rows
	if (sy == first_bad_line) {
//		printf("%d\n", sy);
		if (height_odd) {
			// the current row stands in for the two missing rows below it
			ShrinkBlendLines_epi16(
				((__m256i*)hi) - (m256_pitch_hi << 1),
				((__m256i*)hi) - m256_pitch_hi,
				(__m256i*)hi,
				(__m256i*)hi,
				(__m256i*)hi,
				(__m256i*)line,
				m256_pitch_hi
			);

			Squeeze_Short(line, lo, width_hi, pitch_lo, x_shift);
			lo += pitch_lo;
			++sy;

			// last lower row: the current upper row, horizontal pass only
			Squeeze_Short(hi, lo, width_hi, pitch_lo, x_shift);
			lo += pitch_lo;
			++sy;
		} else {
			// the row below stands in for the missing row two below
			ShrinkBlendLines_epi16(
				((__m256i*)hi) - (m256_pitch_hi << 1),
				((__m256i*)hi) - m256_pitch_hi,
				(__m256i*)hi,
				((__m256i*)hi) + m256_pitch_hi,
				((__m256i*)hi) + m256_pitch_hi,
				(__m256i*)line,
				m256_pitch_hi
			);

			Squeeze_Short(line, lo, width_hi, pitch_lo, x_shift);
			lo += pitch_lo;
			hi += pitch_hi << 1;
			++sy;

			// hi has been advanced by two rows; only the rows at offsets -2 and -1 are read
			ShrinkBlendLines_epi16(
				((__m256i*)hi) - (m256_pitch_hi << 1),
				((__m256i*)hi) - m256_pitch_hi,
				((__m256i*)hi) - m256_pitch_hi,
				((__m256i*)hi) - m256_pitch_hi,
				((__m256i*)hi) - m256_pitch_hi,
				(__m256i*)line,
				m256_pitch_hi
			);

			Squeeze_Short(line, lo, width_hi, pitch_lo, x_shift);
			lo += pitch_lo;
			++sy;

			// last lower row: the row at offset -1, horizontal pass only
			Squeeze_Short(hi - pitch_hi, lo, width_hi, pitch_lo, x_shift);
			lo += pitch_lo;
			++sy;
		}

		// any remaining rows of the band are left untouched (the copy is disabled)
		while (sy < ey) {
//			memcpy(lo, lo - pitch_lo, pitch_lo << 1);
			++sy;
		}
	}
}

// Halves one int16 row horizontally with a 1,4,6,4,1 filter at stride 2.
// Every weighted sum is rounded (+8) and divided by 16 (>> 4).
//
//   input     source row, width_hi valid samples
//   output    destination row; pitch_lo samples are written
//   x_shift   selects the left-edge variant (one extra output, odd-aligned taps)
//
// The first output is a plain copy of input[0]. Edge outputs use shortened
// weight sets that still sum to 16; the tail of the row up to pitch_lo is
// filled with the last input sample.
void Pyramid::Squeeze_Short(int16_t* input, int16_t* output, int width_hi, int pitch_lo, bool x_shift) {
	int src;
	int dst = 0;

	output[dst++] = input[0];

	// left edge
	if (x_shift) {
		output[dst++] = (input[0] * 15 + input[1] + 8) >> 4;
		output[dst++] = (input[0] + (input[0] + input[2]) * 4 + input[3] + input[1] * 6 + 8) >> 4;
		src = 3;
	} else {
		output[dst++] = (input[0] * 11 + input[1] * 4 + input[2] + 8) >> 4;
		src = 2;
	}

	// interior: full five-tap window centred on src
	for (; src < width_hi - 2; src += 2) {
		const int outer = input[src - 2] + input[src + 2];
		const int inner = input[src - 1] + input[src + 1];
		output[dst++] = (outer + inner * 4 + input[src] * 6 + 8) >> 4;
	}

	// right edge
	if (src == width_hi - 2) {
		// one sample to the right exists; it also stands in for the missing second one
		const int outer = input[src - 2] + input[src + 1];
		const int inner = input[src - 1] + input[src + 1];
		output[dst++] = (outer + inner * 4 + input[src] * 6 + 8) >> 4;
		src += 2;
		output[dst++] = (input[src - 2] + input[src - 1] * 15 + 8) >> 4;
	} else {
		output[dst++] = (input[src - 2] + input[src - 1] * 4 + input[src] * 11 + 8) >> 4;
	}

	// pad the remainder of the output row
	for (; dst < pitch_lo; ++dst) output[dst] = input[width_hi - 1];
}

// Blends five int16 rows into one: (a + 4*b + 6*c + 4*d + e + 8) >> 4, computed
// per 16-bit lane with a logical right shift. m256_pitch is the row length in
// 256-bit vectors. Rows are accessed through plain __m256i dereferences.
void Pyramid::ShrinkBlendLines_epi16(__m256i* a, __m256i* b, __m256i* c, __m256i* d, __m256i* e, __m256i* output, int m256_pitch) {
	const __m256i rounding = _mm256_set1_epi16(8); // was four, for some reason (2023-05-21)

	for (int x = 0; x < m256_pitch; ++x) {
		// 6*c built as 4*c + 2*c
		const __m256i centre = _mm256_add_epi16(_mm256_slli_epi16(c[x], 2), _mm256_slli_epi16(c[x], 1));
		const __m256i inner = _mm256_slli_epi16(_mm256_add_epi16(b[x], d[x]), 2);
		const __m256i outer = _mm256_add_epi16(a[x], e[x]);

		const __m256i weighted = _mm256_add_epi16(_mm256_add_epi16(centre, inner), outer);
		const __m256i rounded = _mm256_add_epi16(weighted, rounding);

		output[x] = _mm256_srli_epi16(rounded, 4);
	}
}

/***********************************************************************
* laplace (gaussian)
***********************************************************************/
// Runs the Laplace worker over each adjacent pair of the first n_levels levels.
// With collapse the pairs are visited from the smallest level upwards
// (l = n_levels - 2 ... 0), otherwise from level 0 downwards. One job is queued
// per band of the upper level and the pool is drained after every pair.
void Pyramid::LaplaceCollapse(int n_levels, bool collapse) {
	const int passes = n_levels - 1;

	for (int pass = 0; pass < passes; ++pass) {
		const int l = collapse ? (passes - 1) - pass : pass;

		auto* upper = &levels[l];
		auto* lower = &levels[l + 1];

		const int n_bands = (int)upper->bands.size() - 1;
		for (int t = 0; t < n_bands; ++t) {
			const int sy = upper->bands[t];
			const int ey = upper->bands[t + 1];

			if (short_data) {
				threadpool->Queue([=] { LaplaceThread_Short(t, upper, lower, sy, ey); });
			} else {
				threadpool->Queue([=] { LaplaceThread(t, upper, lower, sy, ey); });
			}
		}

		threadpool->Wait();
	}
}

void Pyramid::LaplaceThread(int t, Level* upper_level, Level* lower_level, int sy, int ey) {
	__m128* hi = (__m128*)upper_level->data + sy * upper_level->m128_pitch();

	int lo_y = sy >> 1;

	bool two = upper_level->y_shift; // was (sy + upper_level->y_shift)&1) but sy is always even

	if (two) {
		++lo_y;
		LaplaceExpand((__m128*)temp_lines[t][1], lo_y++, lower_level, upper_level->x_shift, upper_level->m128_pitch(), lower_level->m128_pitch());
		LaplaceExpand((__m128*)temp_lines[t][2], lo_y++, lower_level, upper_level->x_shift, upper_level->m128_pitch(), lower_level->m128_pitch());
	} else {
		LaplaceExpand((__m128*)temp_lines[t][0], lo_y++, lower_level, upper_level->x_shift, upper_level->m128_pitch(), lower_level->m128_pitch());
		LaplaceExpand((__m128*)temp_lines[t][1], lo_y++, lower_level, upper_level->x_shift, upper_level->m128_pitch(), lower_level->m128_pitch());
	}

	for (int y = sy; y < ey; ++y) {
		if (two) {
			LaplaceLine2(hi, (__m128*)temp_lines[t][1], (__m128*)temp_lines[t][2], upper_level->m128_pitch());

			auto temp = (__m128*)temp_lines[t][0];
			temp_lines[t][0] = temp_lines[t][1];
			temp_lines[t][1] = temp_lines[t][2];
			temp_lines[t][2] = temp;
		} else {
			LaplaceExpand((__m128*)temp_lines[t][2], lo_y++, lower_level, upper_level->x_shift, upper_level->m128_pitch(), lower_level->m128_pitch());

			LaplaceLine3(hi, (__m128*)temp_lines[t][0], (__m128*)temp_lines[t][1], (__m128*)temp_lines[t][2], upper_level->m128_pitch());
		}

		hi += upper_level->m128_pitch();
		two = !two;
	}
}

// Returns a pointer to the start of row y of this level, viewed as __m128 vectors.
// The row offset is computed in ptrdiff_t rather than int so that y * pitch
// cannot overflow on levels holding more than 2^31 floats.
__m128* Pyramid::Level::GetLine(int y) {
	return (__m128*)((float*)data + (ptrdiff_t)y * pitch);
}

// Horizontally expands row y of lower_level into temp, choosing the shifted or
// unshifted kernel layout according to upper_x_shift.
void Pyramid::LaplaceExpand(__m128* temp, int y, Level* lower_level, bool upper_x_shift, int m128_pitch_hi, int m128_pitch_lo) {
	__m128* source_row = lower_level->GetLine(y);

	if (upper_x_shift) {
		LaplaceExpandShifted(temp, source_row, m128_pitch_hi, m128_pitch_lo);
		return;
	}

	LaplaceExpandUnshifted(temp, source_row, m128_pitch_hi, m128_pitch_lo);
}

// Horizontal 2x expansion of one row (unshifted layout).
// Each 4-sample window w0..w3 yields four outputs:
//   0.125*w0 + 0.75*w1 + 0.125*w2,  0.5*w1 + 0.5*w2,
//   0.125*w1 + 0.75*w2 + 0.125*w3,  0.5*w2 + 0.5*w3
// Windows alternate between a whole input vector and the vector straddling it
// and its successor, so two output vectors are written per loop pass.
// Past the end of the input the last sample is replicated.
void Pyramid::LaplaceExpandUnshifted(__m128* hi, __m128* lo, int m128_pitch_hi, int m128_pitch_lo) {
	// _mm_set_ps lists lanes from 3 down to 0
	const __m128 kernel0 = _mm_set_ps(0, 0.125f, 0.75f, 0.125f);
	const __m128 kernel1 = _mm_set_ps(0, 0.5f, 0.5f, 0);
	const __m128 kernel2 = _mm_set_ps(0.125, 0.75f, 0.125f, 0);
	const __m128 kernel3 = _mm_set_ps(0.5f, 0.5f, 0, 0);

	// Lane k of the result is the horizontal sum of window * kernelK.
	const auto expand = [&](const __m128& window) {
		return _mm_hadd_ps(
			_mm_hadd_ps(_mm_mul_ps(window, kernel0), _mm_mul_ps(window, kernel1)),
			_mm_hadd_ps(_mm_mul_ps(window, kernel2), _mm_mul_ps(window, kernel3))
		);
	};

	__m128 current = _mm_load_ps((float*)lo);
	__m128 next;
	int src = 1;

	for (int dst = 0; dst < m128_pitch_hi; dst += 2) {
		_mm_store_ps((float*)&hi[dst], expand(current));

		if (src < m128_pitch_lo) {
			next = lo[src];
		} else if (src == m128_pitch_lo) {
			// first step past the row: broadcast the last sample
			next = _mm_permute_ps(next, _MM_SHUFFLE(3, 3, 3, 3));
		}
		++src;

		// window made of the upper half of current and the lower half of next
		const __m128 straddle = _mm_shuffle_ps(current, next, _MM_SHUFFLE(1, 0, 3, 2));
		_mm_store_ps((float*)&hi[dst + 1], expand(straddle));

		current = next;
	}
}

// Horizontal 2x expansion of one row (shifted layout: output is offset by one
// sample relative to LaplaceExpandUnshifted).
// For a window w0..w3 followed by samples n0.. the four outputs are:
//   0.5*w1 + 0.5*w2,  0.125*w1 + 0.75*w2 + 0.125*w3,
//   0.5*w2 + 0.5*w3,  0.125*w2 + 0.75*w3 + 0.125*n0
// The last output needs a sample from the following window, so lanes 0..2 and
// lane 3 are computed from two different vectors.
// Past the end of the input the last sample is replicated.
void Pyramid::LaplaceExpandShifted(__m128* hi, __m128* lo, int m128_pitch_hi, int m128_pitch_lo) {
	// _mm_set_ps lists lanes from 3 down to 0
	const __m128 kernel0 = _mm_set_ps(0, 0.5f, 0.5f, 0);
	const __m128 kernel1 = _mm_set_ps(0.125, 0.75f, 0.125f, 0);
	const __m128 kernel2 = _mm_set_ps(0.5f, 0.5f, 0, 0);
	const __m128 kernel3 = _mm_set_ps(0, 0.125f, 0.75f, 0.125f);

	// Lanes 0..2 come from `window`, lane 3 from `following`.
	const auto expand = [&](const __m128& window, const __m128& following) {
		return _mm_hadd_ps(
			_mm_hadd_ps(_mm_mul_ps(window, kernel0), _mm_mul_ps(window, kernel1)),
			_mm_hadd_ps(_mm_mul_ps(window, kernel2), _mm_mul_ps(following, kernel3))
		);
	};

	__m128 current = _mm_load_ps((float*)lo);
	__m128 next;
	int src = 1;

	for (int dst = 0; dst < m128_pitch_hi; dst += 2) {
		if (src < m128_pitch_lo) {
			next = lo[src];
		} else if (src == m128_pitch_lo) {
			// first step past the row: broadcast the last sample
			next = _mm_permute_ps(next, _MM_SHUFFLE(3, 3, 3, 3));
		}
		++src;

		// window made of the upper half of current and the lower half of next
		const __m128 straddle = _mm_shuffle_ps(current, next, _MM_SHUFFLE(1, 0, 3, 2));

		_mm_store_ps((float*)&hi[dst], expand(current, straddle));
		_mm_store_ps((float*)&hi[dst + 1], expand(straddle, next));

		current = next;
	}
}

// In-place row update: hi[x] = 0.5 * (temp1[x] + temp2[x]) - hi[x]
// for m128_pitch vectors.
void Pyramid::LaplaceLine2(__m128* hi, __m128* temp1, __m128* temp2, int m128_pitch) {
	const __m128 half = _mm_set1_ps(0.5f);

	for (int i = 0; i < m128_pitch; ++i) {
		const __m128 pair_sum = _mm_add_ps(temp1[i], temp2[i]);
		const __m128 expanded = _mm_mul_ps(pair_sum, half);

		_mm_store_ps((float*)&hi[i], _mm_sub_ps(expanded, hi[i]));
	}
}

// In-place row update:
//   hi[x] = (0.125 * (temp1[x] + temp3[x]) + 0.75 * temp2[x]) - hi[x]
// for m128_pitch vectors.
void Pyramid::LaplaceLine3(__m128* hi, __m128* temp1, __m128* temp2, __m128* temp3, int m128_pitch) {
	const __m128 eighth = _mm_set1_ps(0.125f);
	const __m128 three_quarters = _mm_set1_ps(0.75f);

	for (int i = 0; i < m128_pitch; ++i) {
		const __m128 outer = _mm_mul_ps(_mm_add_ps(temp1[i], temp3[i]), eighth);
		const __m128 centre = _mm_mul_ps(temp2[i], three_quarters);
		const __m128 expanded = _mm_add_ps(outer, centre);

		_mm_store_ps((float*)&hi[i], _mm_sub_ps(expanded, hi[i]));
	}
}

// 16-bit fixed point

// 16-bit fixed-point worker for one band (rows sy..ey-1) of upper_level.
// Per upper row:
//   1. a vertically filtered lower row, scaled by 8, is written to the scratch
//      line temp_lines[t][0]: either 4*(r0 + r1) or r(-1) + 6*r0 + r(+1);
//   2. that line is filtered horizontally (2-tap or 3-tap, alternating per
//      column) and the upper row is replaced by (expanded value) - (old value);
//   3. the row is padded from width to pitch with its last computed value.
void Pyramid::LaplaceThread_Short(int t, Level* upper_level, Level* lower_level, int sy, int ey) {
	int16_t* hi_row = (int16_t*)upper_level->data + sy * upper_level->pitch;
	int16_t* lo_row = (int16_t*)lower_level->data + (sy >> 1) * lower_level->pitch;
	int16_t* scratch = (int16_t*)temp_lines[t][0];
	__m256i* scratch_vec = (__m256i*)scratch;

	const int vecs_lo = lower_level->pitch >> 4; // 16 int16 lanes per __m256i

	// lo_row is the centre row of the 3-row case / the first row of the 2-row case
	lo_row += lower_level->pitch;

	bool between_rows = upper_level->y_shift;

	for (int y = sy; y < ey; ++y) {
		const __m256i* centre = (const __m256i*)lo_row;

		// vertical pass
		if (between_rows) {
			for (int i = 0; i < vecs_lo; ++i) {
				const __m256i pair_sum = _mm256_add_epi16(centre[i], centre[i + vecs_lo]);
				_mm256_store_si256(&scratch_vec[i], _mm256_slli_epi16(pair_sum, 2));
			}
			lo_row += lower_level->pitch;
		} else {
			for (int i = 0; i < vecs_lo; ++i) {
				const __m256i mid = centre[i];
				const __m256i neighbours = _mm256_add_epi16(centre[i - vecs_lo], centre[i + vecs_lo]);
				// 6*mid formed as 2*mid + 4*mid
				const __m256i mid_x6 = _mm256_add_epi16(_mm256_slli_epi16(mid, 1), _mm256_slli_epi16(mid, 2));

				_mm256_store_si256(&scratch_vec[i], _mm256_add_epi16(neighbours, mid_x6));
			}
		}

		// horizontal pass and subtraction
		bool between_cols = upper_level->x_shift;
		int lo_x = 1;
		int hi_x = 0;

		for (; hi_x < upper_level->width; ++hi_x) {
			int expanded;
			if (between_cols) {
				expanded = (scratch[lo_x] + scratch[lo_x + 1] + 8) >> 4;
				++lo_x;
			} else {
				expanded = (scratch[lo_x - 1] + scratch[lo_x] * 6 + scratch[lo_x + 1] + 32) >> 6;
			}

			hi_row[hi_x] = expanded - hi_row[hi_x];
			between_cols = !between_cols;
		}

		// pad the remainder of the row with the last value written
		const int edge = hi_row[hi_x - 1];
		for (; hi_x < upper_level->pitch; ++hi_x) {
			hi_row[hi_x] = edge;
		}

		between_rows = !between_rows;
		hi_row += upper_level->pitch;
	}
}

/*******************************************************************************
* New Laplace (not working? 2023-05-14)
*******************************************************************************/
// note: sy is always at least even (doubly even because of FastBlur SSE requirements)
// Variant of LaplaceThread that reads lower-level rows directly (no scratch
// lines) and hands the vertical combination, horizontal expansion and
// subtraction to LaplaceExpandLineAndSub / LaplaceExpandLineAndSubShifted.
// A NULL first row tells those helpers to average two rows instead of
// combining three. t is not used here.
void Pyramid::LaplaceThreadNew(int t, Level* upper_level, Level* lower_level, int sy, int ey) {
	const int pitch_hi = upper_level->m128_pitch();
	const int pitch_lo = lower_level->m128_pitch();

	__m128* dst = (__m128*)upper_level->data + sy * pitch_hi;

	int lo_y = sy >> 1;

	// sliding window of up to three consecutive lower-level rows
	__m128* rows[3] = { NULL, NULL, NULL };

	bool between = upper_level->y_shift; // was ((sy + upper_level->y_shift) & 1); but sy is always even

	if (between) {
		++lo_y;
		rows[1] = lower_level->GetLine(lo_y++);
		rows[2] = lower_level->GetLine(lo_y++);
	} else {
		rows[0] = lower_level->GetLine(lo_y++);
		rows[1] = lower_level->GetLine(lo_y++);
	}

	for (int y = sy; y < ey; ++y) {
		__m128* above = NULL;

		if (!between) {
			rows[2] = lower_level->GetLine(lo_y++);
			above = rows[0];
		}

		if (upper_level->x_shift) {
			LaplaceExpandLineAndSubShifted(dst, above, rows[1], rows[2], pitch_hi, pitch_lo);
		} else {
			LaplaceExpandLineAndSub(dst, above, rows[1], rows[2], pitch_hi, pitch_lo);
		}

		if (between) {
			// slide the window down by one row
			__m128* recycled = rows[0];
			rows[0] = rows[1];
			rows[1] = rows[2];
			rows[2] = recycled;
		}

		dst += pitch_hi;
		between = !between;
	}
}

// Fused vertical combine + horizontal 2x expansion (unshifted layout) + subtract.
//   temp0 != NULL: source row = 0.125 * (temp0 + temp2) + 0.75 * temp1
//   temp0 == NULL: source row = 0.5 * (temp1 + temp2)
// The source row is expanded with the same windows and weights as
// LaplaceExpandUnshifted, and every vector of hi is replaced by
// (expanded value) - (old value).
//   m128_pitch_hi  number of vectors in hi
//   m128_pitch_lo  number of vectors in the temp rows; past that point the
//                  last sample is replicated
// The first source vector is now loaded explicitly before the loop. The
// previous version primed p with std::swap(p, q), which read p while it was
// still uninitialised.
void Pyramid::LaplaceExpandLineAndSub(__m128* hi, __m128* temp0, __m128* temp1, __m128* temp2, int m128_pitch_hi, int m128_pitch_lo) {
	if (m128_pitch_hi <= 0 || m128_pitch_lo <= 0) return;

	const __m128 half = _mm_set1_ps(0.5f);
	const __m128 eighth = _mm_set1_ps(0.125f);
	const __m128 three_quarters = _mm_set1_ps(0.75f);

	// _mm_set_ps lists lanes from 3 down to 0
	const __m128 expand0 = _mm_set_ps(0, 0.125f, 0.75f, 0.125f);
	const __m128 expand1 = _mm_set_ps(0, 0.5f, 0.5f, 0);
	const __m128 expand2 = _mm_set_ps(0.125, 0.75f, 0.125f, 0);
	const __m128 expand3 = _mm_set_ps(0.5f, 0.5f, 0, 0);

	// Vertical combination of the source rows at vector index x.
	const auto combine_rows = [&](int x) -> __m128 {
		if (temp0) {
			return _mm_add_ps(
				_mm_mul_ps(_mm_add_ps(temp0[x], temp2[x]), eighth),
				_mm_mul_ps(temp1[x], three_quarters)
			);
		}
		return _mm_mul_ps(_mm_add_ps(temp1[x], temp2[x]), half);
	};

	// Lane k of the result is the horizontal sum of window * expandK.
	const auto expand = [&](const __m128& window) -> __m128 {
		return _mm_hadd_ps(
			_mm_hadd_ps(_mm_mul_ps(window, expand0), _mm_mul_ps(window, expand1)),
			_mm_hadd_ps(_mm_mul_ps(window, expand2), _mm_mul_ps(window, expand3))
		);
	};

	__m128 p = combine_rows(0); // current window
	__m128 q = p;               // most recently loaded source vector
	int x_lo = 1;

	for (int x_hi = 0; x_hi < m128_pitch_hi; ++x_hi) {
		const bool odd = (x_hi & 1) != 0;

		if (odd) {
			// Odd output vectors use the window straddling p and the next source vector.
			if (x_lo < m128_pitch_lo) {
				q = combine_rows(x_lo);
			} else if (x_lo == m128_pitch_lo) {
				q = _mm_permute_ps(q, _MM_SHUFFLE(3, 3, 3, 3));
			}
			++x_lo;
			p = _mm_shuffle_ps(p, q, _MM_SHUFFLE(1, 0, 3, 2));
		}

		_mm_store_ps((float*)&hi[x_hi], _mm_sub_ps(expand(p), hi[x_hi]));

		if (odd) p = q;
	}
}

// Fused vertical combine + horizontal 2x expansion (shifted layout) + subtract.
//   temp0 != NULL: source row = 0.125 * (temp0 + temp2) + 0.75 * temp1
//   temp0 == NULL: source row = 0.5 * (temp1 + temp2)
// The source row is expanded with the same windows and weights as
// LaplaceExpandShifted, and every vector of hi is replaced by
// (expanded value) - (old value).
//   m128_pitch_hi  number of vectors in hi
//   m128_pitch_lo  number of vectors in the temp rows; past that point the
//                  last sample is replicated
// Fixes relative to the previous version:
//   - the even/odd test was `!(x_hi & 0)`, which is always true, so the branch
//     for odd output vectors was unreachable and every output vector consumed
//     a new source vector;
//   - p was read before it had been given a value.
void Pyramid::LaplaceExpandLineAndSubShifted(__m128* hi, __m128* temp0, __m128* temp1, __m128* temp2, int m128_pitch_hi, int m128_pitch_lo) {
	if (m128_pitch_hi <= 0 || m128_pitch_lo <= 0) return;

	const __m128 half = _mm_set1_ps(0.5f);
	const __m128 eighth = _mm_set1_ps(0.125f);
	const __m128 three_quarters = _mm_set1_ps(0.75f);

	// _mm_set_ps lists lanes from 3 down to 0
	const __m128 expand0 = _mm_set_ps(0, 0.5f, 0.5f, 0);
	const __m128 expand1 = _mm_set_ps(0.125, 0.75f, 0.125f, 0);
	const __m128 expand2 = _mm_set_ps(0.5f, 0.5f, 0, 0);
	const __m128 expand3 = _mm_set_ps(0, 0.125f, 0.75f, 0.125f);

	// Vertical combination of the source rows at vector index x.
	const auto combine_rows = [&](int x) -> __m128 {
		if (temp0) {
			return _mm_add_ps(
				_mm_mul_ps(_mm_add_ps(temp0[x], temp2[x]), eighth),
				_mm_mul_ps(temp1[x], three_quarters)
			);
		}
		return _mm_mul_ps(_mm_add_ps(temp1[x], temp2[x]), half);
	};

	// Lanes 0..2 come from `window`, lane 3 from `following`.
	const auto expand = [&](const __m128& window, const __m128& following) -> __m128 {
		return _mm_hadd_ps(
			_mm_hadd_ps(_mm_mul_ps(window, expand0), _mm_mul_ps(window, expand1)),
			_mm_hadd_ps(_mm_mul_ps(window, expand2), _mm_mul_ps(following, expand3))
		);
	};

	__m128 p = combine_rows(0); // current window
	__m128 q = p;               // most recently loaded source vector
	int x_lo = 1;

	for (int x_hi = 0; x_hi < m128_pitch_hi; ++x_hi) {
		__m128 expanded;

		if (!(x_hi & 1)) {
			// Even output vector: whole source vector, completed by the straddling window.
			const __m128 t = p;

			if (x_lo < m128_pitch_lo) {
				q = combine_rows(x_lo);
			} else if (x_lo == m128_pitch_lo) {
				q = _mm_permute_ps(q, _MM_SHUFFLE(3, 3, 3, 3));
			}
			++x_lo;

			p = _mm_shuffle_ps(t, q, _MM_SHUFFLE(1, 0, 3, 2));
			expanded = expand(t, p);
		} else {
			// Odd output vector: straddling window, completed by the next source vector.
			expanded = expand(p, q);
			p = q;
		}

		_mm_store_ps((float*)&hi[x_hi], _mm_sub_ps(expanded, hi[x_hi]));
	}
}

/***********************************************************************
* Average (top level)
***********************************************************************/
float Pyramid::Average() {
	int x, y;
	int fours = levels[0].width >> 2;

	__m128 m128_total = { 0 };
	__m128 one = _mm_set1_ps(1.0f);
	double total = 0;
	double row_total;

	__m128* data = (__m128*)levels[0].data;

	for (y = 0; y < levels[0].height; ++y) {
		m128_total = _mm_setzero_ps();

		for (x = 0; x < fours; ++x) {
			m128_total = _mm_add_ps(m128_total, data[x]);
		}

		m128_total = _mm_hadd_ps(m128_total, m128_total);
		m128_total = _mm_hadd_ps(m128_total, m128_total);
		row_total = _mm_cvtss_f32(m128_total);

		for (x <<= 2; x < levels[0].width; ++x) {
			row_total += ((float*)data)[x];
		}

		total += row_total;

		data += levels[0].m128_pitch();
	}

	total /= levels[0].width;
	total /= levels[0].height;

	return (float)total;
}

/***********************************************************************
* Add
***********************************************************************/
void Pyramid::Add(float add, int _levels) {
	__m128 __add = _mm_set1_ps(add);

	int lim = (std::min)(_levels, (int)levels.size() - 1);

	for (int l = 0; l < lim; ++l) {
		__m128* data = (__m128*)levels[l].data;

		for (int t = 0; t < (int)levels[l].bands.size() - 1; ++t) {
			threadpool->Queue([=]() {
				__m128* data = (__m128*)levels[l].data + levels[l].bands[t] * levels[l].m128_pitch();
				for (int y = levels[l].bands[t]; y < levels[l].bands[t + 1]; ++y) {
					for (int x = 0; x < levels[l].m128_pitch(); ++x) {
						_mm_store_ps((float*)&data[x], _mm_add_ps(__add, data[x]));
					}
					data += levels[l].m128_pitch();
				}
			});
		}

		threadpool->Wait();
	}
}

/***********************************************************************
* Multiply and add
***********************************************************************/
// Applies v = v * mul + add to every vector of the first _levels levels.
// A negative _levels counts back from the number of levels; the count is
// capped at levels.size().
void Pyramid::MultiplyAndAdd(float add, float mul, int _levels) {
	const __m128 offset = _mm_set1_ps(add);
	const __m128 scale = _mm_set1_ps(mul);

	const int n_levels = (int)levels.size();
	if (_levels < 0) _levels += n_levels;
	const int lim = (std::min)(_levels, n_levels);

	for (int i = 0; i < lim; ++i) {
		__m128* row = (__m128*)levels[i].data;

		for (int y = 0; y < levels[i].height; ++y) {
			for (int x = 0; x < levels[i].m128_pitch(); ++x) {
				const __m128 scaled = _mm_mul_ps(row[x], scale);
				_mm_store_ps((float*)&row[x], _mm_add_ps(offset, scaled));
			}
			row += levels[i].m128_pitch();
		}
	}
}

/***********************************************************************
* Multiply, add, clamp
***********************************************************************/
// Applies v = max(min(v * mul + add, 1), 0) to every vector of one level.
void Pyramid::MultiplyAddClamp(float add, float mul, int level) {
	const __m128 offset = _mm_set1_ps(add);
	const __m128 scale = _mm_set1_ps(mul);
	const __m128 upper = _mm_set1_ps(1.0f);
	const __m128 lower = _mm_set1_ps(0.0f);

	__m128* row = (__m128*)levels[level].data;

	for (int y = 0; y < levels[level].height; ++y) {
		for (int x = 0; x < levels[level].m128_pitch(); ++x) {
			const __m128 value = _mm_add_ps(offset, _mm_mul_ps(row[x], scale));
			const __m128 capped = _mm_min_ps(value, upper);
			_mm_store_ps((float*)&row[x], _mm_max_ps(capped, lower));
		}
		row += levels[level].m128_pitch();
	}
}

/***********************************************************************
* Multiply
***********************************************************************/
// Multiplies every vector of one level by mul.
//   mul == 1: nothing to do
//   mul == 0: the first height * pitch floats of the level are zero-filled
// The byte count for the zero fill is computed in size_t; previously
// height * pitch was multiplied as int first and could overflow on very
// large levels.
void Pyramid::Multiply(int level, float mul) {
	if (mul == 1) return;
	if (mul == 0) {
		memset(levels[level].data, 0, (size_t)levels[level].height * levels[level].pitch * sizeof(float));
		return;
	}

	const __m128 scale = _mm_set1_ps(mul);

	__m128* row = (__m128*)levels[level].data;

	for (int y = 0; y < levels[level].height; ++y) {
		for (int x = 0; x < levels[level].m128_pitch(); ++x) {
			_mm_store_ps((float*)&row[x], _mm_mul_ps(_mm_load_ps((float*)&row[x]), scale));
		}
		row += levels[level].m128_pitch();
	}
}

/***********************************************************************
* multiply_by_pyramid
***********************************************************************/
// Element-wise multiplies every level except the last by the corresponding
// level of b. Both pyramids are stepped with this pyramid's row pitch.
void Pyramid::MultplyByPyramid(Pyramid* b) {
	const int n_scaled = (int)levels.size() - 1;

	for (int l = 0; l < n_scaled; ++l) {
		__m128* dst_row = (__m128*)levels[l].data;
		__m128* src_row = (__m128*)b->levels[l].data;

		for (int y = 0; y < levels[l].height; ++y) {
			for (int x = 0; x < levels[l].m128_pitch(); ++x) {
				const __m128 lhs = _mm_load_ps((float*)&dst_row[x]);
				const __m128 rhs = _mm_load_ps((float*)&src_row[x]);
				_mm_store_ps((float*)&dst_row[x], _mm_mul_ps(lhs, rhs));
			}
			dst_row += levels[l].m128_pitch();
			src_row += levels[l].m128_pitch();
		}
	}
}

/***********************************************************************
* blend
***********************************************************************/
// Blends pyramid _b into this pyramid under control of `mask`, level by level.
// Each band of a level is handed to FuseThread as one queued task, and all
// tasks of a level are waited for before the next level is started.
// pre and black are forwarded unchanged to FuseThread.
void Pyramid::Fuse(Pyramid* _b, Pyramid* mask, bool pre = false, int black = 0x00) {
	const int n_levels = (int)levels.size();

	for (int l = 0; l < n_levels; ++l) {
		// fuse doesn't see any gains from multithreading; leave this here as reference
		const int n_bands = (int)levels[l].bands.size() - 1;

		for (int t = 0; t < n_bands; ++t) {
			threadpool->Queue([=]{
				Level& target = levels[l];
				FuseThread((__m128*)target.data, (__m128*)_b->levels[l].data, (__m128*)mask->levels[l].data, target.m128_pitch(), target.bands[t], target.bands[t + 1], pre, black);
			});
		}

		threadpool->Wait();
	}
}

// Blends rows sy..ey-1 of b into a, using mask m (all three share m128_pitch).
//   !pre           : a = a + (b - a) * m
//   pre, black == 0: a = b + a * (1 - m)
//   pre, black != 0: a = black + ((b - black) + (a - black) * (1 - m))
void Pyramid::FuseThread(__m128* a, __m128* b, __m128* m, int m128_pitch, int sy, int ey, bool pre, int black) {
	const int first = sy*m128_pitch;
	const int count = (ey - sy)*m128_pitch;

	__m128* dst = a + first;
	__m128* src = b + first;
	__m128* alpha = m + first;

	if (!pre) {
		for (int p = 0; p < count; ++p) {
			const __m128 current = dst[p];
			const __m128 delta = _mm_sub_ps(src[p], current);
			_mm_store_ps((float*)&dst[p], _mm_add_ps(current, _mm_mul_ps(delta, alpha[p])));
		}
		return;
	}

	const __m128 ones = _mm_set1_ps(1.0f);
	const __m128 blacks = _mm_set1_ps((float)black);

	if (!black) {
		for (int p = 0; p < count; ++p) {
			const __m128 kept = _mm_mul_ps(dst[p], _mm_sub_ps(ones, alpha[p]));
			_mm_store_ps((float*)&dst[p], _mm_add_ps(src[p], kept));
		}
		return;
	}

	for (int p = 0; p < count; ++p) {
		const __m128 incoming = _mm_sub_ps(src[p], blacks);
		const __m128 kept = _mm_mul_ps(_mm_sub_ps(dst[p], blacks), _mm_sub_ps(ones, alpha[p]));
		_mm_store_ps((float*)&dst[p], _mm_add_ps(blacks, _mm_add_ps(incoming, kept)));
	}
}

// Linear blend towards pyramid b with one constant weight on every level:
//   a = a + (b - a) * weight
void Pyramid::Fuse(Pyramid* b, float weight) {
	const __m128 w = _mm_set1_ps(weight);
	const int n_levels = (int)levels.size();

	for (int l = 0; l < n_levels; ++l) {
		__m128* dst = (__m128*)levels[l].data;
		__m128* src = (__m128*)b->levels[l].data;

		const int count = levels[l].height*levels[l].m128_pitch();

		for (int p = 0; p < count; ++p) {
			const __m128 current = dst[p];
			const __m128 delta = _mm_sub_ps(src[p], current);
			_mm_store_ps((float*)&dst[p], _mm_add_ps(current, _mm_mul_ps(delta, w)));
		}
	}
}

/***********************************************************************
* denoise
***********************************************************************/
#ifdef PYR_DENOISE
// Attenuates small values of one level. Each sample d is replaced by
//   d * 0.5 * (1 - cos(min(|d * k|, pi)))
// where k = 1 / power, or 1 / power^2 when gamma is set.
// Does nothing when power is 0.
void Pyramid::Denoise(int level, float power, bool gamma) {
	if (power == 0) return;

	const __m128 one = _mm_set1_ps(1);
	const __m128 half = _mm_set1_ps(0.5f);
	const __m128 pi = _mm_set1_ps(3.14159265359f);
	// clears the sign bit of every lane, i.e. takes the absolute value
	const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));

	__m128 inv_power = _mm_set1_ps(power);
	if (gamma) inv_power = _mm_mul_ps(inv_power, inv_power);
	inv_power = _mm_div_ps(one, inv_power);

	__m128* row = (__m128*)levels[level].data;
	for (int y = 0; y < levels[level].height; ++y) {
		for (int x = 0; x < levels[level].m128_pitch(); ++x) {
			const __m128 d = row[x];
			const __m128 phase = _mm_min_ps(_mm_and_ps(_mm_mul_ps(d, inv_power), abs_mask), pi);
			const __m128 gain = _mm_mul_ps(_mm_sub_ps(one, cos_ps(phase)), half);
			_mm_store_ps((float*)&row[x], _mm_mul_ps(gain, d));
		}
		row += levels[level].m128_pitch();
	}
}
#endif

/***********************************************************************
* blend (base swap)
***********************************************************************/
// Replaces the last level of this pyramid with the same-numbered level of b.
// Does nothing when b has fewer levels than this pyramid.
// height * pitch floats are copied, sized from this pyramid's level; the byte
// count is computed in size_t (previously the product was formed in int and
// could overflow on very large levels).
void Pyramid::Blend(Pyramid* b) {
	if (b->GetNLevels() < GetNLevels()) return;

	const auto base = GetNLevels() - 1;
	const size_t bytes = (size_t)levels[base].height * levels[base].pitch * sizeof(float);

	memcpy(levels[base].data, b->levels[base].data, bytes);
}

/***********************************************************************
* approximate gaussian blur
***********************************************************************/
// Column-gather helpers for the blur threads. They expect float* line0..line3,
// __m128 temp1/temp2 and int left/right to be in scope at the point of use.
// BLUR_SSE_GET: y = { line0[x], line1[x], line2[x], line3[x] } in lanes 0..3.
#define BLUR_SSE_GET(y, x) y = _mm_set_ps(line3[x], line2[x], line1[x], line0[x])
//#define BLUR_SSE_GET2(x) _mm_load_ps((float*)&transposed[x])
// The next two gather column `left` into temp1 / column `right` into temp2 and
// then advance the index. Each expands to TWO statements, so neither may be
// used as the unbraced body of an if/for/while.
#define BLUR_SSE_GET_LEFT temp1 = _mm_set_ps(line3[left], line2[left], line1[left], line0[left]); left++;
#define BLUR_SSE_GET_RIGHT temp2 = _mm_set_ps(line3[right], line2[right], line1[right], line0[right]); right++;

// Queues one BlurXTransposeThread task per band of level 0, then waits for
// all of them to finish.
void Pyramid::BlurXTranspose(float radius, Pyramid* transpose) {
	const int n_bands = (int)levels[0].bands.size() - 1;

	int band = 0;
	while (band < n_bands) {
		threadpool->Queue([=] { BlurXTransposeThread(radius, transpose, levels[0].bands[band], levels[0].bands[band + 1]); });
		++band;
	}

	threadpool->Wait();
}

// Horizontal running-sum blur of rows sy..ey-1 of level 0, written transposed
// into level 0 of `transpose`.
// Four rows are processed at a time (one per SSE lane). For every column x the
// un-normalised sum over [x - iradius, x + iradius] is produced, plus the two
// samples just outside that window weighted by the fractional part of radius;
// columns outside the image are clamped to the edge. The four results for
// column x are stored at row x of the transposed image, columns sy + 4*group.
// Assumes at least four readable rows per group (the +3 in `fours` rounds up)
// and that `transpose` is a different buffer from this pyramid.
// Changes relative to the previous version:
//   - temp2 is initialised in the wide-radius branch; it used to be read
//     uninitialised when the level was only one column wide;
//   - the output offset and the first-row offset are computed in size_t so
//     they cannot overflow int on very large levels.
void Pyramid::BlurXTransposeThread(float radius, Pyramid* transpose, int sy, int ey) {
	const int width = levels[0].width;
	const int row_step = levels[0].pitch << 2; // four rows per group
	const int out_pitch = transpose->levels[0].pitch;

	float* line0 = (float*)levels[0].data + (size_t)sy * levels[0].pitch;
	float* line1 = line0 + levels[0].pitch;
	float* line2 = line1 + levels[0].pitch;
	float* line3 = line2 + levels[0].pitch;
	float* out = (float*)transpose->levels[0].data + sy;
	__m128 temp1, temp2;

	const int iradius = (int)floor(radius);
	const __m128 irp1 = _mm_set1_ps((float)(iradius + 1));
	const __m128 mul = _mm_set1_ps(radius - iradius);
	__m128 acc;

	int x;
	size_t o;
	int left, right;

	const int fours = (ey - sy + 3) >> 2; // +3 is probably not necessary because all bands are mod 4

	if (iradius < width >> 1) {
		for (int y = 0; y < fours; ++y) {
			left = 0;

			BLUR_SSE_GET_LEFT;

			// window for x = 0: the left half is all clamped to column 0
			acc = _mm_mul_ps(temp1, irp1);
			for (right = 1; right < iradius + 1;) {
				BLUR_SSE_GET_RIGHT;
				acc = _mm_add_ps(acc, temp2);
			}

			x = 0;
			o = 0;
			right = iradius + 1;

			// left edge: the outgoing sample is still column 0
			for (int i = 0; i <= iradius; ++i) {
				BLUR_SSE_GET_RIGHT;
				_mm_store_ps(&out[o], _mm_add_ps(acc, _mm_mul_ps(_mm_add_ps(temp1, temp2), mul)));
				o += out_pitch;
				acc = _mm_add_ps(_mm_sub_ps(temp2, temp1), acc);
				++x;
			}

			// interior: both ends of the window move
			while (right < width) {
				BLUR_SSE_GET_RIGHT;
				_mm_store_ps(&out[o], _mm_add_ps(acc, _mm_mul_ps(_mm_add_ps(temp1, temp2), mul)));
				o += out_pitch;
				BLUR_SSE_GET_LEFT;
				acc = _mm_add_ps(_mm_sub_ps(temp2, temp1), acc);
				++x;
			}

			// right edge: the incoming sample stays at the last column
			while (x < width) {
				_mm_store_ps(&out[o], _mm_add_ps(acc, _mm_mul_ps(_mm_add_ps(temp1, temp2), mul)));
				o += out_pitch;
				BLUR_SSE_GET_LEFT;
				acc = _mm_add_ps(_mm_sub_ps(temp2, temp1), acc);
				++x;
			}

			line0 += row_step;
			line1 += row_step;
			line2 += row_step;
			line3 += row_step;
			out += 4;
		}
	} else {
// if radius is wider than image
		for (int y = 0; y < fours; ++y) {
			BLUR_SSE_GET(temp1, 0);
			temp2 = temp1; // defined even if there is no column 1 to load
			acc = _mm_mul_ps(temp1, irp1);
			right = 1;
			for (x = 1; x < iradius + 1; ++x) {
				if (right < width) {
					BLUR_SSE_GET(temp2, right);
					++right;
				}
				acc = _mm_add_ps(acc, temp2);
			}

			o = 0;
			left = -iradius;

			for (x = 0; x < width; ++x) {
				if (right < width) {
					BLUR_SSE_GET(temp2, right);
					++right;
				}
				_mm_store_ps(&out[o], _mm_add_ps(acc, _mm_mul_ps(_mm_add_ps(temp1, temp2), mul)));
				o += out_pitch;
				// the outgoing sample stays clamped to column 0 until left turns positive
				if (left > 0) {
					BLUR_SSE_GET(temp1, left);
				}
				++left;
				acc = _mm_add_ps(_mm_sub_ps(temp2, temp1), acc);
			}

			line0 += row_step;
			line1 += row_step;
			line2 += row_step;
			line3 += row_step;
			out += 4;
		}
	}
}

/***********************************************************************
* Gaussian blur (2)
***********************************************************************/
// Queues one BlurXThreadNonSSE task per band of level 0, then waits for all of
// them to finish.
void Pyramid::BlurX(Pyramid* out, float radius) {
	const int n_bands = (int)levels[0].bands.size() - 1;

	int band = 0;
	while (band < n_bands) {
		threadpool->Queue([=] { BlurXThreadNonSSE(out, radius, levels[0].bands[band], levels[0].bands[band + 1]); });
		++band;
	}

	threadpool->Wait();
}

// SSE horizontal running-sum blur of rows sy..ey-1 of level 0, written (not
// transposed) to level 0 of out_pyramid. BlurX currently queues
// BlurXThreadNonSSE instead of this function.
// Four rows are processed at a time, one per SSE lane. For every column x the
// un-normalised sum over [x - iradius, x + iradius] is produced, plus the two
// samples just outside that window weighted by the fractional part of radius;
// columns outside the image are clamped to the edge. Results are collected four
// columns at a time, transposed back to row order and stored.
// Assumes out_pyramid does not alias this pyramid (input columns are still
// being read after earlier output columns have been stored) and that the
// output pitch is at least width rounded up to a multiple of 4.
// Fixes relative to the previous version:
//   - a finished group of four columns was stored at x & ~3 after x had
//     already been advanced, i.e. four columns too far to the right;
//   - a trailing group of fewer than four columns was never stored;
//   - the wide-radius branch stored into the input rows instead of the output;
//   - temp2 could be read uninitialised when the level was one column wide.
void Pyramid::BlurXThread(Pyramid* out_pyramid, float radius, int sy, int ey) {
	const int width = levels[0].width;
	const int in_step = levels[0].pitch << 2;               // four rows per group
	const int out_step = out_pyramid->levels[0].pitch << 2;

	float* line0 = (float*)levels[0].data + sy * levels[0].pitch;
	float* line1 = line0 + levels[0].pitch;
	float* line2 = line1 + levels[0].pitch;
	float* line3 = line2 + levels[0].pitch;
	float* out_line0 = (float*)out_pyramid->levels[0].data + sy * out_pyramid->levels[0].pitch;
	float* out_line1 = out_line0 + out_pyramid->levels[0].pitch;
	float* out_line2 = out_line1 + out_pyramid->levels[0].pitch;
	float* out_line3 = out_line2 + out_pyramid->levels[0].pitch;

	__m128 cols[4] = { _mm_setzero_ps(), _mm_setzero_ps(), _mm_setzero_ps(), _mm_setzero_ps() };
	__m128 temp1, temp2;

	const int iradius = (int)floor(radius);
	const __m128 irp1 = _mm_set1_ps((float)(iradius + 1));
	const __m128 mul = _mm_set1_ps(radius - iradius);
	__m128 acc;

	int x = 0;
	int left, right;

	// Transposes the four collected columns and stores them at columns first..first + 3.
	const auto store_group = [&](int first) {
		_MM_TRANSPOSE4_PS(cols[0], cols[1], cols[2], cols[3]);
		_mm_store_ps(&out_line0[first], cols[0]);
		_mm_store_ps(&out_line1[first], cols[1]);
		_mm_store_ps(&out_line2[first], cols[2]);
		_mm_store_ps(&out_line3[first], cols[3]);
	};

	// Records the result for column x, advances x, and stores once four columns are collected.
	const auto emit = [&](const __m128& column) {
		cols[x & 3] = column;
		++x;
		if (!(x & 3)) store_group(x - 4);
	};

	// Stores a trailing group of fewer than four columns; the unused slots
	// repeat the last column and land in the row padding.
	const auto flush = [&]() {
		if (!(x & 3)) return;
		const __m128 last = cols[(x - 1) & 3];
		for (int k = x & 3; k < 4; ++k) cols[k] = last;
		store_group(x & ~3);
	};

	const int fours = (ey - sy + 3) >> 2; // +3 is probably not necessary because all bands are mod 4
	const bool narrow = iradius < width >> 1;

	for (int y = 0; y < fours; ++y) {
		x = 0;

		if (narrow) {
			left = 0;

			BLUR_SSE_GET_LEFT;

			// window for x = 0: the left half is all clamped to column 0
			acc = _mm_mul_ps(temp1, irp1);
			for (right = 1; right < iradius + 1;) {
				BLUR_SSE_GET_RIGHT;
				acc = _mm_add_ps(acc, temp2);
			}

			right = iradius + 1;

			// left edge: the outgoing sample is still column 0
			for (int i = 0; i <= iradius; ++i) {
				BLUR_SSE_GET_RIGHT;
				emit(_mm_add_ps(acc, _mm_mul_ps(_mm_add_ps(temp1, temp2), mul)));
				acc = _mm_add_ps(_mm_sub_ps(temp2, temp1), acc);
			}

			// interior: both ends of the window move
			while (right < width) {
				BLUR_SSE_GET_RIGHT;
				emit(_mm_add_ps(acc, _mm_mul_ps(_mm_add_ps(temp1, temp2), mul)));
				BLUR_SSE_GET_LEFT;
				acc = _mm_add_ps(_mm_sub_ps(temp2, temp1), acc);
			}

			// right edge: the incoming sample stays at the last column
			while (x < width) {
				emit(_mm_add_ps(acc, _mm_mul_ps(_mm_add_ps(temp1, temp2), mul)));
				BLUR_SSE_GET_LEFT;
				acc = _mm_add_ps(_mm_sub_ps(temp2, temp1), acc);
			}
		} else {
// if radius is wider than image
			BLUR_SSE_GET(temp1, 0);
			temp2 = temp1; // defined even if there is no column 1 to load
			acc = _mm_mul_ps(temp1, irp1);
			right = 1;
			for (int k = 1; k < iradius + 1; ++k) {
				if (right < width) {
					BLUR_SSE_GET(temp2, right);
					++right;
				}
				acc = _mm_add_ps(acc, temp2);
			}

			left = -iradius;

			while (x < width) {
				if (right < width) {
					BLUR_SSE_GET(temp2, right);
					++right;
				}
				emit(_mm_add_ps(acc, _mm_mul_ps(_mm_add_ps(temp1, temp2), mul)));

				// the outgoing sample stays clamped to column 0 until left turns positive
				if (left > 0) {
					BLUR_SSE_GET(temp1, left);
				}
				++left;
				acc = _mm_add_ps(_mm_sub_ps(temp2, temp1), acc);
			}
		}

		flush();

		line0 += in_step;
		line1 += in_step;
		line2 += in_step;
		line3 += in_step;
		out_line0 += out_step;
		out_line1 += out_step;
		out_line2 += out_step;
		out_line3 += out_step;
	}
}

// Scalar (non-SSE) horizontal box blur of level 0 for rows [sy, ey), done in place.
//
// Each output sample is
//     sum(window of 2*iradius+1 samples) + (outer_left + outer_right) * frac
// where iradius = floor(radius), frac = radius - iradius, and outer_left /
// outer_right are the samples just outside the window.  Samples beyond either
// end of the row are replaced by the nearest edge sample.  The result is NOT
// normalised here (no division by the window size).
//
// out_pyramid is accepted for signature compatibility but is not used: the
// blurred row is built in a scratch buffer and copied back over the input row.
// radius is presumed to be >= 0 -- TODO confirm against callers.
//
// Changes relative to the previous implementation:
//  * the "radius wider than image" loop never advanced x and so never ended;
//  * the scratch buffer was only freed in the first branch (leak);
//  * the leading sample could be read uninitialised when width == 1;
//  * only the `width` samples actually computed are copied back, instead of
//    `pitch` samples including uninitialised scratch memory.
void Pyramid::BlurXThreadNonSSE(Pyramid* out_pyramid, float radius, int sy, int ey) {
	(void)out_pyramid; // unused, see above

	const int width = levels[0].width;
	const int pitch = levels[0].pitch;
	if (width <= 0 || sy >= ey) return;

	const int iradius = (int)floor(radius);
	const int irp1 = iradius + 1;
	const float mul = radius - iradius; // fractional part of the radius

	// size_t arithmetic so that large images cannot overflow the row offset
	float* line = (float*)levels[0].data + (size_t)sy * pitch;

	// Scratch row: the input row is still being read while outputs are produced
	std::vector<float> row(width);
	const size_t row_bytes = (size_t)width * sizeof(float);

	if (iradius < width >> 1) {
		// Window fits comfortably inside the row: three phases without
		// per-sample bounds checks.
		for (int y = sy; y < ey; ++y, line += pitch) {
			float trailing = line[0];  // sample leaving the window (left side)
			float leading = trailing;  // sample entering the window (right side)
			float acc = trailing * irp1; // left half of the window is all line[0]

			int left = 1;
			int right = 1;
			while (right <= iradius) {
				leading = line[right++];
				acc += leading;
			}

			int x = 0;

			// Left edge: the trailing sample is still the clamped line[0]
			for (int i = 0; i <= iradius; ++i) {
				leading = line[right++];
				row[x++] = acc + (trailing + leading) * mul;
				acc += leading - trailing;
			}

			// Interior: both ends of the window move through real samples
			while (right < width) {
				leading = line[right++];
				row[x++] = acc + (trailing + leading) * mul;
				trailing = line[left++];
				acc += leading - trailing;
			}

			// Right edge: the leading sample stays clamped to the last one read
			while (x < width) {
				row[x++] = acc + (trailing + leading) * mul;
				trailing = line[left++];
				acc += leading - trailing;
			}

			memcpy(line, row.data(), row_bytes);
		}
	} else {
		// Radius is wider than (half) the image: clamp both ends on every step.
		for (int y = sy; y < ey; ++y, line += pitch) {
			float trailing = line[0];
			float leading = trailing; // stays line[0] when the row has one sample
			float acc = trailing * irp1;

			int right = 1;
			for (int k = 1; k <= iradius; ++k) {
				if (right < width) leading = line[right++];
				acc += leading;
			}

			int left = -iradius; // index of the sample leaving the window at x
			for (int x = 0; x < width; ++x) {
				if (right < width) leading = line[right++];
				row[x] = acc + (trailing + leading) * mul;

				if (left > 0) trailing = line[left];
				++left;
				acc += leading - trailing;
			}

			memcpy(line, row.data(), row_bytes);
		}
	}
}


/*
void Pyramid::BlurXThread(Pyramid* out, float radius, int sy, int ey) {
	__m128 acc;
	int fours = (levels[0].width + 3) >> 2;

	float* in_p = levels[0].data + sy * levels[0].pitch;
	float* out_p = out->levels[0].data + sy * out->levels[0].pitch;
	int iradius = (int)floor(radius);
	__m128 mul = _mm_set1_ps(radius - iradius);
	int irp1 = iradius + 1;

	for (int y = sy; y < ey; ++y) {
		float acc1_f = irp1 * in_p[0];
		for (int x = 1; x <= iradius; ++x) {
			acc1_f += in_p[(std::min)(x, levels[0].width - 1)];
		}

		acc.m128_f32[0] = acc1_f;
		for (int i = 1; i < 8; ++i) {
			acc1_f += -in_p[(std::max)(0, i - irp1)] + in_p[(std::min)(i + iradius, levels[0].width - 1)];
			acc.m128_f32[i] = acc1_f;
		}

		__m128 left;
		if (iradius >= 2) {
			left = _mm_load_ps1(in_p);
		} else {
			left = _mm_load_ps(in_p);
			switch (iradius) {
				case 0: left = _MM_SHUFFLE_PS1(left, left, _MM_SHUFFLE(2, 1, 0, 0)); break;
				case 1: left = _MM_SHUFFLE_PS1(left, left, _MM_SHUFFLE(1, 0, 0, 0)); break;
			}
		}

		__m128 right;
		int diff = levels[0].width - irp1;
		if (diff >= 4) { // first member of "right" will be in[irp1]
			right = _mm_loadu_ps(&in_p[irp1]);
		} else {
			if (diff <= 1) {
				right = _mm_load_ps1(&in_p[levels[0].width - 1]);
			} else {
				_mm_loadu_ps(&in_p[levels[0].width - 4]); // load last four pixels
				switch (diff) {
					case 2: left = _mm_shuffle_ps(left, left, _MM_SHUFFLE(3, 2, 2, 2)); break;
					case 3: left = _mm_shuffle_ps(left, left, _MM_SHUFFLE(3, 2, 1, 1)); break;
				}
			}
		}

		for (int x = 0; x < fours; x+=4) {
			_mm_store_ps(&out_p[x], _mm_add_ps(acc, _mm_mul_ps(_mm_add_ps(left, right), mul)));
		}

		in_p += levels[0].pitch;
		out_p += out->levels[0].pitch;
	}
}
*/

// Placeholder: the body is empty, so calling this has no effect and both
// `out` and `radius` are ignored.
void Pyramid::BlurY(Pyramid* out, float radius) {
}

// Placeholder: the body is empty, so calling this has no effect and `out`,
// `radius`, `sy` and `ey` are all ignored.
void Pyramid::BlurYThread(Pyramid* out, float radius, int sy, int ey) {
}

#ifdef PNGER
/***********************************************************************
* PNG debug
***********************************************************************/
void Pyramid::Png(const char* filename) {
	int width = levels[0].pitch;
	int height = levels[0].height + (levels.size()>1 ? 1 + levels[1].height : 0);
	uint8_t* temp = (uint8_t*)calloc(width * height, 1);

	int px = 0, py = 0;

	for (int l = 0; l < (int)levels.size(); ++l) {
		float* data = (float*)levels[l].data;
		uint8_t* line = temp + py * levels[0].pitch + px;
		for (int y = 0; y < levels[l].height; ++y) {
			for (int x = 0; x < levels[l].pitch; ++x) {
				int f = (int)floor(data[x] + 0.5);
				line[x] = std::max(0, std::min(255, f));
			}
			line += levels[0].pitch;
			data += levels[l].pitch;
		}
		if (l & 1) px += levels[l].pitch + 1; else py += levels[l].height + 1;
	}

	Pnger::Quick((char*)filename, temp, width, height, width, PNG_COLOR_TYPE_GRAY);

	free(temp);
}
#endif

// Debug helper: renders every pyramid level into one 8-bit greyscale canvas
// and writes it to `filename` as a binary PGM ("P5", maxval 255).
//
// Layout: level 0 sits at the top-left; each following level is placed below
// the previous one (after even levels) or to its right (after odd levels),
// separated by one blank pixel.  The canvas is levels[0].pitch wide and
// levels[0].height + 1 + levels[1].height tall (just levels[0].height when
// there is a single level).
//
// Sample conversion:
//   short_data : abs(sample >> shift), clamped to [0, 255]
//   float data : rounded to nearest, clamped to [0, 255] (shift is unused)
//
// The canvas is zero-filled, so the gaps between levels are black rather than
// uninitialised memory, and each level is clipped to the canvas: padded
// pitches can make the smaller levels extend past the right/bottom edge, which
// used to wrap into the next row or write past the end of the buffer.
void Pyramid::Pgm(std::string filename, int shift) {
	const int width = levels[0].pitch;
	const int height = levels[0].height + (levels.size() > 1 ? 1 + levels[1].height : 0);

	std::vector<uint8_t> canvas((size_t)width * height, 0);

	int px = 0, py = 0;

	for (int l = 0; l < (int)levels.size(); ++l) {
		const int level_pitch = levels[l].pitch;
		const int level_height = levels[l].height;

		// Portion of this level that actually fits on the canvas
		const int cols = (std::min)(level_pitch, width - px);
		const int rows = (std::min)(level_height, height - py);

		for (int y = 0; y < rows && cols > 0; ++y) {
			uint8_t* dst = canvas.data() + (size_t)(py + y) * width + px;

			if (short_data) {
				const int16_t* src = (const int16_t*)levels[l].data + (size_t)y * level_pitch;
				for (int x = 0; x < cols; ++x) {
					int v = abs(src[x] >> shift);
					dst[x] = (uint8_t)(std::max)(0, (std::min)(255, v));
				}
			} else {
				const float* src = (const float*)levels[l].data + (size_t)y * level_pitch;
				for (int x = 0; x < cols; ++x) {
					int v = (int)floor(src[x] + 0.5f);
					dst[x] = (uint8_t)(std::max)(0, (std::min)(255, v));
				}
			}
		}

		// Alternate between stepping right and stepping down for the next level
		if (l & 1) px += level_pitch + 1; else py += level_height + 1;
	}

	std::ofstream out_file(filename, std::ios::binary);
	out_file << "P5\n " << width << "\n" << height << "\n255\n";
	out_file.write((const char*)canvas.data(), (std::streamsize)canvas.size());
}
