#version 450
#extension GL_GOOGLE_include_directive : enable

#include "d3d9_convert_common.h"

// Workgroup size: 8 x 8 x 1 invocations. Each invocation of main()
// converts one horizontal pair of output pixels.
layout(
  local_size_x = 8,
  local_size_y = 8,
  local_size_z = 1) in;

// Destination image. main() only ever writes to it (via imageStore).
layout(binding = 0)
writeonly uniform image2D dst;

// Source texel buffer holding the packed NV12 data as unsigned integers.
// NOTE(review): each texel is unpacked as two 8-bit samples, so the buffer
// view is presumably a 16-bit unsigned format -- confirm against the host code.
layout(binding = 1) uniform usamplerBuffer src;

// Push constants supplied by the host.
layout(push_constant)
uniform u_info_t {
  // Upper bound (exclusive) for the global invocation ID in main().
  // extent.x is also used as the row pitch, in src texels, and
  // extent.x * extent.y as the texel offset of the chroma plane.
  uvec2 extent;
} u_info;

// Fetches one texel from `source` at index `offset` and unpacks its red
// channel into two normalized floats.
//
// source: texel buffer to read from.
// offset: texel index into `source`.
// returns: the two unpacked samples as a vec2.
//
// unpackUnorm2x8 is not defined in this file; it is presumably provided by
// d3d9_convert_common.h.
vec2 fetchUnorm2x8(usamplerBuffer source, uint offset) {
  // Read through the parameter rather than the global `src` binding so the
  // helper honours whichever buffer the caller passes in.
  return unpackUnorm2x8(texelFetch(source, int(offset)).r);
}

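// NV12 memory layout: a plane of luma (Y) samples, followed by a plane of
// interleaved chroma (UV) pairs with one chroma row per two luma rows:
// YYYYYYYYYYYYYYY...
// YYYYYYYYYYYYYYY...
// UVUVUVUVUVUVUVU...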

// Converts one [2, 1] macropixel per invocation: reads two luma samples and
// the shared chroma pair from src, converts both to RGB and stores the two
// resulting pixels side by side in dst.
void main() {
  ivec2 coord = ivec2(gl_GlobalInvocationID.xy);

  // Invocations outside the extent given by the push constants do nothing.
  if (!all(lessThan(coord, u_info.extent)))
    return;

  // Row pitch and luma plane size, both measured in src texels.
  uint rowPitch  = u_info.extent.x;
  uint planeSize = u_info.extent.x * u_info.extent.y;

  // One texel of the first plane carries two neighbouring luma samples.
  uint lumaOffset = coord.x
                  + coord.y * rowPitch;

  // The chroma plane starts right after the luma plane. It is subsampled
  // as [2, 2], so two luma rows share a single chroma row; the horizontal
  // factor is already covered by reading two samples per texel.
  uint chromaOffset = coord.x
                    + coord.y / 2 * rowPitch
                    + planeSize;

  // Remove the luma offset of 16 and re-centre chroma around zero.
  vec2 luma   = fetchUnorm2x8(src, lumaOffset)   - (16 / 255.0);
  vec2 chroma = fetchUnorm2x8(src, chromaOffset) - (128 / 255.0);

  // The NV12 format seems to use the BT.709 color space.
  vec4 leftColor  = convertBT_709(vec3(luma.x, chroma));
  vec4 rightColor = convertBT_709(vec3(luma.y, chroma));

  // Each invocation covers two horizontally adjacent output pixels.
  ivec2 leftTexel = ivec2(coord.x * 2, coord.y);

  imageStore(dst, leftTexel,               leftColor);
  imageStore(dst, leftTexel + ivec2(1, 0), rightColor);
}