#include <SPI.h>
#include "luts.h"

#define DIN 11
#define CS  12
#define CLK 13
#define X_SEGMENTS   4
#define Y_SEGMENTS   4
#define NUM_SEGMENTS (X_SEGMENTS * Y_SEGMENTS)

// a framebuffer to hold the state of the entire matrix of LEDs
// laid out in raster order, with (0, 0) at the top-left
byte fb[8 * NUM_SEGMENTS];

void setup() {
  Serial.begin(2000000);
  pinMode(CLK, OUTPUT);
  pinMode(DIN, OUTPUT);
  pinMode(CS, OUTPUT);
  SPI.beginTransaction(SPISettings(16000000, MSBFIRST, SPI_MODE0));

  // configure each MAX7219
  shiftAll(0x0f, 0x00); // display test register - test mode off
  shiftAll(0x0b, 0x07); // scan limit register - display digits 0 thru 7
  shiftAll(0x0c, 0x01); // shutdown register - normal operation
  shiftAll(0x0a, 0x0f); // intensity register - max brightness
  shiftAll(0x09, 0x00); // decode mode register - No decode
}

void loop() {
  // generate three different frequencies of sine/cosine waves
  static int16_t sx1 = 14 << 8, sx2 = sx1, sx3, sy1, sy2, sy3 = 127 << 8;
  sx1 -= sy1 >> 6, sy1 += sx1 >> 6;
  sx2 -= sy2 >> 5, sy2 += sx2 >> 5;
  sx3 -= sy3 >> 8, sy3 += sx3 >> 8;
  // move the origin in a Lissajous curve, and to-and-fro on a sine
  tunnel((sx1 >> 6) - X_SEGMENTS * 16, (sx2 >> 6) - Y_SEGMENTS * 16, sx3 >> 6);

  if (1) { // show frame timings
    static uint64_t fps_us;
    static uint16_t frame;
    if (++frame == 1024) {
      uint32_t time_us = micros() - fps_us;
      Serial.print(time_us / float(frame));
      Serial.print("us\t");
      Serial.print(frame * 1000000.f / time_us, 3);
      Serial.println("FPS");
      fps_us = micros();
      frame = 0;
    }
  }

  // cap the refresh rate to 72Hz
  uint32_t fps_goal_us = 1000000 / 72;
  static uint64_t next_frame_us = 0;
  if ((next_frame_us += fps_goal_us) < micros())
    next_frame_us = micros();
  while (const uint64_t delay_us = next_frame_us - micros() < fps_goal_us)
    delayMicroseconds(delay_us);

  show();
}


// write data to the config registers of each MAX7219
void shiftAll(const byte send_to_address, const byte send_this_data) {
  digitalWrite(CS, LOW);
  for (int i = 0; i < NUM_SEGMENTS; i++) {
    SPI.transfer(send_to_address);
    SPI.transfer(send_this_data);
  }
  digitalWrite(CS, HIGH);
}


#pragma GCC push_options
#pragma GCC optimize "-O2"

// send the raster order framebuffer in the correct order
// for the boustrophedon layout of daisy-chained MAX7219s
void show() {
  for (byte row = 0; row < 8; row++) {
    digitalWrite(CS, LOW);
    byte segment = NUM_SEGMENTS;
    while (segment--) {
      byte x = segment % X_SEGMENTS;
      byte y = segment / X_SEGMENTS * 8;
      byte addr = (row + y) * X_SEGMENTS;

      if (segment & X_SEGMENTS) { // odd rows of segments
        SPI.transfer(8 - row);
        byte c = fb[addr + x];
        // reverse the byte (LSB to MSB)
        c = ((c >> 1) & 0x55) | ((c << 1) & 0xAA);
        c = ((c >> 2) & 0x33) | ((c << 2) & 0xCC);
        c = (c >> 4) | (c << 4);
        SPI.transfer(c);
      } else { // even rows of segments
        SPI.transfer(1 + row);
        SPI.transfer(fb[addr - x + X_SEGMENTS - 1]);
      }
    }
    digitalWrite(CS, HIGH);
  }
}
#pragma GCC pop_options


// integer square root
uint8_t isqrt16(uint16_t x) {
  uint8_t res = 0;
  uint8_t add = 0x80;
  do {
    uint8_t t = res | add;
    uint16_t t2 = (uint16_t) t * t;
    if (x >= t2) res = t;
  } while (add >>= 1);
  return res;
}

uint16_t isqrt32(uint32_t x) {
  uint16_t res = 0;
  uint16_t add = 0x8000;
  do {
    uint16_t t = res | add;
    uint32_t t2 = (uint32_t) t * t;
    if (x >= t2) res = t;
  } while (add >>= 1);
  return res;
}


#pragma GCC push_options
#pragma GCC optimize "-Ofast"
inline void __attribute__((always_inline)) emit_pixel(byte* &dst, const uint16_t radius_pos, const uint16_t xroot, const uint8_t screenx) {
  static byte out = 0;
  out <<= 1;
  if ((depth(xroot) + radius_pos) & 16)
    out |= 1;
  // if ((xroot + radius_pos) & 16)
  //   out |= 1;
  if (!(screenx & 7))
    *dst++ = out;
}


void tunnel(const int16_t x_pos, const int16_t y_pos, const uint16_t radius_pos) {
  byte* dst = fb;
  uint8_t  screenx, screeny;
  uint16_t xroot, yroot;
  uint32_t xsumsquares, ysumsquares, xnextsquare, ynextsquare;
  int16_t  x, y;

  // offset the origin in screen space
  x = x_pos;
  y = y_pos;
  ysumsquares = x * x + y * y;
  yroot = isqrt32(ysumsquares);
  ynextsquare = yroot * yroot;

  // Quadrant II (top-left)
  screeny = Y_SEGMENTS * 8;
  while (y < 0 && screeny) {
    screeny--;
    x = x_pos;
    screenx = X_SEGMENTS * 8;
    xsumsquares = ysumsquares;
    xroot = yroot;
    if (x < 0) {
      xnextsquare = xroot * xroot;
      while (x < 0 && screenx) {
        screenx--;
        emit_pixel(dst, radius_pos, xroot, screenx);
        for (byte i = 4; i-- && x < 0; )
          if ((xsumsquares += 2 * x++ + 1) < xnextsquare)
            xnextsquare -= 2 * xroot-- - 1;
      }
    }
    // Quadrant I (top-right)
    if (screenx) {
      xnextsquare = (xroot + 1) * (xroot + 1);
      while (screenx) {
        screenx--;
        emit_pixel(dst, radius_pos, xroot, screenx);
        for (byte i = 4; i--; )
          if ((xsumsquares += 2 * x++ + 1) >= xnextsquare)
            xnextsquare += 2 * ++xroot + 1;
      }
    }
    for (byte i = 4; i-- && y < 0; )
      if ((ysumsquares += 2 * y++ + 1) < ynextsquare)
        ynextsquare -= 2 * yroot-- - 1;
  }
  // Quadrant III (bottom-left)
  ynextsquare = (yroot + 1) * (yroot + 1);
  while (screeny) {
    screeny--;
    x = x_pos;
    screenx = X_SEGMENTS * 8;
    xsumsquares = ysumsquares;
    xroot = yroot;
    if (x < 0) {
      xnextsquare = xroot * xroot;
      while (x < 0 && screenx) {
        screenx--;
        emit_pixel(dst, radius_pos, xroot, screenx);
        for (byte i = 4; i-- && x < 0; )
          if ((xsumsquares += 2 * x++ + 1) < xnextsquare)
            xnextsquare -= 2 * xroot-- - 1;
      }
    }
    // Quadrant IV (bottom-right)
    if (screenx) {
      xnextsquare = (xroot + 1) * (xroot + 1);
      while (screenx--) {
        emit_pixel(dst, radius_pos, xroot, screenx);
        for (byte i = 4; i--; )
          if ((xsumsquares += 2 * x++ + 1) >= xnextsquare)
            xnextsquare += 2 * ++xroot + 1;
      }
    }
    for (byte i = 4; i--; )
      if ((ysumsquares += 2 * y++ + 1) >= ynextsquare)
        ynextsquare += 2 * ++yroot + 1;
  }
}
#pragma GCC pop_options