`timescale 1ns / 100ps

/* SPI slave module
 *
 * Expected configuration on the master is SPI mode 0
 * (CPOL 0 CPHA 0)
 *  - SCK idle state is 0
 *  - We latch input on positive edges of SCK
 *  - We setup output on negative edge of SCK
 * 
 * I.e., for a PIC, this means setting:
 *    CKP=0 CKE=1 SMP=0
 */
module spi_slave
(
   input		sysclk,
   input		reset,  /* Asynchronous reset, active high */
   input		_scs,   /* Chip select, active low */
   input		sdi,    /* Serial data in */
   output		sdo,    /* Serial data out, hi-z while _scs is high */
   input		sck,    /* Serial clock */
   input [7:0]		wdata,	/* Byte to shift out */
   output reg [7:0]	rdata,  /* Last byte shifted in */
   output		rx,	/* One-sysclk pulse for each new byte */
   output reg		cmd	/* High when rdata is the first byte since _scs fell */
);

	/* State clocked by sck */
	reg [2:0] bit_cnt;	/* Bits already clocked in for the current byte */
	reg [6:0] rx_shift;	/* The first seven bits of the incoming byte */
	reg [7:0] tx_shift;	/* Outgoing byte, MSB leaves first */
	reg 	  sdo_q;	/* One-bit delay stage that actually drives sdo */
	reg 	  first_byte;	/* Set while no complete byte has been seen yet */
	reg 	  byte_flag;	/* Level flag handed over to the sysclk domain */

	/* State clocked by sysclk: [0] and [1] synchronise byte_flag,
	 * [2] is a one-cycle-old copy of [1] used for edge detection.
	 */
	reg [2:0] rx_sync;

	/* True while the next rising sck edge is the eighth of the byte */
	wire last_bit = (bit_cnt == 3'b111);

	/* Bit counter and first-byte tracking. Both are forced back to
	 * their start-of-frame values for as long as _scs is high.
	 *
	 * On the last bit of a byte, "cmd" takes the current value of
	 * first_byte in the same edge that latches rdata, so the two can be
	 * sampled together by the user. cmd deliberately has no reset
	 * value here; it simply holds while _scs is high.
	 */
	always @(posedge sck or posedge _scs) begin
		if (_scs) begin
			bit_cnt    <= 3'd0;
			first_byte <= 1'b1;
		end else begin
			bit_cnt <= bit_cnt + 3'd1;
			if (last_bit) begin
				first_byte <= 1'b0;
				cmd        <= first_byte;
			end
		end
	end

	/* Receive path, sampled on the rising edge of sck while selected.
	 *
	 * Only seven bits go through the shift register: on the eighth
	 * edge sdi is combined with them and written straight to rdata.
	 * rdata is never cleared, so it stays stable for a whole byte time
	 * (and across _scs going high) for the sysclk domain to consume.
	 */
	always @(posedge sck) begin
		if (!_scs) begin
			rx_shift <= { rx_shift[5:0], sdi };
			if (last_bit) begin
				rdata <= { rx_shift[6:0], sdi };
			end
		end
	end

	/* Transmit path, updated on the falling edge of sck.
	 *
	 * wdata is loaded on the falling edge that follows the seventh
	 * rising edge, i.e. before byte_flag is raised and therefore before
	 * rx can reach the sysclk domain. Loading that early leaves the
	 * shift register one bit ahead of the wire, which sdo_q compensates
	 * for by delaying the MSB by one falling edge.
	 *
	 * While _scs is high both registers are held at all-ones.
	 */
	always @(negedge sck or posedge _scs) begin
		if (_scs) begin
			tx_shift <= 8'hff;
			sdo_q    <= 1'b1;
		end else begin
			sdo_q <= tx_shift[7];
			if (last_bit) begin
				tx_shift <= wdata;
			end else begin
				tx_shift <= tx_shift << 1;
			end
		end
	end

	/* sdo is released (hi-z) whenever the chip is not selected. There
	 * is a transient undefined state between _scs assertion and the
	 * first clock, which is ignored for now.
	 */
	assign sdo = _scs ? 1'bz : sdo_q;

	/* Generating rx is tricky because sck can (and will) stop right
	 * after the last bit of a byte.
	 *
	 * byte_flag is raised on the same edge that latches rdata and is
	 * lowered half way through the following byte (when bit_cnt is 3).
	 * If sck stops, the flag simply stays high until a later transfer
	 * gets that far. Because the gap between two transfers is
	 * unpredictable, the flag is intentionally NOT cleared by _scs; the
	 * input latch is left alone for the same reason.
	 *
	 * The flag is reset asynchronously from the sysclk-domain reset,
	 * which strictly speaking deserves synchronisation of its own. The
	 * assumption here is that reset is released long enough before sck
	 * starts toggling. A more complex system may need to mask sck until
	 * a few clocks after reset completes to avoid metastability.
	 *
	 * Note: on an FPGA one could probably just rely on the flag being 0
	 * at power-up.
	 */
	always @(posedge sck or posedge reset) begin
		if (reset) begin
			byte_flag <= 1'b0;
		end else begin
			case (bit_cnt)
				3'b111:  byte_flag <= 1'b1;
				3'b011:  byte_flag <= 1'b0;
				default: ; /* hold */
			endcase
		end
	end

	/* Bring byte_flag into the sysclk domain through two flip-flops,
	 * plus a third stage that remembers the previous synchronised value.
	 */
	always @(posedge sysclk or posedge reset) begin
		if (reset) begin
			rx_sync <= 3'b000;
		end else begin
			rx_sync <= { rx_sync[1:0], byte_flag };
		end
	end

	/* Rising-edge detect on the synchronised flag: high for one sysclk */
	assign rx = rx_sync[1] & ~rx_sync[2];
endmodule