from amaranth import Elaboratable, Module, Signal
from amaranth.lib.data import ArrayLayout, StructLayout
from amaranth.back import rtlil, verilog
from amaranth.sim import Simulator

from collections import deque
import subprocess
import re
import sys


class RunnerError(Exception):
    """Raised when the external Yosys run exits with a non-zero status."""


class NaiveMul(Elaboratable):
    """Purely combinational multiplier: ``o = a * b``.

    Attributes:
        a, b: ``width``-bit operand signals.
        o: ``2*width``-bit product signal.
    """

    def __init__(self, width=8):
        # NOTE: Amaranth derives each signal's name from the attribute it is
        # assigned to, so these stay as direct assignments.
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(2 * width)

    def elaborate(self, platform):
        """Return a module that drives ``o`` combinationally from ``a * b``."""
        module = Module()

        product = self.a * self.b
        module.d.comb += self.o.eq(product)

        return module


class PipelinedMul(Elaboratable):
    """Shift-and-add multiplier with one registered stage per bit of ``b``.

    Stage ``i`` adds ``(a * b[i]) << i`` to the partial sum of stage
    ``i - 1``; the operands travel down the pipeline alongside the partial
    sums. ``o`` is driven combinationally from the last stage's register, so
    there are ``width`` register stages between the inputs and ``o``.

    Args:
        width: Bit width of each operand; the product is ``2*width`` bits.
        debug: When true, extra probe signals are created for stages
            ``1 .. width-1`` during elaboration.

    Attributes:
        a, b: ``width``-bit operand signals.
        o: ``2*width``-bit product signal.
        pin, pout: The operand and partial-sum pipeline registers. These only
            exist after ``elaborate()`` has run.
    """

    def __init__(self, width=8, debug=False):
        self.width = width
        self.a = Signal(self.width)
        self.b = Signal(self.width)
        self.o = Signal(2*self.width)
        self.debug = debug

    #                   x7x6x5x4x3x2x1x0
    #                 * y7y6y5y4y3y2y1y0
    #                   ----------------
    #                   x7x6x5x4x3x2x1x0: z0 = x7x6x5x4x3x2x1x0*y0
    #                 x7x6x5x4x3x2x1x0: z1 = (x7x6x5x4x3x2x1x0*y1 << 1) + z0
    #               x7x6x5x4x3x2x1x0: z2 = (x7x6x5x4x3x2x1x0*y2 << 2) + z1
    #             x7x6x5x4x3x2x1x0: z3 = (x7x6x5x4x3x2x1x0*y3 << 3) + z2
    #           x7x6x5x4x3x2x1x0: z4 = (x7x6x5x4x3x2x1x0*y4 << 4) + z3
    #         x7x6x5x4x3x2x1x0: z5 = (x7x6x5x4x3x2x1x0*y5 << 5) + z4
    #       x7x6x5x4x3x2x1x0: z6 = (x7x6x5x4x3x2x1x0*y6 << 6) + z5
    #   + x7x6x5x4x3x2x1x0: z7 = (x7x6x5x4x3x2x1x0*y7 << 7) + z6

    def elaborate(self, platform):
        """Build the pipeline and return the resulting module."""
        def probe_pipeline_stage(i):
            # Creates named combinational copies of stage i's signals so they
            # are easy to find in waveforms/netlists. Probe widths follow
            # self.width instead of being fixed at 8/16 bits, so they are
            # not truncated or padded for other operand widths.
            #
            # The local names of the Signals below are kept as-is on purpose:
            # Amaranth derives the signal names from them.
            if i == 0:
                stage_out = Signal(2*self.width)
                m.d.comb += stage_out.eq(pipeline_out[0])
                # Expose the pipeline registers to testbenches.
                self.pin = pipeline_in
                self.pout = pipeline_out
            else:
                stage_ina = Signal(self.width)
                stage_inb = Signal(self.width)
                m.d.comb += stage_ina.eq(self.pin[i - 1].a)
                m.d.comb += stage_inb.eq(self.pin[i - 1].b)

                stage_out = Signal(2*self.width)
                m.d.comb += stage_out.eq(self.pout[i])

        m = Module()

        # One (a, b) operand pair per pipeline stage.
        pipeline_in = Signal(ArrayLayout(
            StructLayout(members={
                "a": self.a.shape(),
                "b": self.b.shape()
            }),
            self.width))

        # Relies on the optimizer to realize that not all 2*self.width^2
        # bits are actually used (we only need 80% of them for width=8).
        pipeline_out = Signal(ArrayLayout(self.width*2, self.width))

        # Called regardless of self.debug: this is what publishes
        # self.pin/self.pout (and it also creates the stage-0 output probe).
        probe_pipeline_stage(0)

        # Stage 0: latch the operands and the first partial product.
        m.d.sync += [
            pipeline_in[0].a.eq(self.a),
            pipeline_in[0].b.eq(self.b),
        ]

        m.d.sync += pipeline_out[0].eq(self.a * self.b[0])

        for i in range(1, self.width):
            if self.debug:
                probe_pipeline_stage(i)

            # Pass the operands on to the next stage unchanged.
            m.d.sync += pipeline_in[i].eq(pipeline_in[i - 1])
            # This relies on the optimizer realizing we're doing a mul by a
            # 1 bit number (pipeline_in[i - 1].b[i]) with leading zeros.
            m.d.sync += pipeline_out[i].eq(((pipeline_in[i - 1].a *
                                             pipeline_in[i - 1].b[i]) << i) +
                                           pipeline_out[i - 1])

        # The last stage's partial sum is the full product.
        m.d.comb += self.o.eq(pipeline_out[self.width - 1])

        return m


def stats(m):
    """Synthesize ``m`` with Yosys and print the resulting statistics.

    ``m`` is converted to RTLIL with ``m.a``, ``m.b`` and ``m.o`` as ports
    and fed, together with a synthesis script, to a ``yosys -`` subprocess on
    stdin. Of the Yosys log, only the lines from the "Printing statistics."
    header up to (but not including) the "End of script." line are printed.

    Raises:
        RunnerError: If Yosys exits with a non-zero return code. The message
            is the stripped stderr output of Yosys.
    """
    rtlil_text = rtlil.convert(m, ports=[m.a, m.b, m.o])

    # Created from a combination of amaranth._toolchain.yosys and
    # amaranth.back.verilog. Script comes from nextpnr-generic.
    commands = [
        "read_ilang <<rtlil\n{}\nrtlil".format(rtlil_text),
        "hierarchy -check",
        "proc",
        "flatten",
        "tribuf -logic",
        "deminout",
        "synth -run coarse",
        "memory_map",
        "opt -full",
        "techmap -map +/techmap.v",
        "opt -fast",
        "dfflegalize -cell $_DFF_P_ 0",
        "abc -lut 4 -dress",
        "clean -purge",
        # NOTE(review): "show" is always issued; it looks like it was meant
        # to become optional (a "show" switch) at some point.
        "show",
        "hierarchy -check",
        "stat",
    ]

    result = subprocess.run(["yosys", "-"],
                            input="\n".join(commands),
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            encoding="utf-8")
    if result.returncode:
        raise RunnerError(result.stderr.strip())

    # Both patterns are anchored at the start of a line via match().
    stats_header = re.compile(r"[\d.]+ Printing statistics.")
    script_footer = re.compile(r"End of script.")

    printing = False
    for line in result.stdout.split("\n"):
        if stats_header.match(line):
            printing = True

        if script_footer.match(line):
            printing = False

        if printing:
            print(line)


def pipe_tb(m):
    """Exhaustively simulate the pipelined multiplier ``m``.

    Every ``(a, b)`` combination of ``m.width``-bit operands is applied, one
    per clock tick, and ``m.o`` is compared against the product of the
    operands applied ``m.width`` ticks earlier. After the last input has
    been applied the pipeline is drained so that the final ``m.width``
    operand pairs are checked as well. Waveforms are written to ``pipe.vcd``
    / ``pipe.gtkw``.

    Args:
        m: The multiplier under test; must provide ``a``, ``b``, ``o`` and
            ``width``.

    Raises:
        AssertionError: If an output does not match the expected product.
    """
    def testbench():
        # Pipeline previous inputs... inputs to prev.append() are what
        # will go into the multiplier at the next active edge.
        # Outputs from prev.popleft()  are what went into the multiplier
        # "m.width - 1" active edges ago. This leads to a latency of
        # "m.width" clock cycles/ticks since the multiplier saw the inputs.
        prev = deque([(0, 0)]*m.width)

        for a in range(0, 2**m.width):
            print(a)  # Progress indicator; the full sweep takes a while.
            yield m.a.eq(a)
            for b in range(0, 2**m.width):
                yield m.b.eq(b)

                yield
                (a_c, b_c) = prev.popleft()
                prev.append((a, b))

                out = yield m.o
                assert a_c*b_c == out, \
                    f"{a_c}*{b_c}: expected {a_c*b_c}, got {out}"

                # Debug aids (m.pin/m.pout exist once m is elaborated):
                # print((a, b), (a_c, b_c), a_c*b_c, (yield m.o))
                # for i in range(8):
                #     print(f"{yield m.pin[i].a:08b}, {yield m.pin[i].b:08b}")
                # for i in range(8):
                #      print(f"{yield m.pout[i]:016b}")

        # Drain: the last m.width operand pairs are still in flight. Keep
        # clocking (inputs stay at their final values) and check each of
        # them; without this they would never be verified.
        while prev:
            yield
            (a_c, b_c) = prev.popleft()

            out = yield m.o
            assert a_c*b_c == out, \
                f"{a_c}*{b_c}: expected {a_c*b_c}, got {out}"

    sim = Simulator(m)

    sim.add_sync_process(testbench)
    sim.add_clock(12e-6)

    with sim.write_vcd("pipe.vcd", "pipe.gtkw"):
        sim.run()


if __name__ == "__main__":
    # Usage: <script> [sim|stats]
    #   sim    - run the exhaustive simulation testbench
    #   stats  - synthesize with Yosys and print resource statistics
    #   (anything else / nothing) - print the generated Verilog
    action = sys.argv[1] if len(sys.argv) > 1 else None

    m = PipelinedMul()
    if action == "sim":
        pipe_tb(m)
    elif action == "stats":
        stats(m)
    else:
        print(verilog.convert(m, ports=[m.a, m.b, m.o]))