#import "@preview/polylux:0.3.1": *
#import "@preview/cetz:0.1.2": canvas, plot
#import "@preview/tablex:0.0.6": tablex, rowspanx, colspanx, hlinex, cellx

#import "@preview/metro:0.1.0": *
#import units:*

#import "../src/tumtheme.typ": *

// Apply `tum-theme` to the whole document via a show rule, passing the footer
// content below as its `footer` argument.
// NOTE(review): `tum-theme` is presumably defined in ../src/tumtheme.typ; how
// the footer is rendered is decided there — confirm.
#show: tum-theme.with(
    footer: [Alex Daichendt -- Chiplet Technology and the Impact of NUMA on Applications],
)

// Footnote helper: sets `body` in 8pt type and wraps it in a footnote.
// Used on the slides for source links and full-form citations.
#let ftnt(body) = {
  let small = text(size: 8pt, body)
  footnote(small)
}


// NOTE(review): `title` is not referenced anywhere else in the visible file
// (the title slide passes its own string literal) — looks like a leftover;
// confirm before removing.
#let title = "TUM test slides"

#title-slide(
  title: "Chiplet Technology and the Impact of NUMA on Applications", 
  authors: "Alex Daichendt", 
  chair: "IN2076 Advanced Computer Architecture"
)

// Nanometre unit built with `unit` (presumably from the metro import above);
// appended to the technology-node numbers that label the cost plot's x-axis.
#let nm = unit("nano meter")

#slide()[
  = Intel Press Workshops June 2017 #ftnt(link("https://www.techpowerup.com/235092/intel-says-amd-epyc-processors-glued-together-in-official-slide-deck", "TechPowerUp; no primary source available"))

  #v(1cm)
  #align(center, image(width: 100%, "./figures/intel_slide1.jpg"))
]

#slide()[
  = Why chiplets?
  #v(2cm)

  - Moore's Law 
  - more flexibility in design
  #pause
  - low production yield for monolithic dies  \
    #sym.arrow.r \$\$\$

  #v(2cm)
  // X-axis tick labels, indexed by the integer x-coordinate of the plot data
  // (0 -> 45 nm, ..., 7 -> 5 nm). Kept under its own name so the `nm` unit
  // defined at file level is not rebound to an array inside this slide.
  #let node-labels = ("45", "32", "28", "20", "14", "10", "7", "5").map(n => n + nm)
  #figure(
    canvas(length: 1.5cm, {
      plot.plot(size: (10, 4),
        x-tick-step: 1,
        y-tick-step: 1,
        y-max: 6.0,
        x-min: 0,
        y-min: 0,
        x-grid: true,
        y-grid: true,
        y-label: "Normalized Cost",
        x-label: none,
        // Ticks arrive as numbers; round before converting so that a value
        // such as 2.9999 still selects label 3 instead of being truncated to 2.
        x-format: value => node-labels.at(int(calc.round(value))),
        {
          // (node index, normalized cost) samples.
          // NOTE(review): there is no sample for index 5 (10 nm) — confirm
          // this matches the figure the data was taken from.
          plot.add(
            mark: "triangle",
            mark-size: 0.1cm,
            ((0,1),(1,1.5), (2,1.7), (3,1.95), (4, 2.1), (6,3.8), (7, 4.95))
            )
          
        })
    }), 
    caption: [Normalized cost per chip vs. technology node, based on Naffziger et al.#ftnt(cite(form: "full", <Naffziger2021>))])

]

#slide()[
= AMD Naples (1#super[st] Gen. EPYC) -- NUMA Topology #ftnt(link("https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56308-numa-topology-for-epyc-naples-family-processors.pdf"))
  #v(1cm)
  #figure(image(width: 100%, "./figures/naples.jpg"))
]

#slide()[
= AMD Naples (1#super[st] Gen. EPYC) 
  #figure(image(width: 90%, "./figures/naples-multilayerpackaging.jpg"), 
  caption: [Multi-layer package routing, DDR (red), IO (orange), infinity-fabric (blue) #ftnt(cite(form: "full", <Naffziger2021>))])
]

#slide()[
= AMD Rome (2#super[nd] Gen. EPYC) #ftnt(link("https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/tuning-guides/amd-epyc-7002-tg-hpc-56827.pdf"))
  #v(1cm)
  #figure(image(width: 55%, "./figures/rome.jpg"))
]

#slide()[
  = Memory Access Latencies for Naples and Rome, Naffziger et al.#ftnt(cite(form: "full", <Naffziger2020>))
  #v(1cm)
  #align(center, 
      image(width: 100%, "./figures/naples-vs-rome.jpg")
  )
]

#slide()[
  = Impact of NUMA on Applications

  #v(2cm)
  #figure(image(width: 100%, "./figures/userspacenetworkingdrivers.png"))
]

#slide()[
  = Impact of NUMA on Applications
  == Emmerich et al. #ftnt(cite(form: "full", <Emmerich2018>)) -- User Space Networking Drivers
  #v(2cm)
  // One row per placement of egress NIC, CPU and memory on node 0 / node 1
  // (the ingress NIC is on node 0 in every row), with the resulting throughput.
  // The caption belongs to the enclosing figure; `tablex` has no caption
  // parameter, so none is passed to it.
  #figure(
    tablex(
      columns: 5,
      inset: 10pt,
      header-rows: 1,
      auto-vlines: false,
      auto-lines: false,
      [Ingress NIC], [Egress NIC], [CPU], [Memory], [Throughput],
      hlinex(),
      [Node 0], [Node 0], [Node 0], [Node 0], [10.8M pps],
      [Node 0], [Node 0], [Node 0], [Node 1], [10.8M pps],
      [Node 0], [Node 0], [Node 1], [Node 0], [7.6M pps],
      // Lowest throughput in the table, highlighted in red.
      [Node 0], [Node 0], [Node 1], [Node 1], cellx(fill:red)[6.6M pps],
      [Node 0], [Node 1], [Node 0], [Node 0], [7.9M pps],
      [Node 0], [Node 1], [Node 0], [Node 1], [10.0M pps],
      [Node 0], [Node 1], [Node 1], [Node 0], [8.6M pps],
      [Node 0], [Node 1], [Node 1], [Node 1], [8.1M pps],
      hlinex()

    ),
  caption: [Forwarding performance in packets per second, columns indicate the pinning of each resource, based on Emmerich et al. #cite(<Emmerich2018>)]
  )
]

#slide()[
  = Impact of NUMA on Applications

  #v(2cm)
  #figure(image(width: 100%, "./figures/talesoftail.png"))
]

#slide()[
  = Impact of NUMA on Applications
  == Li et al. #ftnt(cite(form: "full", <Li2014>)) -- Memcached
  #v(2cm)
  #figure(
    image(width: 60%, "./figures/talesoftail_diagram.png"),
    caption: [Memcached tail latency; 2 sockets; two instances (green), one instance (blue), \ 
    based on Li et al. #cite(<Li2014>)]
  )
]

#slide()[
  = Conclusion
  #v(2cm)

  - Chiplet technology is a fundamental part of future CPU architectures
  - Inconsistent memory access latencies are a challenge for applications
  - CPU architecture matters

]

#slide()[
#bibliography("lib.bib")
]