// Slide deck: "Chiplet Technology and the Impact of NUMA on Applications".
// Built with Polylux (slides), CeTZ (plot), tablex (table), metro (units)
// and the project-local TUM theme.
//
// NOTE(review): every `cite(...)` call in this file is missing its positional
// key argument (the bibliography label). The keys cannot be recovered from
// this file alone -- restore them from lib.bib before compiling.

#import "@preview/polylux:0.3.1": *
#import "@preview/cetz:0.1.2": canvas, plot
#import "@preview/tablex:0.0.6": tablex, rowspanx, colspanx, hlinex, cellx
#import "@preview/metro:0.1.0": *
#import units: *
#import "../src/tumtheme.typ": *

#show: tum-theme.with(
  footer: [Alex Daichendt -- Chiplet Technology and the Impact of NUMA on Applications],
)

// Footnote helper: wraps `body` in a footnote set at 8pt.
#let ftnt(body) = footnote(text(size: 8pt, body))

// NOTE(review): `title` is not referenced anywhere in the visible file --
// confirm it is unused before removing it.
#let title = "TUM test slides"

#title-slide(
  title: "Chiplet Technology and the Impact of NUMA on Applications",
  authors: "Alex Daichendt",
  chair: "IN2076 Advanced Computer Architecture"
)

// Unit suffix appended to the technology-node labels of the cost plot.
#let nm = unit("nano meter")

#slide()[
  = Intel Press Workshops June 2017 #ftnt(link("https://www.techpowerup.com/235092/intel-says-amd-epyc-processors-glued-together-in-official-slide-deck", "TechPowerUp; no primary source available"))
  #v(1cm)
  #align(center, image(width: 100%, "./figures/intel_slide1.jpg"))
]

#slide()[
  = Why chiplets?
  #v(2cm)
  - Moore's Law
  - more flexibility in design
  #pause
  - low production yield for monolithic dies \ #sym.arrow.r \$\$\$
  #v(2cm)
  // Tick labels for the x axis; the x value of a data point is the index
  // into this array.
  #let node-labels = (
    "45" + nm, "32" + nm, "28" + nm, "20" + nm,
    "14" + nm, "10" + nm, "7" + nm, "5" + nm,
  )
  #figure(
    canvas(length: 1.5cm, {
      plot.plot(
        size: (10, 4),
        x-tick-step: 1,
        y-tick-step: 1,
        y-max: 6.0,
        x-min: 0,
        y-min: 0,
        x-grid: true,
        y-grid: true,
        y-label: "Normalized Cost",
        x-label: none,
        // Replace the numeric tick value by the matching node label.
        x-format: value => node-labels.at(int(value)),
        {
          plot.add(
            mark: "triangle",
            mark-size: 0.1cm,
            // There is no data point at x = 5 (the "10 nm" label).
            ((0, 1), (1, 1.5), (2, 1.7), (3, 1.95), (4, 2.1), (6, 3.8), (7, 4.95))
          )
        }
      )
    }),
    caption: [Normalized cost per chip vs. technology node, based on Naffziger et al.#ftnt(cite(form: "full", ))]
  )
]

#slide()[
  = AMD Naples (1#super[st] Gen. EPYC) -- NUMA Topology #ftnt(link("https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56308-numa-topology-for-epyc-naples-family-processors.pdf"))
  #v(1cm)
  #figure(image(width: 100%, "./figures/naples.jpg"))
]

#slide()[
  = AMD Naples (1#super[st] Gen. EPYC)
  #figure(
    image(width: 90%, "./figures/naples-multilayerpackaging.jpg"),
    caption: [Multi-layer package routing, DDR (red), IO (orange), infinity-fabric (blue) #ftnt(cite(form: "full", ))]
  )
]

#slide()[
  = AMD Rome (2#super[nd] Gen. EPYC) #ftnt(link("https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/tuning-guides/amd-epyc-7002-tg-hpc-56827.pdf"))
  #v(1cm)
  #figure(image(width: 55%, "./figures/rome.jpg"))
]

#slide()[
  = Memory Access Latencies for Naples and Rome, Naffziger et al.#ftnt(cite(form: "full", ))
  #v(1cm)
  #align(center, image(width: 100%, "./figures/naples-vs-rome.jpg"))
]

#slide()[
  = Impact of NUMA on Applications
  #v(2cm)
  #figure(image(width: 100%, "./figures/userspacenetworkingdrivers.png"))
]

#slide()[
  = Impact of NUMA on Applications
  == Emmerich et al. #ftnt(cite(form: "full", )) -- User Space Networking Drivers
  #v(2cm)
  #figure(
    tablex(
      columns: 5,
      inset: 10pt,
      header-rows: 1,
      auto-vlines: false,
      auto-lines: false,
      // Header row, separated from the body by a horizontal rule.
      [Ingress NIC], [Egress NIC], [CPU], [Memory], [Throughput],
      hlinex(),
      [Node 0], [Node 0], [Node 0], [Node 0], [10.8M pps],
      [Node 0], [Node 0], [Node 0], [Node 1], [10.8M pps],
      [Node 0], [Node 0], [Node 1], [Node 0], [7.6M pps],
      // Lowest throughput in the table, highlighted in red.
      [Node 0], [Node 0], [Node 1], [Node 1], cellx(fill: red)[6.6M pps],
      [Node 0], [Node 1], [Node 0], [Node 0], [7.9M pps],
      [Node 0], [Node 1], [Node 0], [Node 1], [10.0M pps],
      [Node 0], [Node 1], [Node 1], [Node 0], [8.6M pps],
      [Node 0], [Node 1], [Node 1], [Node 1], [8.1M pps],
      hlinex()
    ),
    caption: [Forwarding performance in packets per second, columns indicate the pinning of each resource, based on Emmerich et al. #cite()]
  )
]

#slide()[
  = Impact of NUMA on Applications
  #v(2cm)
  #figure(image(width: 100%, "./figures/talesoftail.png"))
]

#slide()[
  = Impact of NUMA on Applications
  == Li et al. #ftnt(cite(form: "full", )) -- Memcached
  #v(2cm)
  #figure(
    image(width: 60%, "./figures/talesoftail_diagram.png"),
    caption: [Memcached tail latency; 2 sockets; two instances (green), one instance (blue), \ based on Li et al. #cite()]
  )
]

#slide()[
  = Conclusion
  #v(2cm)
  - Chiplet technology is a fundamental part of future CPU architectures
  - Inconsistent memory access latencies are a challenge for applications
  - CPU architecture matters
]

#slide()[
  #bibliography("lib.bib")
]