165 lines
4.6 KiB
Typst
165 lines
4.6 KiB
Typst
#import "@preview/polylux:0.3.1": *
|
|
#import "@preview/cetz:0.1.2": canvas, plot
|
|
#import "@preview/tablex:0.0.6": tablex, rowspanx, colspanx, hlinex, cellx
|
|
|
|
#import "@preview/metro:0.1.0": *
|
|
#import units:*
|
|
|
|
#import "../src/tumtheme.typ": *
|
|
|
|
// Apply `tum-theme` to the rest of the document. The footer text repeats the
// author and talk title that are also passed to `#title-slide` further down.
#show: tum-theme.with(
|
|
footer: [Alex Daichendt -- Chiplet Technology and the Impact of NUMA on Applications],
|
|
)
|
|
|
|
// Footnote helper: sets `body` in 8pt text and wraps the result in a footnote.
#let ftnt(body) = {
  let small-body = text(size: 8pt, body)
  footnote(small-body)
}
|
|
|
|
|
|
// NOTE(review): `title` is not referenced anywhere in the visible source (the
// title slide passes its own string literal) — confirm whether it is still needed.
#let title = "TUM test slides"
|
|
|
|
// Opening slide: talk title, author, and the text passed as `chair`.
#title-slide(
|
|
title: "Chiplet Technology and the Impact of NUMA on Applications",
|
|
authors: "Alex Daichendt",
|
|
chair: "IN2076 Advanced Computer Architecture"
|
|
)
|
|
|
|
// Nanometre unit label; the cost-plot slide appends it to each technology-node number.
#let nm = unit("nano meter")
|
|
|
|
#slide()[
|
|
= Intel Press Workshops June 2017 #ftnt(link("https://www.techpowerup.com/235092/intel-says-amd-epyc-processors-glued-together-in-official-slide-deck", "TechPowerUp; no primary source available"))
|
|
|
|
#v(1cm)
|
|
#align(center, image(width: 100%, "./figures/intel_slide1.jpg"))
|
|
]
|
|
|
|
#slide()[
|
|
= Why chiplets?
|
|
#v(2cm)
|
|
|
|
- Moore's Law
|
|
- more flexibility in design
|
|
#pause
|
|
- low production yield for monolithic dies \
|
|
#sym.arrow.r \$\$\$
|
|
|
|
#v(2cm)
|
|
// x-axis tick labels: one technology node per integer x position, each number
// followed by the `nm` unit label.
#let node-labels = ("45", "32", "28", "20", "14", "10", "7", "5").map(n => n + nm)
// Plotted points as (index into `node-labels`, normalized cost).
// NOTE(review): there is no point for index 5 (the 10 nm label) — confirm this is intended.
#let cost-points = ((0, 1), (1, 1.5), (2, 1.7), (3, 1.95), (4, 2.1), (6, 3.8), (7, 4.95))
#figure(
  canvas(length: 1.5cm, {
    plot.plot(
      size: (10, 4),
      x-min: 0,
      x-tick-step: 1,
      x-grid: true,
      x-label: none,
      // Replace each numeric tick value with the matching node label.
      x-format: tick => node-labels.at(int(tick)),
      y-min: 0,
      y-max: 6.0,
      y-tick-step: 1,
      y-grid: true,
      y-label: "Normalized Cost",
      {
        plot.add(cost-points, mark: "triangle", mark-size: 0.1cm)
      },
    )
  }),
  caption: [Normalized cost per chip vs. technology node, based on Naffziger et al.#ftnt(cite(form: "full", <Naffziger2021>))],
)
|
|
|
|
]
|
|
|
|
#slide()[
|
|
= AMD Naples (1#super[st] Gen. EPYC) -- NUMA Topology #ftnt(link("https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56308-numa-topology-for-epyc-naples-family-processors.pdf"))
|
|
#v(1cm)
|
|
#figure(image(width: 100%, "./figures/naples.jpg"))
|
|
]
|
|
|
|
#slide()[
|
|
= AMD Naples (1#super[st] Gen. EPYC)
|
|
#figure(image(width: 90%, "./figures/naples-multilayerpackaging.jpg"),
|
|
caption: [Multi-layer package routing, DDR (red), IO (orange), infinity-fabric (blue) #ftnt(cite(form: "full", <Naffziger2021>))])
|
|
]
|
|
|
|
#slide()[
|
|
= AMD Rome (2#super[nd] Gen. EPYC) #ftnt(link("https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/tuning-guides/amd-epyc-7002-tg-hpc-56827.pdf"))
|
|
#v(1cm)
|
|
#figure(image(width: 55%, "./figures/rome.jpg"))
|
|
]
|
|
|
|
#slide()[
|
|
= Memory Access Latencies for Naples and Rome, Naffziger et al.#ftnt(cite(form: "full", <Naffziger2020>))
|
|
#v(1cm)
|
|
#align(center,
|
|
image(width: 100%, "./figures/naples-vs-rome.jpg")
|
|
)
|
|
]
|
|
|
|
#slide()[
|
|
= Impact of NUMA on Applications
|
|
|
|
#v(2cm)
|
|
#figure(image(width: 100%, "./figures/userspacenetworkingdrivers.png"))
|
|
]
|
|
|
|
#slide()[
|
|
= Impact of NUMA on Applications
|
|
== Emmerich et al. #ftnt(cite(form: "full", <Emmerich2018>)) -- User Space Networking Drivers
|
|
#v(2cm)
|
|
// Forwarding throughput for each combination of NIC, CPU and memory placement.
// The caption belongs to the enclosing `figure`; the table itself takes none.
#figure(
  tablex(
    columns: 5,
    inset: 10pt,
    header-rows: 1,
    // Automatic grid lines are switched off; only the two explicit `hlinex()`
    // rules below are requested.
    auto-vlines: false,
    auto-lines: false,
    [Ingress NIC], [Egress NIC], [CPU], [Memory], [Throughput],
    hlinex(),
    [Node 0], [Node 0], [Node 0], [Node 0], [10.8M pps],
    [Node 0], [Node 0], [Node 0], [Node 1], [10.8M pps],
    [Node 0], [Node 0], [Node 1], [Node 0], [7.6M pps],
    // The lowest throughput in the table is highlighted in red.
    [Node 0], [Node 0], [Node 1], [Node 1], cellx(fill: red)[6.6M pps],
    [Node 0], [Node 1], [Node 0], [Node 0], [7.9M pps],
    [Node 0], [Node 1], [Node 0], [Node 1], [10.0M pps],
    [Node 0], [Node 1], [Node 1], [Node 0], [8.6M pps],
    [Node 0], [Node 1], [Node 1], [Node 1], [8.1M pps],
    hlinex(),
  ),
  caption: [Forwarding performance in packets per second; columns indicate the pinning of each resource, based on Emmerich et al. #cite(<Emmerich2018>)],
)
|
|
]
|
|
|
|
#slide()[
|
|
= Impact of NUMA on Applications
|
|
|
|
#v(2cm)
|
|
#figure(image(width: 100%, "./figures/talesoftail.png"))
|
|
]
|
|
|
|
#slide()[
|
|
= Impact of NUMA on Applications
|
|
== Li et al. #ftnt(cite(form: "full", <Li2014>)) -- Memcached
|
|
#v(2cm)
|
|
#figure(
|
|
image(width: 60%, "./figures/talesoftail_diagram.png"),
|
|
caption: [Memcached tail latency; 2 sockets; two instances (green), one instance (blue), \
|
|
based on Li et al. #cite(<Li2014>)]
|
|
)
|
|
]
|
|
|
|
#slide()[
|
|
= Conclusion
|
|
#v(2cm)
|
|
|
|
- Chiplet technology is a fundamental part of future CPU architectures
|
|
- Inconsistent memory access latencies are a challenge for applications
|
|
- CPU architecture matters
|
|
|
|
]
|
|
|
|
#slide()[
|
|
#bibliography("lib.bib")
|
|
]
|
|
|