| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409 |
- // Copyright (c) Tailscale Inc & AUTHORS
- // SPDX-License-Identifier: BSD-3-Clause
- // Create two wgengine instances and pass data through them, measuring
- // throughput, latency, and packet loss.
- package main
- import (
- "bufio"
- "io"
- "log"
- "net"
- "net/http"
- "net/http/pprof"
- "net/netip"
- "os"
- "strconv"
- "sync"
- "time"
- "tailscale.com/types/logger"
- )
// Packet sizing constants. main passes PayloadSize+ICMPMinSize as the
// size argument to TrafficGen.Start.
const (
	// PayloadSize is the payload size used for the benchmark traffic.
	PayloadSize = 1000
	// ICMPMinSize is added on top of PayloadSize when starting traffic.
	// NOTE(review): presumably the minimum ICMP packet overhead — confirm
	// against the traffic generator.
	ICMPMinSize = 24
)

// Addr1 and Addr2 are the two /32 prefixes whose addresses main hands to
// TrafficGen.Start (and, for the wireguard test, to setupWGTest).
var (
	Addr1 = netip.MustParsePrefix("100.64.1.1/32")
	Addr2 = netip.MustParsePrefix("100.64.1.2/32")
)
- func main() {
- var logf logger.Logf = log.Printf
- log.SetFlags(0)
- debugMux := newDebugMux()
- go runDebugServer(debugMux, "0.0.0.0:8999")
- mode, err := strconv.Atoi(os.Args[1])
- if err != nil {
- log.Fatalf("%q: %v", os.Args[1], err)
- }
- traf := NewTrafficGen(nil)
- // Sample test results below are using GOMAXPROCS=2 (for some
- // tests, including wireguard-go, higher GOMAXPROCS goes slower)
- // on apenwarr's old Linux box:
- // Intel(R) Core(TM) i7-4785T CPU @ 2.20GHz
- // My 2019 Mac Mini is about 20% faster on most tests.
- switch mode {
- // tx=8786325 rx=8786326 (0 = 0.00% loss) (70768.7 Mbits/sec)
- case 1:
- setupTrivialNoAllocTest(logf, traf)
- // tx=6476293 rx=6476293 (0 = 0.00% loss) (52249.7 Mbits/sec)
- case 2:
- setupTrivialTest(logf, traf)
- // tx=1957974 rx=1958379 (0 = 0.00% loss) (15939.8 Mbits/sec)
- case 11:
- setupBlockingChannelTest(logf, traf)
- // tx=728621 rx=701825 (26620 = 3.65% loss) (5525.2 Mbits/sec)
- // (much faster on macOS??)
- case 12:
- setupNonblockingChannelTest(logf, traf)
- // tx=1024260 rx=941098 (83334 = 8.14% loss) (7516.6 Mbits/sec)
- // (much faster on macOS??)
- case 13:
- setupDoubleChannelTest(logf, traf)
- // tx=265468 rx=263189 (2279 = 0.86% loss) (2162.0 Mbits/sec)
- case 21:
- setupUDPTest(logf, traf)
- // tx=1493580 rx=1493580 (0 = 0.00% loss) (12210.4 Mbits/sec)
- case 31:
- setupBatchTCPTest(logf, traf)
- // tx=134236 rx=133166 (1070 = 0.80% loss) (1088.9 Mbits/sec)
- case 101:
- setupWGTest(nil, logf, traf, Addr1, Addr2)
- default:
- log.Fatalf("provide a valid test number (0..n)")
- }
- logf("initialized ok.")
- traf.Start(Addr1.Addr(), Addr2.Addr(), PayloadSize+ICMPMinSize, 0)
- var cur, prev Snapshot
- var pps int64
- i := 0
- for {
- i += 1
- time.Sleep(10 * time.Millisecond)
- if (i % 100) == 0 {
- prev = cur
- cur = traf.Snap()
- d := cur.Sub(prev)
- if prev.WhenNsec == 0 {
- logf("tx=%-6d rx=%-6d", d.TxPackets, d.RxPackets)
- } else {
- logf("%v @%7d pkt/s", d, pps)
- }
- }
- pps = traf.Adjust()
- }
- }
// newDebugMux returns a fresh ServeMux with the net/http/pprof handlers
// (index, cmdline, profile, symbol and trace) registered under
// /debug/pprof/.
func newDebugMux() *http.ServeMux {
	routes := []struct {
		pattern string
		handler func(http.ResponseWriter, *http.Request)
	}{
		{"/debug/pprof/", pprof.Index},
		{"/debug/pprof/cmdline", pprof.Cmdline},
		{"/debug/pprof/profile", pprof.Profile},
		{"/debug/pprof/symbol", pprof.Symbol},
		{"/debug/pprof/trace", pprof.Trace},
	}

	mux := http.NewServeMux()
	for _, r := range routes {
		mux.HandleFunc(r.pattern, r.handler)
	}
	return mux
}
// runDebugServer serves mux over HTTP on addr. It blocks for as long as
// the server is running, and terminates the process via log.Fatal if
// ListenAndServe returns an error.
func runDebugServer(mux *http.ServeMux, addr string) {
	srv := new(http.Server)
	srv.Addr = addr
	srv.Handler = mux

	err := srv.ListenAndServe()
	if err == nil {
		return
	}
	log.Fatal(err)
}
- // The absolute minimal test of the traffic generator: have it fill
- // a packet buffer, then absorb it again. Zero packet loss.
- func setupTrivialNoAllocTest(logf logger.Logf, traf *TrafficGen) {
- go func() {
- b := make([]byte, 1600)
- for {
- n := traf.Generate(b, 16)
- if n == 0 {
- break
- }
- traf.GotPacket(b[0:n+16], 16)
- }
- }()
- }
- // Almost the same, but this time allocate a fresh buffer each time
- // through the loop. Still zero packet loss. Runs about 2/3 as fast for me.
- func setupTrivialTest(logf logger.Logf, traf *TrafficGen) {
- go func() {
- for {
- b := make([]byte, 1600)
- n := traf.Generate(b, 16)
- if n == 0 {
- break
- }
- traf.GotPacket(b[0:n+16], 16)
- }
- }()
- }
- // Pass packets through a blocking channel between sender and receiver.
- // Still zero packet loss since the sender stops when the channel is full.
- // Max speed depends on channel length (I'm not sure why).
- func setupBlockingChannelTest(logf logger.Logf, traf *TrafficGen) {
- ch := make(chan []byte, 1000)
- go func() {
- // transmitter
- for {
- b := make([]byte, 1600)
- n := traf.Generate(b, 16)
- if n == 0 {
- close(ch)
- break
- }
- ch <- b[0 : n+16]
- }
- }()
- go func() {
- // receiver
- for b := range ch {
- traf.GotPacket(b, 16)
- }
- }()
- }
- // Same as setupBlockingChannelTest, but now we drop packets whenever the
- // channel is full. Max speed is about the same as the above test, but
- // now with nonzero packet loss.
- func setupNonblockingChannelTest(logf logger.Logf, traf *TrafficGen) {
- ch := make(chan []byte, 1000)
- go func() {
- // transmitter
- for {
- b := make([]byte, 1600)
- n := traf.Generate(b, 16)
- if n == 0 {
- close(ch)
- break
- }
- select {
- case ch <- b[0 : n+16]:
- default:
- }
- }
- }()
- go func() {
- // receiver
- for b := range ch {
- traf.GotPacket(b, 16)
- }
- }()
- }
- // Same as above, but at an intermediate blocking channel and goroutine
- // to make things a little more like wireguard-go. Roughly 20% slower than
- // the single-channel version.
- func setupDoubleChannelTest(logf logger.Logf, traf *TrafficGen) {
- ch := make(chan []byte, 1000)
- ch2 := make(chan []byte, 1000)
- go func() {
- // transmitter
- for {
- b := make([]byte, 1600)
- n := traf.Generate(b, 16)
- if n == 0 {
- close(ch)
- break
- }
- select {
- case ch <- b[0 : n+16]:
- default:
- }
- }
- }()
- go func() {
- // intermediary
- for b := range ch {
- ch2 <- b
- }
- close(ch2)
- }()
- go func() {
- // receiver
- for b := range ch2 {
- traf.GotPacket(b, 16)
- }
- }()
- }
- // Instead of a channel, pass packets through a UDP socket.
- func setupUDPTest(logf logger.Logf, traf *TrafficGen) {
- la, err := net.ResolveUDPAddr("udp", ":0")
- if err != nil {
- log.Fatalf("resolve: %v", err)
- }
- s1, err := net.ListenUDP("udp", la)
- if err != nil {
- log.Fatalf("listen1: %v", err)
- }
- s2, err := net.ListenUDP("udp", la)
- if err != nil {
- log.Fatalf("listen2: %v", err)
- }
- a2 := s2.LocalAddr()
- // On macOS (but not Linux), you can't transmit to 0.0.0.0:port,
- // which is what returns from .LocalAddr() above. We have to
- // force it to localhost instead.
- a2.(*net.UDPAddr).IP = net.ParseIP("127.0.0.1")
- s1.SetWriteBuffer(1024 * 1024)
- s2.SetReadBuffer(1024 * 1024)
- go func() {
- // transmitter
- b := make([]byte, 1600)
- for {
- n := traf.Generate(b, 16)
- if n == 0 {
- break
- }
- s1.WriteTo(b[16:n+16], a2)
- }
- }()
- go func() {
- // receiver
- b := make([]byte, 1600)
- for traf.Running() {
- // Use ReadFrom instead of Read, to be more like
- // how wireguard-go does it, even though we're not
- // going to actually look at the address.
- n, _, err := s2.ReadFrom(b)
- if err != nil {
- log.Fatalf("s2.Read: %v", err)
- }
- traf.GotPacket(b[:n], 0)
- }
- }()
- }
- // Instead of a channel, pass packets through a TCP socket.
- // TCP is a single stream, so we can amortize one syscall across
- // multiple packets. 10x amortization seems to make it go ~10x faster,
- // as expected, getting us close to the speed of the channel tests above.
- // There's also zero packet loss.
- func setupBatchTCPTest(logf logger.Logf, traf *TrafficGen) {
- sl, err := net.Listen("tcp", ":0")
- if err != nil {
- log.Fatalf("listen: %v", err)
- }
- var slCloseOnce sync.Once
- slClose := func() {
- slCloseOnce.Do(func() {
- sl.Close()
- })
- }
- s1, err := net.Dial("tcp", sl.Addr().String())
- if err != nil {
- log.Fatalf("dial: %v", err)
- }
- s2, err := sl.Accept()
- if err != nil {
- log.Fatalf("accept: %v", err)
- }
- s1.(*net.TCPConn).SetWriteBuffer(1024 * 1024)
- s2.(*net.TCPConn).SetReadBuffer(1024 * 1024)
- ch := make(chan int)
- go func() {
- // transmitter
- defer slClose()
- defer s1.Close()
- bs1 := bufio.NewWriterSize(s1, 1024*1024)
- b := make([]byte, 1600)
- i := 0
- for {
- i += 1
- n := traf.Generate(b, 16)
- if n == 0 {
- break
- }
- if i == 1 {
- ch <- n
- }
- bs1.Write(b[16 : n+16])
- // TODO: this is a pretty half-baked batching
- // function, which we'd never want to employ in
- // a real-life program.
- //
- // In real life, we'd probably want to flush
- // immediately when there are no more packets to
- // generate, and queue up only if we fall behind.
- //
- // In our case however, we just want to see the
- // technical benefits of batching 10 syscalls
- // into 1, so a fixed ratio makes more sense.
- if (i % 10) == 0 {
- bs1.Flush()
- }
- }
- }()
- go func() {
- // receiver
- defer slClose()
- defer s2.Close()
- bs2 := bufio.NewReaderSize(s2, 1024*1024)
- // Find out the packet size (we happen to know they're
- // all the same size)
- packetSize := <-ch
- b := make([]byte, packetSize)
- for traf.Running() {
- // TODO: can't use ReadFrom() here, which is
- // unfair compared to UDP. (ReadFrom for UDP
- // apparently allocates memory per packet, which
- // this test does not.)
- n, err := io.ReadFull(bs2, b)
- if err != nil {
- log.Fatalf("s2.Read: %v", err)
- }
- traf.GotPacket(b[:n], 0)
- }
- }()
- }
|