Benchmarking Graphite on NVMe

2016-09-13 13:56:30 by jdixon

Here's another quick update to demonstrate what's possible with a single Graphite node running master (these Carbon and Graphite-Web commits, specifically). As you'll see in the results below, this configuration was able to achieve 300k datapoints per second.

This test was performed on a Packet type 3 server with the pair of NVMe flash drives striped in a single LVM volume. Installation of the Graphite stack was still performed using Synthesize v.2.4.1. To take advantage of the increased I/O capacity I added more cache processes for a grand total of eight (8) relays and sixteen (16) caches. Five instances of Haggar ran concurrently, on a separate Packet type 1 server in the same Parsippany, NJ datacenter.

I'm very pleased with the results of this benchmark. Although per-process CPU is high, load is still at reasonable levels, and the box remains very responsive. There are no signs of "fatigue" other than those system metrics; the box is humming along and data retrieval times remain low. Queued datapoints in the relays and caches are stable and (at least based on Packet's published claims for these drives) there is still capacity in terms of write operations, although I'd be hesitant to increase the cache batch writes since that would require additional CPU for sorting among the in-memory queues.

# Build a striped LVM volume from the two NVMe drives and relocate the
# existing Graphite install (/opt/graphite) onto it.

# Register both NVMe drives as LVM physical volumes.
$ sudo pvcreate /dev/nvme0n1
$ sudo pvcreate /dev/nvme1n1
# Group them into one volume group. Mind the naming: "graphite_vol" is the
# volume group and "graphite_vg" (created next) is the logical volume.
$ sudo vgcreate graphite_vol /dev/nvme0n1 /dev/nvme1n1
# Logical volume striped across both drives (-i 2) with stripe size -I 4
# (see lvcreate(8) for the unit), using all space in the volume group.
$ sudo lvcreate -i 2 -I 4 -l 100%VG -n graphite_vg graphite_vol
$ sudo mkfs.ext4 /dev/graphite_vol/graphite_vg
# Stop the carbon-cache and apache2 services before moving the Graphite tree.
$ sudo service carbon-cache stop
$ sudo service apache2 stop
# Park /opt/graphite in /opt2, mount the new volume on /opt, then move the
# tree back so it lands on the new filesystem.
# NOTE(review): mounting over /opt hides anything else that lived under /opt
# on the root filesystem, and no fstab entry is shown here -- confirm both.
$ sudo mkdir /opt2
$ sudo mv /opt/graphite /opt2/
$ sudo mount /dev/graphite_vol/graphite_vg /opt/
$ sudo mv /opt2/graphite /opt/
$ sudo service carbon-cache start
$ sudo service apache2 start
# carbon.conf

# Settings for the relay tier. The [relay:N] sections below set only the
# listening ports, so these values are presumably shared by every relay
# instance (Carbon instance-section convention -- confirm against the
# carbon.conf documentation).
[relay]
USER = carbon
LOG_LISTENER_CONNECTIONS = False
RELAY_METHOD = consistent-hashing
REPLICATION_FACTOR = 1
MAX_DATAPOINTS_PER_MESSAGE = 500
MAX_QUEUE_SIZE = 10000
USE_FLOW_CONTROL = True
# One entry per cache instance, written as host:port:instance. The ports are
# the PICKLE_RECEIVER_PORT values of [cache:1] .. [cache:16] in this file.
# The value spans several lines; the continuation lines must stay indented.
DESTINATIONS = 127.0.0.1:20104:1, 127.0.0.1:20204:2, 127.0.0.1:20304:3,
               127.0.0.1:20404:4, 127.0.0.1:20504:5, 127.0.0.1:20604:6,
               127.0.0.1:20704:7, 127.0.0.1:20804:8, 127.0.0.1:20904:9,
               127.0.0.1:21004:10, 127.0.0.1:21104:11, 127.0.0.1:21204:12,
               127.0.0.1:21304:13, 127.0.0.1:21404:14, 127.0.0.1:21504:15,
               127.0.0.1:21604:16

# Eight relay instances. Instance N listens on 2N13 (line protocol) and
# 2N14 (pickle protocol). These are the backend ports used by the
# carbon_relay_2003 and carbon_relay_2004 listeners in haproxy.cfg.
[relay:1]
LINE_RECEIVER_PORT = 2113
PICKLE_RECEIVER_PORT = 2114

[relay:2]
LINE_RECEIVER_PORT = 2213
PICKLE_RECEIVER_PORT = 2214

[relay:3]
LINE_RECEIVER_PORT = 2313
PICKLE_RECEIVER_PORT = 2314

[relay:4]
LINE_RECEIVER_PORT = 2413
PICKLE_RECEIVER_PORT = 2414

[relay:5]
LINE_RECEIVER_PORT = 2513
PICKLE_RECEIVER_PORT = 2514

[relay:6]
LINE_RECEIVER_PORT = 2613
PICKLE_RECEIVER_PORT = 2614

[relay:7]
LINE_RECEIVER_PORT = 2713
PICKLE_RECEIVER_PORT = 2714

[relay:8]
LINE_RECEIVER_PORT = 2813
PICKLE_RECEIVER_PORT = 2814

# Settings for the cache (writer) tier. The [cache:N] sections below set only
# ports, so these values are presumably shared by all sixteen cache instances
# (confirm against the carbon.conf documentation).
[cache]
USER = carbon
CACHE_WRITE_STRATEGY = sorted
MAX_CACHE_SIZE = 4000000
USE_FLOW_CONTROL = True
WHISPER_FALLOCATE_CREATE = True
# Create/update rate limits used for this benchmark.
# NOTE(review): whether these apply per cache instance or in aggregate is not
# visible here -- confirm before reusing the numbers.
MAX_CREATES_PER_MINUTE = 12000
MAX_UPDATES_PER_SECOND = 20000
USE_INSECURE_UNPICKLER = False
# Every LOG_* option is set to False for this run.
LOG_CACHE_HITS = False
LOG_CACHE_QUEUE_SORTS = False
LOG_LISTENER_CONNECTIONS = False
LOG_UPDATES = False
ENABLE_LOGROTATION = False
WHISPER_AUTOFLUSH = False

# Sixteen cache instances. For instance N (zero-padded to two digits, NN):
#   LINE_RECEIVER_PORT   = 2NN03
#   PICKLE_RECEIVER_PORT = 2NN04  (the port listed for instance N in the
#                                  [relay] DESTINATIONS value)
#   CACHE_QUERY_PORT     = 70N2 for N = 1..9, 7NN2 for N = 10..16
[cache:1]
LINE_RECEIVER_PORT = 20103
PICKLE_RECEIVER_PORT = 20104
CACHE_QUERY_PORT = 7012

[cache:2]
LINE_RECEIVER_PORT = 20203
PICKLE_RECEIVER_PORT = 20204
CACHE_QUERY_PORT = 7022

[cache:3]
LINE_RECEIVER_PORT = 20303
PICKLE_RECEIVER_PORT = 20304
CACHE_QUERY_PORT = 7032

[cache:4]
LINE_RECEIVER_PORT = 20403
PICKLE_RECEIVER_PORT = 20404
CACHE_QUERY_PORT = 7042

[cache:5]
LINE_RECEIVER_PORT = 20503
PICKLE_RECEIVER_PORT = 20504
CACHE_QUERY_PORT = 7052

[cache:6]
LINE_RECEIVER_PORT = 20603
PICKLE_RECEIVER_PORT = 20604
CACHE_QUERY_PORT = 7062

[cache:7]
LINE_RECEIVER_PORT = 20703
PICKLE_RECEIVER_PORT = 20704
CACHE_QUERY_PORT = 7072

[cache:8]
LINE_RECEIVER_PORT = 20803
PICKLE_RECEIVER_PORT = 20804
CACHE_QUERY_PORT = 7082

[cache:9]
LINE_RECEIVER_PORT = 20903
PICKLE_RECEIVER_PORT = 20904
CACHE_QUERY_PORT = 7092

[cache:10]
LINE_RECEIVER_PORT = 21003
PICKLE_RECEIVER_PORT = 21004
CACHE_QUERY_PORT = 7102

[cache:11]
LINE_RECEIVER_PORT = 21103
PICKLE_RECEIVER_PORT = 21104
CACHE_QUERY_PORT = 7112

[cache:12]
LINE_RECEIVER_PORT = 21203
PICKLE_RECEIVER_PORT = 21204
CACHE_QUERY_PORT = 7122

[cache:13]
LINE_RECEIVER_PORT = 21303
PICKLE_RECEIVER_PORT = 21304
CACHE_QUERY_PORT = 7132

[cache:14]
LINE_RECEIVER_PORT = 21403
PICKLE_RECEIVER_PORT = 21404
CACHE_QUERY_PORT = 7142

[cache:15]
LINE_RECEIVER_PORT = 21503
PICKLE_RECEIVER_PORT = 21504
CACHE_QUERY_PORT = 7152

[cache:16]
LINE_RECEIVER_PORT = 21603
PICKLE_RECEIVER_PORT = 21604
CACHE_QUERY_PORT = 7162
# storage-schemas.conf

# Retention rules keyed by metric-name regex. Each retentions entry is
# written as precision:duration, e.g. 10s:1w.
# NOTE(review): the catch-all [default] section is kept last, presumably
# because the first matching pattern wins -- confirm against the Graphite
# storage-schemas documentation before reordering.

# Metrics whose names start with "collectd."
[collectd]
pattern = ^collectd\.
retentions = 10s:1w, 60s:1y

# Metrics whose names start with "haggar." (the load generator in this test)
[haggar]
pattern = ^haggar\.
retentions = 10s:1d, 60s:1w, 1d:1y

# Everything else
[default]
pattern = .*
retentions = 60s:1y
# haproxy.cfg

# Process-wide HAProxy settings: syslog targets, chroot directory, run-as
# user and group, daemon mode, global connection limit and pidfile path.
global
    log /dev/log    local0
    log /dev/log    local1 notice
    chroot /var/lib/haproxy
    user haproxy
    group haproxy
    daemon
    # Global connection limit; each backend server below also sets its own
    # "maxconn 1024".
    maxconn 8192
    pidfile /var/run/haproxy.pid

# Defaults for both carbon relay listeners: plain TCP proxying with
# round-robin balancing across the relay instances.
defaults
    balance roundrobin
    log global
    mode    tcp
    retries 3
    option  redispatch
    # Timeouts in milliseconds. Written with the "timeout <kind>" keywords;
    # the older contimeout/clitimeout/srvtimeout spellings carry the same
    # meaning but are deprecated and rejected by current HAProxy releases.
    timeout connect 5000
    timeout client  50000
    timeout server  50000

# plaintext listener
# Accepts Carbon line-protocol traffic on port 2003 and spreads it across the
# LINE_RECEIVER_PORT of relay instances 1-8 (carbon.conf [relay:N]).
# The listening address is given with an explicit "bind" line; the legacy
# "listen <name> <addr:port>" form is refused by newer HAProxy versions.
listen carbon_relay_2003
    bind 0.0.0.0:2003
    server carbon_relay_2113 127.0.0.1:2113 check maxconn 1024
    server carbon_relay_2213 127.0.0.1:2213 check maxconn 1024
    server carbon_relay_2313 127.0.0.1:2313 check maxconn 1024
    server carbon_relay_2413 127.0.0.1:2413 check maxconn 1024
    server carbon_relay_2513 127.0.0.1:2513 check maxconn 1024
    server carbon_relay_2613 127.0.0.1:2613 check maxconn 1024
    server carbon_relay_2713 127.0.0.1:2713 check maxconn 1024
    server carbon_relay_2813 127.0.0.1:2813 check maxconn 1024

# pickle listener
# Accepts Carbon pickle-protocol traffic on port 2004 and spreads it across
# the PICKLE_RECEIVER_PORT of relay instances 1-8 (carbon.conf [relay:N]).
# The listening address is given with an explicit "bind" line; the legacy
# "listen <name> <addr:port>" form is refused by newer HAProxy versions.
listen carbon_relay_2004
    bind 0.0.0.0:2004
    server carbon_relay_2114 127.0.0.1:2114 check maxconn 1024
    server carbon_relay_2214 127.0.0.1:2214 check maxconn 1024
    server carbon_relay_2314 127.0.0.1:2314 check maxconn 1024
    server carbon_relay_2414 127.0.0.1:2414 check maxconn 1024
    server carbon_relay_2514 127.0.0.1:2514 check maxconn 1024
    server carbon_relay_2614 127.0.0.1:2614 check maxconn 1024
    server carbon_relay_2714 127.0.0.1:2714 check maxconn 1024
    server carbon_relay_2814 127.0.0.1:2814 check maxconn 1024
# Load generator invocation; five of these ran concurrently on a separate
# host. Each instance runs 300 agents x 2000 metrics = 600,000 metrics,
# sent to the plaintext listener on port 2003.
# NOTE(review): the 300k datapoints/sec figure presumably assumes one flush
# per metric every 10s (600,000 / 10s x 5 instances) -- confirm haggar's
# flush interval.
$ GOPATH=~/gocode ./gocode/bin/haggar -agents=300 \
                                      -metrics=2000 \
                                      -carbon="x.x.x.x:2003"
Fig 1: Grafana dashboard for Carbon & Whisper
Fig 2: top output for Carbon

Comments

at 2016-10-07 03:28:09, Space Cowboy wrote in to say...

What are your OS settings? Meaning - have you tuned the disk scheduler? What about tcp_rmem and tcp_wmem? What's your ulimit value?

Are there any other OS tweaks you found important to tweak performance?

Thanks!

-= The Space Cowboy =-

at 2016-11-14 06:59:49, ctrlok wrote in to say...

ALSO, go-carbon has better performance.

And carbon-relay is slow. You may prefer carbon-c-relay. I gave a presentation about it: http://www.slideshare.net/VsevolodPolyakov/metrics-where-and-how?qid=d689f927-2bba-43ea-9703-a6a898285ff1&v=&b=&from_search=2

Add a comment:

  name

  email

  url

max length 4000 chars