commit 2bad6aef10339f000f7cb578108db5ee80bd640c Author: Mark Goodwin <mgoodwin@redhat.com> Date: Wed Jun 9 17:04:33 2021 +1000 pmproxy: add mutex for client req lists, fix https/tls support, QA Add a new mutext to struct proxy and use it to protect parallel multithreaded updates to the proxy->first client list. Also use the same mutext to protect updates to the pending_writes client list and avoid the doubly linked list corruption that was causing parallel https/tls requests to get stuck spinning in flush_secure_module(), as reported in BZ#1947989. qa/1457 is extensively updated to test parallel http, https/tls (and combinations of http and https/tls) RESTAPI calls. Previously it only tested a single https/tls call. With these changes, parallel https/tls RESTAPI requests from the grafana-pcp datasource to pmproxy now work correctly whereas previously pmproxy would hang/spin. Resolves: RHBZ#1947989 - pmproxy hangs and consume 100% cpu if the redis datasource is configured with TLS. Related: https://github.com/performancecopilot/pcp/issues/1311 diff --git a/qa/1457 b/qa/1457 index 94969f6e0..8bf395944 100755 --- a/qa/1457 +++ b/qa/1457 @@ -2,7 +2,7 @@ # PCP QA Test No. 1457 # Exercise HTTPS access to the PMWEBAPI(3). # -# Copyright (c) 2019 Red Hat. +# Copyright (c) 2019,2021 Red Hat. # seq=`basename $0` @@ -138,14 +138,59 @@ else fi date >>$seq.full -echo "=== checking TLS operation ===" | tee -a $seq.full -# (-k) allows us to use self-signed (insecure) certificates, so for testing only -# (-v) provides very detailed TLS connection information, for debugging only -curl -k --get 2>$tmp.err \ - "https://localhost:$port/pmapi/metric?name=sample.long.ten" \ - | _filter_json -cat $tmp.err >>$seq.full +echo "=== checking serial http operation ===" | tee -a $seq.full +for i in 1 2 3 4; do + curl -Gs "http://localhost:$port/pmapi/metric?name=sample.long.ten" 2>$tmp.err$i >$tmp.out$i +done +for i in 1 2 3 4; do +echo === out$i === | tee -a $seq.full +_filter_json < $tmp.out$i +done + +date >>$seq.full +echo "=== checking parallel http operation ===" | tee -a $seq.full +for i in 1 2 3 4; do + curl -Gs "http://localhost:$port/pmapi/metric?name=sample.long.ten" 2>$tmp.err$i >$tmp.out$i & 2>/dev/null eval pid$i=$! +done +wait $pid1 $pid2 $pid3 $pid4 +for i in 1 2 3 4; do +echo === out$i === | tee -a $seq.full +_filter_json < $tmp.out$i +done + +date >>$seq.full +echo "=== checking serial https/TLS operation ===" | tee -a $seq.full +for i in 1 2 3 4; do + curl -k -Gs "https://localhost:$port/pmapi/metric?name=sample.long.ten" 2>$tmp.err$i >$tmp.out$i +done +for i in 1 2 3 4; do +echo === out$i === | tee -a $seq.full +_filter_json < $tmp.out$i +done + date >>$seq.full +echo "=== checking parallel https/TLS operation ===" | tee -a $seq.full +for i in 1 2 3 4; do + curl -k -Gs "https://localhost:$port/pmapi/metric?name=sample.long.ten" 2>$tmp.err$i >$tmp.out$i & 2>/dev/null eval pid$i=$! +done +wait $pid1 $pid2 $pid3 $pid4 +for i in 1 2 3 4; do +echo === out$i === | tee -a $seq.full +_filter_json < $tmp.out$i +done + +date >>$seq.full +echo "=== checking parallel mixed http and https/TLS operations ===" | tee -a $seq.full +for i in 1 3 5 7; do + j=`expr $i + 1` + curl -k -Gs "http://localhost:$port/pmapi/metric?name=sample.long.ten" 2>$tmp.err$i >$tmp.out$i & 2>/dev/null eval pid$i=$! + curl -k -Gs "https://localhost:$port/pmapi/metric?name=sample.long.ten" 2>$tmp.err$j >$tmp.out$j & 2>/dev/null eval pid$j=$! +done +wait $pid1 $pid2 $pid3 $pid4 $pid5 $pid6 $pid7 $pid8 +for i in 1 2 3 4 5 6 7 8; do +echo === out$i === | tee -a $seq.full +_filter_json < $tmp.out$i +done echo "=== check pmproxy is running ===" pminfo -v -h localhost@localhost:$port hinv.ncpu @@ -156,7 +201,7 @@ else fi # valgrind takes awhile to shutdown too -pmsignal $pid +pmsignal $pid >/dev/null 2>&1 pmsleep 3.5 echo "=== valgrind stdout ===" | tee -a $seq.full cat $tmp.valout | _filter_valgrind @@ -164,6 +209,9 @@ cat $tmp.valout | _filter_valgrind echo "=== valgrind stderr ===" | tee -a $seq.full cat $tmp.valerr | _filter_pmproxy_log | _filter_port +# final kill if it's spinning +$sudo kill -9 $pid >/dev/null 2>&1 + # success, all done status=0 exit diff --git a/qa/1457.out b/qa/1457.out index a7b64cdc5..422176db2 100644 --- a/qa/1457.out +++ b/qa/1457.out @@ -1,5 +1,539 @@ QA output created by 1457 -=== checking TLS operation === +=== checking serial http operation === +=== out1 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out2 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out3 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out4 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== checking parallel http operation === +=== out1 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out2 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out3 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out4 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== checking serial https/TLS operation === +=== out1 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out2 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out3 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out4 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== checking parallel https/TLS operation === +=== out1 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out2 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out3 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out4 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== checking parallel mixed http and https/TLS operations === +=== out1 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out2 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out3 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out4 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out5 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out6 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out7 === +{ + "context": "CONTEXT" + "metrics": [ + { + "name": "sample.long.ten", + "series": "SERIES" + "pmid": "29.0.11", + "type": "32", + "sem": "instant", + "units": "none", + "labels": { + "agent": "sample", + "cluster": "zero", + "domainname": "DOMAINNAME" + "hostname": "HOSTNAME" + "role": "testing" + }, + "text-oneline": "10 as a 32-bit integer", + "text-help": "10 as a 32-bit integer" + } + ] +} +=== out8 === { "context": "CONTEXT" "metrics": [ diff --git a/qa/group b/qa/group index 462dffaaa..77cac788d 100644 --- a/qa/group +++ b/qa/group @@ -1818,7 +1818,7 @@ x11 1436 pmda.postgresql local 1437 pmda.kvm local 1455 pmlogrewrite labels pmdumplog local -1457 pmproxy local +1457 pmproxy libpcp_web threads secure local 1480 pmda.lmsensors local 1489 python pmrep pmimport local 1490 python local labels diff --git a/src/pmproxy/src/secure.c b/src/pmproxy/src/secure.c index 77265894c..072e2a085 100644 --- a/src/pmproxy/src/secure.c +++ b/src/pmproxy/src/secure.c @@ -16,13 +16,25 @@ #include <openssl/opensslv.h> #include <openssl/ssl.h> +/* called with proxy->mutex locked */ static void remove_connection_from_queue(struct client *client) { + struct proxy *proxy = client->proxy; + if (client->secure.pending.writes_buffer != NULL) free(client->secure.pending.writes_buffer); - if (client->secure.pending.prev != NULL) - *client->secure.pending.prev = client->secure.pending.next; + if (client->secure.pending.prev == NULL) { + /* next (if any) becomes first in pending_writes list */ + proxy->pending_writes = client->secure.pending.next; + if (proxy->pending_writes) + proxy->pending_writes->secure.pending.prev = NULL; + } + else { + /* link next and prev */ + client->secure.pending.prev->secure.pending.next = client->secure.pending.next; + client->secure.pending.next->secure.pending.prev = client->secure.pending.prev; + } memset(&client->secure.pending, 0, sizeof(client->secure.pending)); } @@ -32,7 +44,9 @@ on_secure_client_close(struct client *client) if (pmDebugOptions.auth || pmDebugOptions.http) fprintf(stderr, "%s: client %p\n", "on_secure_client_close", client); + uv_mutex_lock(&client->proxy->mutex); remove_connection_from_queue(client); + uv_mutex_unlock(&client->proxy->mutex); /* client->read and client->write freed by SSL_free */ SSL_free(client->secure.ssl); } @@ -40,6 +54,8 @@ on_secure_client_close(struct client *client) static void maybe_flush_ssl(struct proxy *proxy, struct client *client) { + struct client *c; + if (client->secure.pending.queued) return; @@ -47,13 +63,19 @@ maybe_flush_ssl(struct proxy *proxy, struct client *client) client->secure.pending.writes_count > 0) return; - client->secure.pending.next = proxy->pending_writes; - if (client->secure.pending.next != NULL) - client->secure.pending.next->secure.pending.prev = &client->secure.pending.next; - client->secure.pending.prev = &proxy->pending_writes; + uv_mutex_lock(&proxy->mutex); + if (proxy->pending_writes == NULL) { + proxy->pending_writes = client; + client->secure.pending.prev = client->secure.pending.next = NULL; + } + else { + for (c=proxy->pending_writes; c->secure.pending.next; c = c->secure.pending.next) + ; /**/ + c->secure.pending.next = client; + client->secure.pending.prev = c; + } client->secure.pending.queued = 1; - - proxy->pending_writes = client; + uv_mutex_unlock(&proxy->mutex); } static void @@ -135,10 +157,12 @@ flush_ssl_buffer(struct client *client) void flush_secure_module(struct proxy *proxy) { - struct client *client, **head = &proxy->pending_writes; + struct client *client, **head; size_t i, used; int sts; + uv_mutex_lock(&proxy->mutex); + head = &proxy->pending_writes; while ((client = *head) != NULL) { flush_ssl_buffer(client); @@ -188,6 +212,7 @@ flush_secure_module(struct proxy *proxy) sizeof(uv_buf_t) * client->secure.pending.writes_count); } } + uv_mutex_unlock(&proxy->mutex); } void diff --git a/src/pmproxy/src/server.c b/src/pmproxy/src/server.c index 612d3613b..b5c5d84de 100644 --- a/src/pmproxy/src/server.c +++ b/src/pmproxy/src/server.c @@ -149,6 +149,7 @@ server_init(int portcount, const char *localpath) pmGetProgname()); return NULL; } + uv_mutex_init(&proxy->mutex); count = portcount + (*localpath ? 1 : 0); if (count) { @@ -251,6 +252,7 @@ void client_put(struct client *client) { unsigned int refcount; + struct proxy *proxy = client->proxy; uv_mutex_lock(&client->mutex); assert(client->refcount); @@ -259,9 +261,11 @@ client_put(struct client *client) if (refcount == 0) { /* remove client from the doubly-linked list */ + uv_mutex_lock(&proxy->mutex); if (client->next != NULL) client->next->prev = client->prev; *client->prev = client->next; + uv_mutex_unlock(&proxy->mutex); if (client->protocol & STREAM_PCP) on_pcp_client_close(client); @@ -514,10 +518,12 @@ on_client_connection(uv_stream_t *stream, int status) client->proxy = proxy; /* insert client into doubly-linked list at the head */ + uv_mutex_lock(&proxy->mutex); if ((client->next = proxy->first) != NULL) proxy->first->prev = &client->next; proxy->first = client; client->prev = &proxy->first; + uv_mutex_unlock(&proxy->mutex); status = uv_read_start((uv_stream_t *)&client->stream.u.tcp, on_buffer_alloc, on_client_read); diff --git a/src/pmproxy/src/server.h b/src/pmproxy/src/server.h index f0b7a5f68..f93daeff4 100644 --- a/src/pmproxy/src/server.h +++ b/src/pmproxy/src/server.h @@ -118,7 +118,7 @@ typedef struct secure_client { BIO *write; struct { struct client *next; - struct client **prev; + struct client *prev; unsigned int queued; size_t writes_count; uv_buf_t *writes_buffer; @@ -166,6 +166,7 @@ typedef struct proxy { struct dict *config; /* configuration dictionary */ uv_loop_t *events; /* global, async event loop */ uv_callback_t write_callbacks; + uv_mutex_t mutex; /* protects client lists and pending writes */ } proxy; extern void proxylog(pmLogLevel, sds, void *);