diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..66a4ba8 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +SOURCES/rasdaemon-0.4.1.tar.bz2 diff --git a/.rasdaemon.metadata b/.rasdaemon.metadata new file mode 100644 index 0000000..d5f8ae6 --- /dev/null +++ b/.rasdaemon.metadata @@ -0,0 +1 @@ +ec4e41e454e041b45aa4e11055577cedfa30abef SOURCES/rasdaemon-0.4.1.tar.bz2 diff --git a/README.md b/README.md deleted file mode 100644 index 0e7897f..0000000 --- a/README.md +++ /dev/null @@ -1,5 +0,0 @@ -The master branch has no content - -Look at the c7 branch if you are working with CentOS-7, or the c4/c5/c6 branch for CentOS-4, 5 or 6 - -If you find this file in a distro specific branch, it means that no content has been checked in yet diff --git a/SOURCES/0001-ras-mc-ctl-Improve-error-summary-to-show-label-and-m.patch b/SOURCES/0001-ras-mc-ctl-Improve-error-summary-to-show-label-and-m.patch new file mode 100644 index 0000000..60c0bbd --- /dev/null +++ b/SOURCES/0001-ras-mc-ctl-Improve-error-summary-to-show-label-and-m.patch @@ -0,0 +1,38 @@ +From 5e8fb95e2f6dd3f427e0ae5d7d066aeb6d61fd0f Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab <mchehab@redhat.com> +Date: Wed, 29 May 2013 21:53:58 -0300 +Subject: [PATCH 01/32] ras-mc-ctl: Improve error summary to show label and mc + +Both information are useful for the users, even on summary. 
+ +Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com> +--- + util/ras-mc-ctl.in | 6 +++--- + 1 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 32c4edb..5b1ca4d 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -827,15 +827,15 @@ sub summary + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +- my $query = "select top_layer,middle_layer,lower_layer, count(*) from mc_event group by top_layer,middle_layer,lower_layer"; ++ my $query = "select label, mc, top_layer,middle_layer,lower_layer, count(*) from mc_event group by label,mc,top_layer,middle_layer,lower_layer"; + my $query_handle = $dbh->prepare($query); + $query_handle->execute(); + +- $query_handle->bind_columns(\my($top, $mid, $low, $count)); ++ $query_handle->bind_columns(\my($label, $mc, $top, $mid, $low, $count)); + + print "Memory controller events summary:\n"; + while($query_handle->fetch()) { +- print "location: $top:$mid:$low errors: $count\n"; ++ print "DIMM Label(s): '$label' location: $mc:$top:$mid:$low errors: $count\n"; + } + + $query_handle->finish; +-- +1.7.1 + diff --git a/SOURCES/0002-ras-record-make-the-code-more-generic.patch b/SOURCES/0002-ras-record-make-the-code-more-generic.patch new file mode 100644 index 0000000..243aa28 --- /dev/null +++ b/SOURCES/0002-ras-record-make-the-code-more-generic.patch @@ -0,0 +1,240 @@ +From 002238dff53b284c9455554f146176ee8de2de4a Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab <mchehab@redhat.com> +Date: Fri, 31 May 2013 12:41:01 -0300 +Subject: [PATCH 02/32] ras-record: make the code more generic + +Now that we're ready to add more tables to the database, make +the code that creates and inserts data into the table more +generic. 
+ +Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com> +--- + ras-record.c | 173 +++++++++++++++++++++++++++++++++++++--------------------- + 1 files changed, 110 insertions(+), 63 deletions(-) + +diff --git a/ras-record.c b/ras-record.c +index 8995c9e..3af0791 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -28,80 +28,128 @@ + #include "ras-mc-handler.h" + #include "ras-logger.h" + ++/* #define DEBUG_SQL 1 */ ++ + #define SQLITE_RAS_DB RASSTATEDIR "/" RAS_DB_FNAME + +-const char *mc_event_db = " mc_event "; +-const char *mc_event_db_create_fields = "(" +- "id INTEGER PRIMARY KEY" +- ", timestamp TEXT" +- ", err_count INTEGER" +- ", err_type TEXT" +- ", err_msg TEXT" /* 5 */ +- ", label TEXT" +- ", mc INTEGER" +- ", top_layer INTEGER" +- ", middle_layer INTEGER" +- ", lower_layer INTEGER" /* 10 */ +- ", address INTEGER" +- ", grain INTEGER" +- ", syndrome INTEGER" +- ", driver_detail TEXT" /* 14 */ +- ")"; +- +-const char *mc_event_db_fields = "(" +- "id" +- ", timestamp" +- ", err_count" +- ", err_type" +- ", err_msg" /* 5 */ +- ", label" +- ", mc" +- ", top_layer" +- ", middle_layer" +- ", lower_layer" /* 10 */ +- ", address" +- ", grain" +- ", syndrome" +- ", driver_detail" /* 14 */ +- ")"; +- +-#define NUM_MC_EVENT_DB_VALUES 14 +- +-const char *createdb = "CREATE TABLE IF NOT EXISTS"; ++ ++#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x))) ++ ++struct db_fields { ++ char *name; ++ char *type; ++}; ++ ++struct db_table_descriptor { ++ char *name; ++ const struct db_fields *fields; ++ size_t num_fields; ++}; ++ ++static const struct db_fields mc_event_fields[] = { ++ { .name="id", .type="INTEGER PRIMARY KEY" }, ++ { .name="timestamp", .type="TEXT" }, ++ { .name="err_count", .type="INTEGER" }, ++ { .name="err_type", .type="TEXT" }, ++ { .name="err_msg", .type="TEXT" }, ++ { .name="label", .type="TEXT" }, ++ { .name="mc", .type="INTEGER" }, ++ { .name="top_layer", .type="INTEGER" }, ++ { .name="middle_layer", .type="INTEGER" }, ++ { .name="lower_layer", 
.type="INTEGER" }, ++ { .name="address", .type="INTEGER" }, ++ { .name="grain", .type="INTEGER" }, ++ { .name="syndrome", .type="INTEGER" }, ++ { .name="driver_detail", .type="TEXT" }, ++}; ++ ++static const struct db_table_descriptor mc_event_tab = { ++ .name = "mc_event", ++ .fields = mc_event_fields, ++ .num_fields = ARRAY_SIZE(mc_event_fields), ++}; ++ + const char *insertdb = "INSERT INTO"; + const char *valuesdb = " VALUES "; + +-static int ras_mc_prepare_stmt(struct sqlite3_priv *priv) ++static int ras_mc_prepare_stmt(struct sqlite3_priv *priv, ++ sqlite3_stmt **stmt, ++ const struct db_table_descriptor *db_tab) ++ + { + int i, rc; +- char sql[1024]; ++ char sql[1024], *p = sql, *end = sql + sizeof(sql); ++ const struct db_fields *field; ++ ++ p += snprintf(p, end - p, "INSERT INTO %s (", ++ db_tab->name); ++ ++ for (i = 0; i < db_tab->num_fields; i++) { ++ field = &db_tab->fields[i]; ++ p += snprintf(p, end - p, "%s", field->name); ++ ++ if (i < db_tab->num_fields - 1) ++ p += snprintf(p, end - p, ", "); ++ } + +- strcpy(sql, insertdb); +- strcat(sql, mc_event_db); +- strcat(sql, mc_event_db_fields); +- strcat(sql, valuesdb); ++ p += snprintf(p, end - p, ") VALUES ( NULL, "); + +- strcat(sql, "(NULL, "); /* Auto-increment field */ +- for (i = 1; i < NUM_MC_EVENT_DB_VALUES; i++) { +- if (i < NUM_MC_EVENT_DB_VALUES - 1) ++ for (i = 1; i < db_tab->num_fields; i++) { ++ if (i < db_tab->num_fields - 1) + strcat(sql, "?, "); + else + strcat(sql, "?)"); + } + +- rc = sqlite3_prepare_v2(priv->db, sql, -1, &priv->stmt, NULL); ++#ifdef DEBUG_SQL ++ log(TERM, LOG_INFO, "SQL: %s\n", sql); ++#endif ++ ++ rc = sqlite3_prepare_v2(priv->db, sql, -1, stmt, NULL); + if (rc != SQLITE_OK) +- log(TERM, LOG_ERR, "Failed to prepare insert db on %s: error = %s\n", +- SQLITE_RAS_DB, sqlite3_errmsg(priv->db)); ++ log(TERM, LOG_ERR, ++ "Failed to prepare insert db at table %s (db %s): error = %s\n", ++ db_tab->name, SQLITE_RAS_DB, sqlite3_errmsg(priv->db)); + + return rc; + } + 
++static int ras_mc_create_table(struct sqlite3_priv *priv, ++ const struct db_table_descriptor *db_tab) ++{ ++ const struct db_fields *field; ++ char sql[1024], *p = sql, *end = sql + sizeof(sql); ++ int i,rc; ++ ++ p += snprintf(p, end - p, "CREATE TABLE IF NOT EXISTS %s (", ++ db_tab->name); ++ ++ for (i = 0; i < db_tab->num_fields; i++) { ++ field = &db_tab->fields[i]; ++ p += snprintf(p, end - p, "%s %s", field->name, field->type); ++ ++ if (i < db_tab->num_fields - 1) ++ p += snprintf(p, end - p, ", "); ++ } ++ p += snprintf(p, end - p, ")"); ++ ++#ifdef DEBUG_SQL ++ log(TERM, LOG_INFO, "SQL: %s\n", sql); ++#endif ++ ++ rc = sqlite3_exec(priv->db, sql, NULL, NULL, NULL); ++ if (rc != SQLITE_OK) { ++ log(TERM, LOG_ERR, ++ "Failed to create table %s on %s: error = %d\n", ++ db_tab->name, SQLITE_RAS_DB, rc); ++ } ++ return rc; ++} ++ + int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + { + int rc; + sqlite3 *db; +- char sql[1024]; + struct sqlite3_priv *priv; + + printf("Calling %s()\n", __FUNCTION__); +@@ -137,27 +185,26 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + free(priv); + return -1; + } ++ priv->db = db; + +- strcpy(sql, createdb); +- strcat(sql, mc_event_db); +- strcat(sql, mc_event_db_create_fields); +- rc = sqlite3_exec(db, sql, NULL, NULL, NULL); ++ rc = ras_mc_create_table(priv, &mc_event_tab); + if (rc != SQLITE_OK) { +- log(TERM, LOG_ERR, +- "cpu %u: Failed to create db on %s: error = %d\n", +- cpu, SQLITE_RAS_DB, rc); ++ sqlite3_close(db); + free(priv); + return -1; + } + +- priv->db = db; +- ras->db_priv = priv; +- +- rc = ras_mc_prepare_stmt(priv); +- if (rc == SQLITE_OK) ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt, &mc_event_tab); ++ if (rc == SQLITE_OK) { + log(TERM, LOG_INFO, + "cpu %u: Recording events at %s\n", + cpu, SQLITE_RAS_DB); ++ ras->db_priv = priv; ++ } else { ++ sqlite3_close(db); ++ free(priv); ++ return -1; ++ } + + return 0; + } +-- +1.7.1 + diff --git 
a/SOURCES/0003-ras-record-rename-stmt-to-stmt_mc_event.patch b/SOURCES/0003-ras-record-rename-stmt-to-stmt_mc_event.patch new file mode 100644 index 0000000..f72a1a0 --- /dev/null +++ b/SOURCES/0003-ras-record-rename-stmt-to-stmt_mc_event.patch @@ -0,0 +1,97 @@ +From 016802f4093e80971a52c590c661a04924cb9aa3 Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab <mchehab@redhat.com> +Date: Fri, 31 May 2013 13:10:16 -0300 +Subject: [PATCH 03/32] ras-record: rename stmt to stmt_mc_event + +This stmt is used only for mc_event. So, rename it, as we'll be +adding other stmts for the other tables. + +Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com> +--- + ras-record.c | 46 ++++++++++++++++++++++++---------------------- + ras-record.h | 2 +- + 2 files changed, 25 insertions(+), 23 deletions(-) + +diff --git a/ras-record.c b/ras-record.c +index 3af0791..efcd78f 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -194,7 +194,7 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + return -1; + } + +- rc = ras_mc_prepare_stmt(priv, &priv->stmt, &mc_event_tab); ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mc_event, &mc_event_tab); + if (rc == SQLITE_OK) { + log(TERM, LOG_INFO, + "cpu %u: Recording events at %s\n", +@@ -214,30 +214,32 @@ int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev) + int rc; + struct sqlite3_priv *priv = ras->db_priv; + +- if (!priv || !priv->stmt) ++ if (!priv || !priv->stmt_mc_event) + return 0; +- log(TERM, LOG_INFO, "mc_event store: %p\n", priv->stmt); +- +- sqlite3_bind_text(priv->stmt, 1, ev->timestamp, -1, NULL); +- sqlite3_bind_int (priv->stmt, 2, ev->error_count); +- sqlite3_bind_text(priv->stmt, 3, ev->error_type, -1, NULL); +- sqlite3_bind_text(priv->stmt, 4, ev->msg, -1, NULL); +- sqlite3_bind_text(priv->stmt, 5, ev->label, -1, NULL); +- sqlite3_bind_int (priv->stmt, 6, ev->mc_index); +- sqlite3_bind_int (priv->stmt, 7, ev->top_layer); +- sqlite3_bind_int (priv->stmt, 8, ev->middle_layer); +- 
sqlite3_bind_int (priv->stmt, 9, ev->lower_layer); +- sqlite3_bind_int (priv->stmt, 10, ev->address); +- sqlite3_bind_int (priv->stmt, 11, ev->grain); +- sqlite3_bind_int (priv->stmt, 12, ev->syndrome); +- sqlite3_bind_text(priv->stmt, 13, ev->driver_detail, -1, NULL); +- rc = sqlite3_step(priv->stmt); ++ log(TERM, LOG_INFO, "mc_event store: %p\n", priv->stmt_mc_event); ++ ++ sqlite3_bind_text(priv->stmt_mc_event, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_int (priv->stmt_mc_event, 2, ev->error_count); ++ sqlite3_bind_text(priv->stmt_mc_event, 3, ev->error_type, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mc_event, 4, ev->msg, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mc_event, 5, ev->label, -1, NULL); ++ sqlite3_bind_int (priv->stmt_mc_event, 6, ev->mc_index); ++ sqlite3_bind_int (priv->stmt_mc_event, 7, ev->top_layer); ++ sqlite3_bind_int (priv->stmt_mc_event, 8, ev->middle_layer); ++ sqlite3_bind_int (priv->stmt_mc_event, 9, ev->lower_layer); ++ sqlite3_bind_int (priv->stmt_mc_event, 10, ev->address); ++ sqlite3_bind_int (priv->stmt_mc_event, 11, ev->grain); ++ sqlite3_bind_int (priv->stmt_mc_event, 12, ev->syndrome); ++ sqlite3_bind_text(priv->stmt_mc_event, 13, ev->driver_detail, -1, NULL); ++ rc = sqlite3_step(priv->stmt_mc_event); + if (rc != SQLITE_OK && rc != SQLITE_DONE) +- log(TERM, LOG_ERR, "Failed to do mc_event step on sqlite: error = %d\n", rc); +- rc = sqlite3_reset(priv->stmt); ++ log(TERM, LOG_ERR, ++ "Failed to do mc_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_mc_event); + if (rc != SQLITE_OK && rc != SQLITE_DONE) +- log(TERM, LOG_ERR, "Failed reset mc_event on sqlite: error = %d\n", +- rc); ++ log(TERM, LOG_ERR, ++ "Failed reset mc_event on sqlite: error = %d\n", ++ rc); + log(TERM, LOG_INFO, "register inserted at db\n"); + + return rc; +diff --git a/ras-record.h b/ras-record.h +index 20c327f..9791185 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -46,7 +46,7 @@ struct ras_aer_event { + + struct sqlite3_priv 
{ + sqlite3 *db; +- sqlite3_stmt *stmt; ++ sqlite3_stmt *stmt_mc_event; + }; + + int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras); +-- +1.7.1 + diff --git a/SOURCES/0004-ras-record-reorder-functions.patch b/SOURCES/0004-ras-record-reorder-functions.patch new file mode 100644 index 0000000..4e6f58b --- /dev/null +++ b/SOURCES/0004-ras-record-reorder-functions.patch @@ -0,0 +1,114 @@ +From 4474f696c9207ceb21d55a0047ab6871879afe5a Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab <mchehab@redhat.com> +Date: Fri, 31 May 2013 13:51:55 -0300 +Subject: [PATCH 04/32] ras-record: reorder functions + +No functional changes + +Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com> +--- + ras-record.c | 77 +++++++++++++++++++++++++++++---------------------------- + 1 files changed, 39 insertions(+), 38 deletions(-) + +diff --git a/ras-record.c b/ras-record.c +index efcd78f..298977e 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -46,6 +46,10 @@ struct db_table_descriptor { + size_t num_fields; + }; + ++/* ++ * Table and functions to handle ras:mc_event ++ */ ++ + static const struct db_fields mc_event_fields[] = { + { .name="id", .type="INTEGER PRIMARY KEY" }, + { .name="timestamp", .type="TEXT" }, +@@ -69,8 +73,41 @@ static const struct db_table_descriptor mc_event_tab = { + .num_fields = ARRAY_SIZE(mc_event_fields), + }; + +-const char *insertdb = "INSERT INTO"; +-const char *valuesdb = " VALUES "; ++int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_mc_event) ++ return 0; ++ log(TERM, LOG_INFO, "mc_event store: %p\n", priv->stmt_mc_event); ++ ++ sqlite3_bind_text(priv->stmt_mc_event, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_int (priv->stmt_mc_event, 2, ev->error_count); ++ sqlite3_bind_text(priv->stmt_mc_event, 3, ev->error_type, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mc_event, 4, ev->msg, -1, NULL); ++ 
sqlite3_bind_text(priv->stmt_mc_event, 5, ev->label, -1, NULL); ++ sqlite3_bind_int (priv->stmt_mc_event, 6, ev->mc_index); ++ sqlite3_bind_int (priv->stmt_mc_event, 7, ev->top_layer); ++ sqlite3_bind_int (priv->stmt_mc_event, 8, ev->middle_layer); ++ sqlite3_bind_int (priv->stmt_mc_event, 9, ev->lower_layer); ++ sqlite3_bind_int (priv->stmt_mc_event, 10, ev->address); ++ sqlite3_bind_int (priv->stmt_mc_event, 11, ev->grain); ++ sqlite3_bind_int (priv->stmt_mc_event, 12, ev->syndrome); ++ sqlite3_bind_text(priv->stmt_mc_event, 13, ev->driver_detail, -1, NULL); ++ rc = sqlite3_step(priv->stmt_mc_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do mc_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_mc_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset mc_event on sqlite: error = %d\n", ++ rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + + static int ras_mc_prepare_stmt(struct sqlite3_priv *priv, + sqlite3_stmt **stmt, +@@ -208,39 +245,3 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + + return 0; + } +- +-int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev) +-{ +- int rc; +- struct sqlite3_priv *priv = ras->db_priv; +- +- if (!priv || !priv->stmt_mc_event) +- return 0; +- log(TERM, LOG_INFO, "mc_event store: %p\n", priv->stmt_mc_event); +- +- sqlite3_bind_text(priv->stmt_mc_event, 1, ev->timestamp, -1, NULL); +- sqlite3_bind_int (priv->stmt_mc_event, 2, ev->error_count); +- sqlite3_bind_text(priv->stmt_mc_event, 3, ev->error_type, -1, NULL); +- sqlite3_bind_text(priv->stmt_mc_event, 4, ev->msg, -1, NULL); +- sqlite3_bind_text(priv->stmt_mc_event, 5, ev->label, -1, NULL); +- sqlite3_bind_int (priv->stmt_mc_event, 6, ev->mc_index); +- sqlite3_bind_int (priv->stmt_mc_event, 7, ev->top_layer); +- sqlite3_bind_int (priv->stmt_mc_event, 8, ev->middle_layer); +- sqlite3_bind_int (priv->stmt_mc_event, 
9, ev->lower_layer); +- sqlite3_bind_int (priv->stmt_mc_event, 10, ev->address); +- sqlite3_bind_int (priv->stmt_mc_event, 11, ev->grain); +- sqlite3_bind_int (priv->stmt_mc_event, 12, ev->syndrome); +- sqlite3_bind_text(priv->stmt_mc_event, 13, ev->driver_detail, -1, NULL); +- rc = sqlite3_step(priv->stmt_mc_event); +- if (rc != SQLITE_OK && rc != SQLITE_DONE) +- log(TERM, LOG_ERR, +- "Failed to do mc_event step on sqlite: error = %d\n", rc); +- rc = sqlite3_reset(priv->stmt_mc_event); +- if (rc != SQLITE_OK && rc != SQLITE_DONE) +- log(TERM, LOG_ERR, +- "Failed reset mc_event on sqlite: error = %d\n", +- rc); +- log(TERM, LOG_INFO, "register inserted at db\n"); +- +- return rc; +-} +-- +1.7.1 + diff --git a/SOURCES/0005-ras-record-Make-the-code-easier-to-add-support-for-o.patch b/SOURCES/0005-ras-record-Make-the-code-easier-to-add-support-for-o.patch new file mode 100644 index 0000000..ec8c437 --- /dev/null +++ b/SOURCES/0005-ras-record-Make-the-code-easier-to-add-support-for-o.patch @@ -0,0 +1,60 @@ +From 93217061a4b1dc7f287f2715aadc621d2c00425d Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab <mchehab@redhat.com> +Date: Fri, 31 May 2013 13:53:18 -0300 +Subject: [PATCH 05/32] ras-record: Make the code easier to add support for other tables + +Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com> +--- + ras-record.c | 25 ++++++++----------------- + 1 files changed, 8 insertions(+), 17 deletions(-) + +diff --git a/ras-record.c b/ras-record.c +index 298977e..36b3373 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -143,10 +143,14 @@ static int ras_mc_prepare_stmt(struct sqlite3_priv *priv, + #endif + + rc = sqlite3_prepare_v2(priv->db, sql, -1, stmt, NULL); +- if (rc != SQLITE_OK) ++ if (rc != SQLITE_OK) { + log(TERM, LOG_ERR, + "Failed to prepare insert db at table %s (db %s): error = %s\n", + db_tab->name, SQLITE_RAS_DB, sqlite3_errmsg(priv->db)); ++ stmt = NULL; ++ } else { ++ log(TERM, LOG_INFO, "Recording %s events\n", db_tab->name); ++ } + + return 
rc; + } +@@ -225,23 +229,10 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + priv->db = db; + + rc = ras_mc_create_table(priv, &mc_event_tab); +- if (rc != SQLITE_OK) { +- sqlite3_close(db); +- free(priv); +- return -1; +- } ++ if (rc == SQLITE_OK) ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mc_event, &mc_event_tab); + +- rc = ras_mc_prepare_stmt(priv, &priv->stmt_mc_event, &mc_event_tab); +- if (rc == SQLITE_OK) { +- log(TERM, LOG_INFO, +- "cpu %u: Recording events at %s\n", +- cpu, SQLITE_RAS_DB); +- ras->db_priv = priv; +- } else { +- sqlite3_close(db); +- free(priv); +- return -1; +- } + ++ ras->db_priv = priv; + return 0; + } +-- +1.7.1 + diff --git a/SOURCES/0006-Add-support-to-record-AER-events.patch b/SOURCES/0006-Add-support-to-record-AER-events.patch new file mode 100644 index 0000000..5d5ab87 --- /dev/null +++ b/SOURCES/0006-Add-support-to-record-AER-events.patch @@ -0,0 +1,141 @@ +From 11004aaa98865dd7c0ee28b4af8d6ba6b6f11507 Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab <mchehab@redhat.com> +Date: Fri, 31 May 2013 13:54:11 -0300 +Subject: [PATCH 06/32] Add support to record AER events + +Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com> +--- + ras-aer-handler.c | 4 ++- + ras-record.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++- + ras-record.h | 6 +++++ + 3 files changed, 68 insertions(+), 2 deletions(-) + +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index ec63e2a..e5abaca 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -111,7 +111,9 @@ int ras_aer_event_handler(struct trace_seq *s, + trace_seq_puts(s, ev.error_type); + + /* Insert data into the SGBD */ +-// ras_store_aer_event(ras, &ev); ++#ifdef HAVE_SQLITE3 ++ ras_store_aer_event(ras, &ev); ++#endif + + return 0; + } +diff --git a/ras-record.c b/ras-record.c +index 36b3373..cb302ce 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -26,6 +26,7 @@ + #include <unistd.h> + #include "ras-events.h" + #include "ras-mc-handler.h" 
++#include "ras-aer-handler.h" + #include "ras-logger.h" + + /* #define DEBUG_SQL 1 */ +@@ -109,6 +110,56 @@ int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev) + return rc; + } + ++/* ++ * Table and functions to handle ras:aer ++ */ ++ ++#ifdef HAVE_AER ++static const struct db_fields aer_event_fields[] = { ++ { .name="id", .type="INTEGER PRIMARY KEY" }, ++ { .name="timestamp", .type="TEXT" }, ++ { .name="err_type", .type="TEXT" }, ++ { .name="err_msg", .type="TEXT" }, ++}; ++ ++static const struct db_table_descriptor aer_event_tab = { ++ .name = "aer_event", ++ .fields = aer_event_fields, ++ .num_fields = ARRAY_SIZE(aer_event_fields), ++}; ++ ++int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_aer_event) ++ return 0; ++ log(TERM, LOG_INFO, "mc_event store: %p\n", priv->stmt_aer_event); ++ ++ sqlite3_bind_text(priv->stmt_aer_event, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_text(priv->stmt_aer_event, 3, ev->error_type, -1, NULL); ++ sqlite3_bind_text(priv->stmt_aer_event, 4, ev->msg, -1, NULL); ++ ++ rc = sqlite3_step(priv->stmt_aer_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do aer_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_aer_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset aer_event on sqlite: error = %d\n", ++ rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} ++#endif ++ ++/* ++ * Generic code ++ */ ++ + static int ras_mc_prepare_stmt(struct sqlite3_priv *priv, + sqlite3_stmt **stmt, + const struct db_table_descriptor *db_tab) +@@ -230,8 +281,15 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + + rc = ras_mc_create_table(priv, &mc_event_tab); + if (rc == SQLITE_OK) +- rc = ras_mc_prepare_stmt(priv, &priv->stmt_mc_event, &mc_event_tab); ++ rc = 
ras_mc_prepare_stmt(priv, &priv->stmt_mc_event, ++ &mc_event_tab); + ++#ifdef HAVE_AER ++ rc = ras_mc_create_table(priv, &aer_event_tab); ++ if (rc == SQLITE_OK) ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_aer_event, ++ &aer_event_tab); ++#endif + + ras->db_priv = priv; + return 0; +diff --git a/ras-record.h b/ras-record.h +index 9791185..5008906 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -47,14 +47,20 @@ struct ras_aer_event { + struct sqlite3_priv { + sqlite3 *db; + sqlite3_stmt *stmt_mc_event; ++#ifdef HAVE_AER ++ sqlite3_stmt *stmt_aer_event; ++#endif + }; + + int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras); + int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev); ++int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; + static inline int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev) { return 0; }; ++static inline int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev) { return 0; }; ++ + #endif + + #endif +-- +1.7.1 + diff --git a/SOURCES/0007-Add-support-to-store-MCE-events-at-the-database.patch b/SOURCES/0007-Add-support-to-store-MCE-events-at-the-database.patch new file mode 100644 index 0000000..8caf335 --- /dev/null +++ b/SOURCES/0007-Add-support-to-store-MCE-events-at-the-database.patch @@ -0,0 +1,202 @@ +From 0a31d938cf29e065e96de1206a7d35042962e02a Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab <mchehab@redhat.com> +Date: Fri, 31 May 2013 14:18:24 -0300 +Subject: [PATCH 07/32] Add support to store MCE events at the database + +Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com> +--- + ras-mce-handler.c | 5 +++ + ras-record.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++- + ras-record.h | 9 +++++ + 3 files changed, 116 insertions(+), 2 deletions(-) + +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 614a0eb..59e8d05 
100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -396,5 +396,10 @@ int ras_mce_event_handler(struct trace_seq *s, + return rc; + + report_mce_event(ras, record, s, &e); ++ ++#ifdef HAVE_SQLITE3 ++ ras_store_mce_record(ras, &e); ++#endif ++ + return 0; + } +diff --git a/ras-record.c b/ras-record.c +index cb302ce..daa3cb1 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -27,6 +27,7 @@ + #include "ras-events.h" + #include "ras-mc-handler.h" + #include "ras-aer-handler.h" ++#include "ras-mce-handler.h" + #include "ras-logger.h" + + /* #define DEBUG_SQL 1 */ +@@ -135,7 +136,7 @@ int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev) + + if (!priv || !priv->stmt_aer_event) + return 0; +- log(TERM, LOG_INFO, "mc_event store: %p\n", priv->stmt_aer_event); ++ log(TERM, LOG_INFO, "aer_event store: %p\n", priv->stmt_aer_event); + + sqlite3_bind_text(priv->stmt_aer_event, 1, ev->timestamp, -1, NULL); + sqlite3_bind_text(priv->stmt_aer_event, 3, ev->error_type, -1, NULL); +@@ -156,6 +157,98 @@ int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev) + } + #endif + ++ ++/* ++ * Table and functions to handle mce:mce_record ++ */ ++ ++#ifdef HAVE_MCE ++static const struct db_fields mce_record_fields[] = { ++ { .name="id", .type="INTEGER PRIMARY KEY" }, ++ { .name="timestamp", .type="TEXT" }, ++ ++ /* MCE registers */ ++ { .name="mcgcap", .type="INTEGER" }, ++ { .name="mcgstatus", .type="INTEGER" }, ++ { .name="status", .type="INTEGER" }, ++ { .name="addr", .type="INTEGER" }, // 5 ++ { .name="misc", .type="INTEGER" }, ++ { .name="ip", .type="INTEGER" }, ++ { .name="tsc", .type="INTEGER" }, ++ { .name="walltime", .type="INTEGER" }, ++ { .name="cpu", .type="INTEGER" }, // 10 ++ { .name="cpuid", .type="INTEGER" }, ++ { .name="apicid", .type="INTEGER" }, ++ { .name="socketid", .type="INTEGER" }, ++ { .name="cs", .type="INTEGER" }, ++ { .name="bank", .type="INTEGER" }, //15 ++ { .name="cpuvendor", .type="INTEGER" }, ++ ++ /* Parsed data 
- will likely change */ ++ { .name="bank_name", .type="TEXT" }, ++ { .name="error_msg", .type="TEXT" }, ++ { .name="mcgstatus_msg", .type="TEXT" }, ++ { .name="mcistatus_msg", .type="TEXT" }, // 20 ++ { .name="user_action", .type="TEXT" }, ++ { .name="mc_location", .type="TEXT" }, ++}; ++ ++static const struct db_table_descriptor mce_record_tab = { ++ .name = "mce_record", ++ .fields = mce_record_fields, ++ .num_fields = ARRAY_SIZE(mce_record_fields), ++}; ++ ++int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_mce_record) ++ return 0; ++ log(TERM, LOG_INFO, "mce_record store: %p\n", priv->stmt_mce_record); ++ ++ sqlite3_bind_text(priv->stmt_mce_record, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_int (priv->stmt_mce_record, 2, ev->mcgcap); ++ sqlite3_bind_int (priv->stmt_mce_record, 3, ev->mcgstatus); ++ sqlite3_bind_int (priv->stmt_mce_record, 4, ev->status); ++ sqlite3_bind_int (priv->stmt_mce_record, 5, ev->addr); ++ sqlite3_bind_int (priv->stmt_mce_record, 6, ev->misc); ++ sqlite3_bind_int (priv->stmt_mce_record, 7, ev->ip); ++ sqlite3_bind_int (priv->stmt_mce_record, 8, ev->tsc); ++ sqlite3_bind_int (priv->stmt_mce_record, 9, ev->walltime); ++ sqlite3_bind_int (priv->stmt_mce_record, 10, ev->cpu); ++ sqlite3_bind_int (priv->stmt_mce_record, 11, ev->cpuid); ++ sqlite3_bind_int (priv->stmt_mce_record, 12, ev->apicid); ++ sqlite3_bind_int (priv->stmt_mce_record, 13, ev->socketid); ++ sqlite3_bind_int (priv->stmt_mce_record, 14, ev->cs); ++ sqlite3_bind_int (priv->stmt_mce_record, 15, ev->bank); ++ sqlite3_bind_int (priv->stmt_mce_record, 16, ev->cpuvendor); ++ ++ sqlite3_bind_text(priv->stmt_mce_record, 17, ev->bank_name, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 18, ev->error_msg, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 19, ev->mcgstatus_msg, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 20, ev->mcistatus_msg, -1, 
NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcastatus_msg, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 22, ev->user_action, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 23, ev->mc_location, -1, NULL); ++ ++ rc = sqlite3_step(priv->stmt_mce_record); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do mce_record step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_mce_record); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset mce_record on sqlite: error = %d\n", ++ rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} ++#endif ++ ++ + /* + * Generic code + */ +@@ -291,6 +384,13 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + &aer_event_tab); + #endif + +- ras->db_priv = priv; ++#ifdef HAVE_MCE ++ rc = ras_mc_create_table(priv, &mce_record_tab); ++ if (rc == SQLITE_OK) ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mce_record, ++ &mce_record_tab); ++#endif ++ ++ ras->db_priv = priv; + return 0; + } +diff --git a/ras-record.h b/ras-record.h +index 5008906..6f146a8 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -40,6 +40,10 @@ struct ras_aer_event { + const char *msg; + }; + ++struct ras_mc_event; ++struct ras_aer_event; ++struct mce_event; ++ + #ifdef HAVE_SQLITE3 + + #include <sqlite3.h> +@@ -50,16 +54,21 @@ struct sqlite3_priv { + #ifdef HAVE_AER + sqlite3_stmt *stmt_aer_event; + #endif ++#ifdef HAVE_MCE ++ sqlite3_stmt *stmt_mce_record; ++#endif + }; + + int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras); + int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev); + int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev); ++int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; + static inline int ras_store_mc_event(struct ras_events *ras, 
# Print a per-category summary of the events stored in the RAS database:
# memory controller events (mc_event), PCIe AER events (aer_event) and
# machine check records (mce_record).  For each category either the
# grouped counts or a "No ... errors." line is printed to STDOUT.
#
# Dies with an explicit message when the database cannot be opened or a
# query cannot be prepared/executed, instead of failing later with
# "Can't call method ... on an undefined value".
sub summary
{
    require DBI;

    my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {})
        or die ("Error: can't open RAS database \"$dbname\"\n");

    # Run one query and return its rows as a list of array references.
    my $fetch_rows = sub {
        my ($query) = @_;
        my @rows;

        my $query_handle = $dbh->prepare($query)
            or die ("Error: can't prepare query on \"$dbname\": $query\n");
        $query_handle->execute()
            or die ("Error: can't execute query on \"$dbname\": $query\n");
        while (my @row = $query_handle->fetchrow_array()) {
            push @rows, [@row];
        }
        $query_handle->finish;

        return @rows;
    };

    my $out;

    # Memory controller mc_event errors
    $out = "";
    for my $row ($fetch_rows->("select err_type, label, mc, top_layer,middle_layer,lower_layer, count(*) from mc_event group by err_type, label, mc, top_layer, middle_layer, lower_layer")) {
        my ($err_type, $label, $mc, $top, $mid, $low, $count) = @$row;
        $out .= "\t$err_type on DIMM Label(s): '$label' location: $mc:$top:$mid:$low errors: $count\n";
    }
    if ($out ne "") {
        print "Memory controller events summary:\n$out\n";
    } else {
        print "No Memory errors.\n\n";
    }

    # PCIe AER aer_event errors
    $out = "";
    for my $row ($fetch_rows->("select err_type, err_msg, count(*) from aer_event group by err_type, err_msg")) {
        my ($err_type, $msg, $count) = @$row;
        $out .= "\t$count $err_type errors: $msg\n";
    }
    if ($out ne "") {
        print "PCIe AER events summary:\n$out\n";
    } else {
        print "No PCIe AER errors.\n\n";
    }

    # MCE mce_record errors
    $out = "";
    for my $row ($fetch_rows->("select error_msg, count(*) from mce_record group by error_msg")) {
        my ($msg, $count) = @$row;
        $out .= "\t$count $msg errors\n";
    }
    if ($out ne "") {
        print "MCE records summary:\n$out";
    } else {
        print "No MCE errors.\n";
    }

    undef($dbh);
}
# Print every event stored in the RAS database, one line per event,
# grouped by category: memory controller events (mc_event), PCIe AER
# events (aer_event) and machine check records (mce_record).  A category
# without rows prints a "No ... errors." line instead.
#
# Each heading names the table that was actually queried for it.
#
# Dies with an explicit message when the database cannot be opened or a
# query cannot be prepared/executed, instead of failing later with
# "Can't call method ... on an undefined value".
sub errors
{
    require DBI;

    my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {})
        or die ("Error: can't open RAS database \"$dbname\"\n");

    # Run one query and return its rows as a list of array references.
    my $fetch_rows = sub {
        my ($query) = @_;
        my @rows;

        my $query_handle = $dbh->prepare($query)
            or die ("Error: can't prepare query on \"$dbname\": $query\n");
        $query_handle->execute()
            or die ("Error: can't execute query on \"$dbname\": $query\n");
        while (my @row = $query_handle->fetchrow_array()) {
            push @rows, [@row];
        }
        $query_handle->finish;

        return @rows;
    };

    my $out;

    # Memory controller mc_event errors
    $out = "";
    for my $row ($fetch_rows->("select id, timestamp, err_count, err_type, err_msg, label, mc, top_layer,middle_layer,lower_layer, address, grain, syndrome, driver_detail from mc_event order by id")) {
        my ($id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail) = @$row;
        $out .= "$id $time $count $type error(s): $msg at $label location: $mc:$top:$mid:$low, addr $addr, grain $grain, syndrome $syndrome $detail\n";
    }
    if ($out ne "") {
        print "Memory controller events:\n$out\n";
    } else {
        print "No Memory errors.\n\n";
    }

    # PCIe AER aer_event errors
    $out = "";
    for my $row ($fetch_rows->("select id, timestamp, err_type, err_msg from aer_event order by id")) {
        my ($id, $time, $type, $msg) = @$row;
        $out .= "$id $time $type error: $msg\n";
    }
    if ($out ne "") {
        print "PCIe AER events:\n$out\n";
    } else {
        print "No PCIe AER errors.\n\n";
    }

    # MCE mce_record errors
    $out = "";
    for my $row ($fetch_rows->("select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id")) {
        my ($id, $time, $mcgcap, $mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location) = @$row;

        # Decoded text first; empty/zero/NULL columns are left out.
        $out .= "$id $time error: $msg";
        $out .= ", CPU $cpuvendor" if ($cpuvendor);
        $out .= ", bank $bank_name" if ($bank_name);
        $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
        $out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
        $out .= ", $mc_location" if ($mc_location);
        $out .= ", $user_action" if ($user_action);

        # Then the raw register values, in a fixed order, as hex.
        my @raw = (
            mcgcap   => $mcgcap,   mcgstatus => $mcgstatus,
            status   => $status,   addr      => $addr,
            misc     => $misc,     ip        => $ip,
            tsc      => $tsc,      walltime  => $walltime,
            cpu      => $cpu,      cpuid     => $cpuid,
            apicid   => $apicid,   socketid  => $socketid,
            cs       => $cs,       bank      => $bank,
        );
        while (my ($name, $value) = splice(@raw, 0, 2)) {
            $out .= sprintf ", %s=0x%08x", $name, $value if ($value);
        }

        $out .= "\n";
    }
    if ($out ne "") {
        print "MCE events:\n$out\n";
    } else {
        print "No MCE errors.\n\n";
    }

    undef($dbh);
}
+ if ($out ne "") { +- print "MCE events:\n$out\n"; ++ print "PCIe AER events:\n$out\n"; + } else { +- print "No MCE errors.\n\n"; ++ print "No PCIe AER errors.\n\n"; + } + $query_handle->finish; + +@@ -952,9 +952,9 @@ sub errors + $out .= "\n"; + } + if ($out ne "") { +- print "Memory controller events:\n$out\n"; ++ print "MCE events:\n$out\n"; + } else { +- print "No Memory errors.\n\n"; ++ print "No MCE errors.\n\n"; + } + $query_handle->finish; + +-- +1.7.1 + diff --git a/SOURCES/0013-ras-mc-ctl-Improve-parser.patch b/SOURCES/0013-ras-mc-ctl-Improve-parser.patch new file mode 100644 index 0000000..7900f18 --- /dev/null +++ b/SOURCES/0013-ras-mc-ctl-Improve-parser.patch @@ -0,0 +1,36 @@ +From 099af4056912faa28bf1385fffa77e7bbb468b93 Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab <m.chehab@samsung.com> +Date: Thu, 15 Aug 2013 12:43:02 -0300 +Subject: [PATCH 13/32] ras-mc-ctl: Improve parser + +Accept either . or : as layers separator at config files. + +Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com> +--- + util/ras-mc-ctl.in | 4 ++-- + 1 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 48d9b00..f5a8ce5 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -481,14 +481,14 @@ sub parse_dimm_labels_file + + next unless (my ($label, $info) = ($str =~ /^(.*)\s*:\s*(.*)$/i)); + +- unless ($info =~ /\d+(?:\.\d+)*/) { ++ unless ($info =~ /\d+(?:[\.\:]\d+)*/) { + log_error ("$file: $line: Invalid syntax, ignoring: \"$_\"\n"); + next; + } + + for my $target (split (/[, ]+/, $info)) { + my $n; +- my ($mc, $top, $mid, $low, $extra) = ($target =~ /(\d+)(?:\.(\d+)){0,1}(?:\.(\d+)){0,1}(?:\.(\d+)){0,1}(?:\.(\d+)){0,1}/); ++ my ($mc, $top, $mid, $low, $extra) = ($target =~ /(\d+)(?:[\.\:](\d+)){0,1}(?:[\.\:](\d+)){0,1}(?:[\.\:](\d+)){0,1}(?:[\.\:](\d+)){0,1}/); + + if (defined($extra)) { + die ("Error: Only up to 3 layers are currently supported on label db \"$file\"\n"); +-- +1.7.1 + diff 
--git a/SOURCES/0014-ras-mc-ctl-Fix-label-register-with-2-layers.patch b/SOURCES/0014-ras-mc-ctl-Fix-label-register-with-2-layers.patch new file mode 100644 index 0000000..6274324 --- /dev/null +++ b/SOURCES/0014-ras-mc-ctl-Fix-label-register-with-2-layers.patch @@ -0,0 +1,77 @@ +From 0d53728f9cbdca5a1bd32c51a121dd1162f50e95 Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab <m.chehab@samsung.com> +Date: Thu, 15 Aug 2013 12:45:18 -0300 +Subject: [PATCH 14/32] ras-mc-ctl: Fix label register with 2 layers + +When there aren't 3 layers, label print/register weren't working. + +Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com> +--- + util/ras-mc-ctl.in | 19 +++++++++++++------ + 1 files changed, 13 insertions(+), 6 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index f5a8ce5..a7137be 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -508,7 +508,6 @@ sub parse_dimm_labels_file + } + map { $lh->{$vendor}{lc $_}{$mc}{$top}{$mid}{$low} = $label } + @models; +- $n = 3; + } + if (!$num) { + $num = $n; +@@ -542,9 +541,13 @@ sub parse_dimm_labels + + sub read_dimm_label + { +- my ($mc, $top, $mid, $low) = @_; ++ my ($num_layers, $mc, $top, $mid, $low) = @_; + my $sysfs = "/sys/devices/system/edac/mc"; +- my $pos = "$mc:$top:$mid:$low"; ++ my $pos; ++ ++ $pos = "$mc:$top:$mid:$low" if ($num_layers == 3); ++ $pos = "$mc:$top:$mid" if ($num_layers == 2); ++ $pos = "$mc:$top" if ($num_layers == 1); + + if (!defined($dimm_node{$pos})) { + my $label = "$pos missing"; +@@ -574,10 +577,14 @@ sub read_dimm_label + + sub get_dimm_label_node + { +- my ($mc, $top, $mid, $low) = @_; ++ my ($num_layers, $mc, $top, $mid, $low) = @_; + my $sysfs = "/sys/devices/system/edac/mc"; + my $pos = "$mc:$top:$mid:$low"; + ++ $pos = "$mc:$top:$mid:$low" if ($num_layers == 3); ++ $pos = "$mc:$top:$mid" if ($num_layers == 2); ++ $pos = "$mc:$top" if ($num_layers == 1); ++ + return "" if (!defined($dimm_node{$pos})); + + my $dimm = $dimm_node{$pos}; +@@ 
-611,7 +618,7 @@ sub print_dimm_labels + for my $mid (sort keys %{$$lref{$vendor}{$model}{$mc}{$top}}) { + for my $low (sort keys %{$$lref{$vendor}{$model}{$mc}{$top}{$mid}}) { + my $label = $$lref{$vendor}{$model}{$mc}{$top}{$mid}{$low}; +- my ($rlabel,$loc) = read_dimm_label ($mc, $top, $mid, $low); ++ my ($rlabel,$loc) = read_dimm_label ($$num_layers{$vendor}{$model}, $mc, $top, $mid, $low); + + printf $fh $format, $loc, $label, $rlabel; + } +@@ -645,7 +652,7 @@ sub register_dimm_labels + for my $mid (sort keys %{$$lref{$vendor}{$model}{$mc}{$top}}) { + for my $low (sort keys %{$$lref{$vendor}{$model}{$mc}{$top}{$mid}}) { + +- my $file = get_dimm_label_node($mc, $top, $mid, $low); ++ my $file = get_dimm_label_node($$num_layers{$vendor}{$model}, $mc, $top, $mid, $low); + + # Ignore sysfs files that don't exist. Might just be + # unpopulated bank. +-- +1.7.1 + diff --git a/SOURCES/0015-Add-an-example-of-labels-file.patch b/SOURCES/0015-Add-an-example-of-labels-file.patch new file mode 100644 index 0000000..f7e64b7 --- /dev/null +++ b/SOURCES/0015-Add-an-example-of-labels-file.patch @@ -0,0 +1,44 @@ +From 74d84ba18f4f1d7097b47ce1c2e41e332d197dfb Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab <m.chehab@samsung.com> +Date: Thu, 15 Aug 2013 12:58:02 -0300 +Subject: [PATCH 15/32] Add an example of labels file + +This is an example of a labels file for a Dell Power Edge T620. + +For now, only DIMMs A1 and B1 are tested here. + +Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com> +--- + labels/dell | 20 ++++++++++++++++++++ + 1 files changed, 20 insertions(+), 0 deletions(-) + create mode 100644 labels/dell + +diff --git a/labels/dell b/labels/dell +new file mode 100644 +index 0000000..e1a09a7 +--- /dev/null ++++ b/labels/dell +@@ -0,0 +1,20 @@ ++# RASDAEMON Motherboard DIMM labels Database file. ++# ++# Vendor-name and model-name are found from the program 'dmidecode' ++# labels are found from the silk screen on the motherboard. 
++# ++#Vendor: <vendor-name> ++# Model: <model-name> ++# <label>: <mc>.<top>.<mid>.<low> ++# ++ ++Vendor: Dell Inc. ++ ++ Model: 0F5XM3 ++ DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; DIMM_A3: 0.0.2; DIMM_A4: 0.0.3; ++ DIMM_A5: 0.1.0; DIMM_A6: 0.1.1; DIMM_A7: 0.1.2; DIMM_A8: 0.1.3; ++ DIMM_A9: 0.2.0; DIMM_A10: 0.2.1; DIMM_A11: 0.2.2; DIMM_A12: 0.2.3; ++ ++ DIMM_B1: 1.0.0; DIMM_B2: 1.0.1; DIMM_B3: 1.0.2; DIMM_B4: 1.0.3; ++ DIMM_B5: 1.1.0; DIMM_B6: 1.1.1; DIMM_B7: 1.1.2; DIMM_B8: 1.1.3; ++ DIMM_B9: 1.2.0; DIMM_B10: 1.2.1; DIMM_B11: 1.2.2; DIMM_B12: 1.2.3; +-- +1.7.1 + diff --git a/SOURCES/0017-ras-mc-ctl-Fix-the-DIMM-layout-display.patch b/SOURCES/0017-ras-mc-ctl-Fix-the-DIMM-layout-display.patch new file mode 100644 index 0000000..b9a57d3 --- /dev/null +++ b/SOURCES/0017-ras-mc-ctl-Fix-the-DIMM-layout-display.patch @@ -0,0 +1,76 @@ +From b8bb2ed4a751516d32373e478e5c9ea9f16b524d Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab <m.chehab@samsung.com> +Date: Thu, 15 Aug 2013 17:13:43 -0300 +Subject: [PATCH 17/32] ras-mc-ctl: Fix the DIMM layout display + +The items weren't being presented at the right order. Fix it. 
# Entry point for rendering the DIMM layout: walks every index of the
# first layer (0 .. $max_pos[0]), stores it in slot 0 of the position
# list and appends whatever dimm_display_layer_rev() returns for it.
# Returns the concatenated text (undef when nothing was appended).
sub dimm_display_layer(@)
{
    my @location = @_;
    my $text;

    foreach my $idx (0 .. $max_pos[0]) {
        $location[0] = $idx;
        $text .= dimm_display_layer_rev(0, @location);
    }

    return $text;
}
SELinux policies, this popped up. ras-mc-ctl inherited a +modprobe lookup that ends up never being used. This patch gets rid of +it. + +Signed-off-by: Aristeu Rozanski <arozansk@redhat.com> +--- + util/ras-mc-ctl.in | 1 - + 1 files changed, 0 insertions(+), 1 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 196a643..ef0d9bc 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -39,7 +39,6 @@ my $dbname = "@RASSTATEDIR@/@RAS_DB_FNAME@"; + my $prefix = "@prefix@"; + my $sysconfdir = "@sysconfdir@"; + my $dmidecode = find_prog ("dmidecode"); +-my $modprobe = find_prog ("modprobe") or exit (1); + + my %conf = (); + my %bus = (); +-- +1.7.1 + diff --git a/SOURCES/0022-mce-amd-k8.c-fix-a-warning.patch b/SOURCES/0022-mce-amd-k8.c-fix-a-warning.patch new file mode 100644 index 0000000..7ac6e58 --- /dev/null +++ b/SOURCES/0022-mce-amd-k8.c-fix-a-warning.patch @@ -0,0 +1,42 @@ +From 78465e5047b226011c1a4c916c79c63fb6e68f71 Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab <m.chehab@samsung.com> +Date: Fri, 14 Feb 2014 05:11:26 +0900 +Subject: [PATCH 22/32] mce-amd-k8.c: fix a warning +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +mce-amd-k8.c: In function ‘bank_name’: +mce-amd-k8.c:250:22: warning: argument to ‘sizeof’ in ‘snprintf’ call is the same expression as the destination; did you mean to provide an explicit length? 
[-Wsizeof-pointer-memaccess] + snprintf(buf, sizeof(buf), "%s (bank=%d)", s, e->bank); + ^ + +Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com> +--- + mce-amd-k8.c | 3 +-- + 1 files changed, 1 insertions(+), 2 deletions(-) + +diff --git a/mce-amd-k8.c b/mce-amd-k8.c +index 5e21b55..8179f74 100644 +--- a/mce-amd-k8.c ++++ b/mce-amd-k8.c +@@ -236,7 +236,6 @@ static void decode_k8_threashold(struct mce_event *e) + + static void bank_name(struct mce_event *e) + { +- char *buf = e->bank_name; + const char *s; + + if (e->bank < ARRAY_SIZE(k8bank)) +@@ -247,7 +246,7 @@ static void bank_name(struct mce_event *e) + else + return; /* Use the generic parser for bank */ + +- snprintf(buf, sizeof(buf), "%s (bank=%d)", s, e->bank); ++ mce_snprintf(e->bank_name, "%s (bank=%d)", s, e->bank); + } + + int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e) +-- +1.7.1 + diff --git a/SOURCES/0023-add-abrt-suppport-for-rasdaemon.patch b/SOURCES/0023-add-abrt-suppport-for-rasdaemon.patch new file mode 100644 index 0000000..e5ea589 --- /dev/null +++ b/SOURCES/0023-add-abrt-suppport-for-rasdaemon.patch @@ -0,0 +1,641 @@ +From c6ed1e1af9356cdce1eaa652061dd6e4eb32d283 Mon Sep 17 00:00:00 2001 +From: Junliang Li <lijunliang.dna@gmail.com> +Date: Thu, 13 Feb 2014 10:39:53 +0800 +Subject: [PATCH 23/32] add abrt suppport for rasdaemon + +Adds abrt as another error mechanism for the rasdaemon. +This patch does: + +1) read ras event (mc,mce and aer) + +2) setup a abrt-server unix socket + +3) write messages follow ABRT server protocol, set event + info into backtrace zone. + +4) commit report. + +For now, it depends on ABRT to limit flood reports. 
+ +Signed-off-by: Junliang Li <lijunliang.dna@gmail.com> +Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com> +--- + Makefile.am | 5 +- + configure.ac | 9 + + ras-aer-handler.c | 6 + + ras-events.h | 3 + + ras-mc-handler.c | 7 + + ras-mce-handler.c | 6 + + ras-report.c | 429 +++++++++++++++++++++++++++++++++++++++++++++++++++++ + ras-report.h | 39 +++++ + 8 files changed, 503 insertions(+), 1 deletions(-) + create mode 100644 ras-report.c + create mode 100644 ras-report.h + +diff --git a/Makefile.am b/Makefile.am +index 473ce98..c1668b4 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -17,10 +17,13 @@ if WITH_MCE + mce-intel-dunnington.c mce-intel-tulsa.c \ + mce-intel-sb.c mce-intel-ivb.c + endif ++if WITH_ABRT_REPORT ++ rasdaemon_SOURCES += ras-report.c ++endif + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a + + include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ +- ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ++ ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 4fe6ef2..0ea962e 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -53,6 +53,15 @@ AS_IF([test "x$enable_mce" = "xyes"], [ + ]) + AM_CONDITIONAL([WITH_MCE], [test x$enable_mce = xyes]) + ++AC_ARG_ENABLE([abrt_report], ++ AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)])) ++ ++AS_IF([test "x$enable_abrt_report" = "xyes"], [ ++ AC_DEFINE(HAVE_ABRT_REPORT,1,"have report event to ABRT") ++ AC_SUBST([WITH_ABRT_REPORT]) ++]) ++AM_CONDITIONAL([WITH_ABRT_REPORT], [test x$enable_abrt_report = xyes]) ++ + test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc + + CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes" +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index e5abaca..50526af 100644 +--- 
a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -24,6 +24,7 @@ + #include "ras-record.h" + #include "ras-logger.h" + #include "bitfield.h" ++#include "ras-report.h" + + static const char *aer_errors[32] = { + /* Correctable errors */ +@@ -115,5 +116,10 @@ int ras_aer_event_handler(struct trace_seq *s, + ras_store_aer_event(ras, &ev); + #endif + ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_aer_event(ras, &ev); ++#endif ++ + return 0; + } +diff --git a/ras-events.h b/ras-events.h +index 554a95e..64e045a 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -47,6 +47,9 @@ struct ras_events { + + /* For the mce handler */ + struct mce_priv *mce_priv; ++ ++ /* For ABRT socket*/ ++ int socketfd; + }; + + struct pthread_data { +diff --git a/ras-mc-handler.c b/ras-mc-handler.c +index 5c24f65..ffb3805 100644 +--- a/ras-mc-handler.c ++++ b/ras-mc-handler.c +@@ -23,6 +23,7 @@ + #include "ras-mc-handler.h" + #include "ras-record.h" + #include "ras-logger.h" ++#include "ras-report.h" + + int ras_mc_event_handler(struct trace_seq *s, + struct pevent_record *record, +@@ -189,6 +190,12 @@ int ras_mc_event_handler(struct trace_seq *s, + /* Insert data into the SGBD */ + + ras_store_mc_event(ras, &ev); ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_mc_event(ras, &ev); ++#endif ++ + return 0; + + parse_error: +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 59e8d05..1431049 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -26,6 +26,7 @@ + #include "ras-mce-handler.h" + #include "ras-record.h" + #include "ras-logger.h" ++#include "ras-report.h" + + /* + * The code below were adapted from Andi Kleen/Intel/SuSe mcelog code, +@@ -401,5 +402,10 @@ int ras_mce_event_handler(struct trace_seq *s, + ras_store_mce_record(ras, &e); + #endif + ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_mce_event(ras, &e); ++#endif ++ + return 0; + } +diff --git a/ras-report.c b/ras-report.c +new file mode 100644 
+index 0000000..d3e4a79 +--- /dev/null ++++ b/ras-report.c +@@ -0,0 +1,429 @@ ++#include <stdio.h> ++#include <string.h> ++#include <unistd.h> ++#include <sys/types.h> ++#include <sys/utsname.h> ++#include <sys/socket.h> ++#include <sys/un.h> ++ ++#include "ras-report.h" ++ ++static int setup_report_socket(void){ ++ int sockfd = -1; ++ int rc = -1; ++ struct sockaddr_un addr; ++ ++ sockfd = socket(AF_UNIX, SOCK_STREAM, 0); ++ if (sockfd < 0){ ++ return -1; ++ } ++ ++ memset(&addr, 0, sizeof(struct sockaddr_un)); ++ addr.sun_family = AF_UNIX; ++ strncpy(addr.sun_path, ABRT_SOCKET, strlen(ABRT_SOCKET)); ++ ++ rc = connect(sockfd, (struct sockaddr *)&addr, sizeof(struct sockaddr_un)); ++ if (rc < 0){ ++ return -1; ++ } ++ ++ return sockfd; ++} ++ ++static int commit_report_basic(int sockfd){ ++ char buf[INPUT_BUFFER_SIZE]; ++ struct utsname un; ++ int rc = -1; ++ ++ if(sockfd < 0){ ++ return rc; ++ } ++ ++ memset(buf, 0, INPUT_BUFFER_SIZE); ++ memset(&un, 0, sizeof(struct utsname)); ++ ++ rc = uname(&un); ++ if(rc < 0){ ++ return rc; ++ } ++ ++ /* ++ * ABRT server protocol ++ */ ++ sprintf(buf, "PUT / HTTP/1.1\r\n\r\n"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if(rc < strlen(buf) + 1){ ++ return -1; ++ } ++ ++ sprintf(buf, "PID=%d", (int)getpid()); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if(rc < strlen(buf) + 1){ ++ return -1; ++ } ++ ++ sprintf(buf, "EXECUTABLE=/boot/vmlinuz-%s", un.release); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if(rc < strlen(buf) + 1){ ++ return -1; ++ } ++ ++ sprintf(buf, "BASENAME=%s", "rasdaemon"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if(rc < strlen(buf) + 1){ ++ return -1; ++ } ++ ++ return 0; ++} ++ ++/* ++ * add "DONE" string to finish message. 
++ */ ++static int commit_report_done(int sockfd){ ++ int rc = -1; ++ ++ if(sockfd < 0){ ++ return -1; ++ } ++ ++ rc = write(sockfd, "DONE\0", strlen("DONE\0")); ++ if(rc < strlen("DONE\0")){ ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int set_mc_event_backtrace(char *buf, struct ras_mc_event *ev){ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if(!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE= " \ ++ "timestamp=%s\n" \ ++ "error_count=%d\n" \ ++ "error_type=%s\n" \ ++ "msg=%s\n" \ ++ "label=%s\n" \ ++ "mc_index=%c\n" \ ++ "top_layer=%c\n" \ ++ "middle_layer=%c\n" \ ++ "lower_layer=%c\n" \ ++ "address=%llu\n" \ ++ "grain=%llu\n" \ ++ "syndrome=%llu\n" \ ++ "driver_detail=%s\n", \ ++ ev->timestamp, \ ++ ev->error_count, \ ++ ev->error_type, \ ++ ev->msg, \ ++ ev->label, \ ++ ev->mc_index, \ ++ ev->top_layer, \ ++ ev->middle_layer, \ ++ ev->lower_layer, \ ++ ev->address, \ ++ ev->grain, \ ++ ev->syndrome, \ ++ ev->driver_detail); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ ++static int set_mce_event_backtrace(char *buf, struct mce_event *ev){ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if(!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "bank_name=%s\n" \ ++ "error_msg=%s\n" \ ++ "mcgstatus_msg=%s\n" \ ++ "mcistatus_msg=%s\n" \ ++ "mcastatus_msg=%s\n" \ ++ "user_action=%s\n" \ ++ "mc_location=%s\n" \ ++ "mcgcap=%lu\n" \ ++ "mcgstatus=%lu\n" \ ++ "status=%lu\n" \ ++ "addr=%lu\n" \ ++ "misc=%lu\n" \ ++ "ip=%lu\n" \ ++ "tsc=%lu\n" \ ++ "walltime=%lu\n" \ ++ "cpu=%u\n" \ ++ "cpuid=%u\n" \ ++ "apicid=%u\n" \ ++ "socketid=%u\n" \ ++ "cs=%d\n" \ ++ "bank=%d\n" \ ++ "cpuvendor=%d\n", \ ++ ev->timestamp, \ ++ ev->bank_name, \ ++ ev->error_msg, \ ++ ev->mcgstatus_msg, \ ++ ev->mcistatus_msg, \ ++ ev->mcastatus_msg, \ ++ ev->user_action, \ ++ ev->mc_location, \ ++ ev->mcgcap, \ ++ ev->mcgstatus, \ ++ ev->status, \ ++ ev->addr, \ ++ ev->misc, \ ++ ev->ip, \ ++ ev->tsc, \ ++ ev->walltime, \ ++ ev->cpu, \ ++ ev->cpuid, \ 
++ ev->apicid, \ ++ ev->socketid, \ ++ ev->cs, \ ++ ev->bank, \ ++ ev->cpuvendor); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ ++static int set_aer_event_backtrace(char *buf, struct ras_aer_event *ev){ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if(!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "error_type=%s\n" \ ++ "dev_name=%s\n" \ ++ "msg=%s\n", \ ++ ev->timestamp, \ ++ ev->error_type, \ ++ ev->dev_name, \ ++ ev->msg); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ ++static int commit_report_backtrace(int sockfd, int type, void *ev){ ++ char buf[MAX_BACKTRACE_SIZE]; ++ char *pbuf = buf; ++ int rc = -1; ++ int buf_len = 0; ++ ++ if(sockfd < 0 || !ev){ ++ return -1; ++ } ++ ++ memset(buf, 0, MAX_BACKTRACE_SIZE); ++ ++ switch(type){ ++ case MC_EVENT: ++ rc = set_mc_event_backtrace(buf, (struct ras_mc_event *)ev); ++ break; ++ case AER_EVENT: ++ rc = set_aer_event_backtrace(buf, (struct ras_aer_event *)ev); ++ break; ++ case MCE_EVENT: ++ rc = set_mce_event_backtrace(buf, (struct mce_event *)ev); ++ break; ++ default: ++ return -1; ++ } ++ ++ if(rc < 0){ ++ return -1; ++ } ++ ++ buf_len = strlen(buf); ++ ++ for(;buf_len > INPUT_BUFFER_SIZE - 1; buf_len -= (INPUT_BUFFER_SIZE - 1)){ ++ rc = write(sockfd, pbuf, INPUT_BUFFER_SIZE - 1); ++ if(rc < INPUT_BUFFER_SIZE - 1){ ++ return -1; ++ } ++ ++ pbuf = pbuf + INPUT_BUFFER_SIZE - 1; ++ } ++ ++ rc = write(sockfd, pbuf, buf_len + 1); ++ if(rc < buf_len){ ++ return -1; ++ } ++ ++ return 0; ++} ++ ++int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev){ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = -1; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if(sockfd < 0){ ++ return -1; ++ } ++ ++ rc = commit_report_basic(sockfd); ++ if(rc < 0){ ++ goto mc_fail; ++ } ++ ++ rc = commit_report_backtrace(sockfd, MC_EVENT, ev); ++ if(rc < 0){ ++ goto mc_fail; ++ } ++ ++ sprintf(buf, "ANALYZER=%s", 
"rasdaemon-mc"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if(rc < strlen(buf) + 1){ ++ goto mc_fail; ++ } ++ ++ sprintf(buf, "REASON=%s", "EDAC driver report problem"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if(rc < strlen(buf) + 1){ ++ goto mc_fail; ++ } ++ ++ rc = commit_report_done(sockfd); ++ if(rc < 0){ ++ goto mc_fail; ++ } ++ ++ done = 1; ++ ++mc_fail: ++ ++ if(sockfd > 0){ ++ close(sockfd); ++ } ++ ++ if(done){ ++ return 0; ++ }else{ ++ return -1; ++ } ++} ++ ++int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev){ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if(sockfd < 0){ ++ return -1; ++ } ++ ++ rc = commit_report_basic(sockfd); ++ if(rc < 0){ ++ goto aer_fail; ++ } ++ ++ rc = commit_report_backtrace(sockfd, AER_EVENT, ev); ++ if(rc < 0){ ++ goto aer_fail; ++ } ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-aer"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if(rc < strlen(buf) + 1){ ++ goto aer_fail; ++ } ++ ++ sprintf(buf, "REASON=%s", "PCIe AER driver report problem"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if(rc < strlen(buf) + 1){ ++ goto aer_fail; ++ } ++ ++ rc = commit_report_done(sockfd); ++ if(rc < 0){ ++ goto aer_fail; ++ } ++ ++ done = 1; ++ ++aer_fail: ++ ++ if(sockfd > 0){ ++ close(sockfd); ++ } ++ ++ if(done){ ++ return 0; ++ }else{ ++ return -1; ++ } ++} ++ ++int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if(sockfd < 0){ ++ return -1; ++ } ++ ++ rc = commit_report_basic(sockfd); ++ if(rc < 0){ ++ goto mce_fail; ++ } ++ ++ rc = commit_report_backtrace(sockfd, MCE_EVENT, ev); ++ if(rc < 0){ ++ goto mce_fail; ++ } ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-mce"); ++ rc = write(sockfd, buf, strlen(buf) 
+ 1); ++ if(rc < strlen(buf) + 1){ ++ goto mce_fail; ++ } ++ ++ sprintf(buf, "REASON=%s", "Machine Check driver report problem"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if(rc < strlen(buf) + 1){ ++ goto mce_fail; ++ } ++ ++ rc = commit_report_done(sockfd); ++ if(rc < 0){ ++ goto mce_fail; ++ } ++ ++ done = 1; ++ ++mce_fail: ++ ++ if(sockfd > 0){ ++ close(sockfd); ++ } ++ ++ if(done){ ++ return 0; ++ }else{ ++ return -1; ++ } ++} +diff --git a/ras-report.h b/ras-report.h +new file mode 100644 +index 0000000..7920cdf +--- /dev/null ++++ b/ras-report.h +@@ -0,0 +1,39 @@ ++#ifndef __RAS_REPORT_H ++#define __RAS_REPORT_H ++ ++#include "ras-record.h" ++#include "ras-events.h" ++#include "ras-mc-handler.h" ++#include "ras-mce-handler.h" ++#include "ras-aer-handler.h" ++ ++/* Maximal length of backtrace. */ ++#define MAX_BACKTRACE_SIZE (1024*1024) ++/* Amount of data received from one client for a message before reporting error. */ ++#define MAX_MESSAGE_SIZE (4*MAX_BACKTRACE_SIZE) ++/* Maximal number of characters read from socket at once. 
*/ ++#define INPUT_BUFFER_SIZE (8*1024) ++/* ABRT socket file */ ++#define ABRT_SOCKET "/var/run/abrt/abrt.socket" ++ ++enum { ++ MC_EVENT, ++ MCE_EVENT, ++ AER_EVENT ++}; ++ ++#ifdef HAVE_ABRT_REPORT ++ ++int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev); ++int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev); ++int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev); ++ ++#else ++ ++static inline int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev) { return 0; }; ++static inline int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev) { return 0; }; ++static inline int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev) { return 0; }; ++ ++#endif ++ ++#endif +-- +1.7.1 + diff --git a/SOURCES/0026-rasdaemon-Add-record-option-to-rasdaemon-man-page.patch b/SOURCES/0026-rasdaemon-Add-record-option-to-rasdaemon-man-page.patch new file mode 100644 index 0000000..711819b --- /dev/null +++ b/SOURCES/0026-rasdaemon-Add-record-option-to-rasdaemon-man-page.patch @@ -0,0 +1,50 @@ +From d1b81490639f2608ecaf8fa50c24ac78c053fc2b Mon Sep 17 00:00:00 2001 +From: Betty Dall <betty.dall@hp.com> +Date: Wed, 19 Mar 2014 14:59:47 -0600 +Subject: [PATCH 26/32] rasdaemon: Add record option to rasdaemon man page + +Add the already existing rasdaemon option 'record' to the rasdaemon man +page. This option records events via sqlite3. + +Signed-off-by: Betty Dall <betty.dall@hp.com> +Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com> +--- + man/rasdaemon.1.in | 14 +++++++++++--- + 1 files changed, 11 insertions(+), 3 deletions(-) + +diff --git a/man/rasdaemon.1.in b/man/rasdaemon.1.in +index 5349fa3..7a8b60f 100644 +--- a/man/rasdaemon.1.in ++++ b/man/rasdaemon.1.in +@@ -29,8 +29,10 @@ rasdaemon \- RAS daemon to log the RAS events. 
+ + .SH DESCRIPTION + +-The \fBrasdaemon\fR program is a daemon with monitors the RAS trace events +-from /sys/kernel/debug/tracing, reporting them via syslog/journald. ++The \fBrasdaemon\fR program is a daemon which monitors the platform ++Reliability, Availability and Serviceability (RAS) reports from the ++Linux kernel trace events. These trace events are logged in ++/sys/kernel/debug/tracing, reporting them via syslog/journald. + + .SH OPTIONS + .TP +@@ -51,8 +53,14 @@ Executes in foreground, printing the events at console. Useful for testing it, + and to be used by systemd or Unix System V respan. + If not specified, the program runs in daemon mode. + .TP ++.BI "--record" ++Record RAS events via Sqlite3. The Sqlite3 database has the benefit of ++keeping a persistent record of the RAS events. This feature is used with ++the ras-mc-ctl utility. Note that rasdaemon may be compiled without this ++feature. ++.TP + .BI "--version" +-Prints the program version and exit. ++Print the program version and exit. + + .SH SEE ALSO + \fBras-mc-ctl\fR(8) +-- +1.7.1 + diff --git a/SOURCES/0027-ras-mc-ctl-Print-useful-message-when-run-without-ras.patch b/SOURCES/0027-ras-mc-ctl-Print-useful-message-when-run-without-ras.patch new file mode 100644 index 0000000..73be031 --- /dev/null +++ b/SOURCES/0027-ras-mc-ctl-Print-useful-message-when-run-without-ras.patch @@ -0,0 +1,45 @@ +From caa44c3946ddc900896830297c28b90ce5b9034b Mon Sep 17 00:00:00 2001 +From: Betty Dall <betty.dall@hp.com> +Date: Wed, 19 Mar 2014 15:54:56 -0600 +Subject: [PATCH 27/32] ras-mc-ctl: Print useful message when run without rasdaemon -r + +The utility script ras-mc-ctl requires that rasdaemon --record be run +to create the mc_event table in the SQLite database. The current behaviour +is this: +[root@sa1 util]# ras-mc-ctl --errors +DBD::SQLite::db prepare failed: no such table: mc_event at +/usr/local/sbin/ras-mc-ctl line 914. 
+Can't call method "execute" on an undefined value at +/usr/local/sbin/ras-mc-ctl line 915. + +With this change, the user sees: +[root@sa1 util]# ras-mc-ctl --errors +DBD::SQLite::db prepare failed: no such table: mc_event at +/usr/local/sbin/ras-mc-ctl line 914. +ras-mc-ctl: Error: mc_event table missing from +/usr/local/var/lib/rasdaemon/ras-mc_event.db. Run 'rasdaemon --record'. + +Signed-off-by: Betty Dall <betty.dall@hp.com> +Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com> +--- + util/ras-mc-ctl.in | 4 ++++ + 1 files changed, 4 insertions(+), 0 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 196a643..e9f9c59 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -912,6 +912,10 @@ sub errors + # Memory controller mc_event errors + $query = "select id, timestamp, err_count, err_type, err_msg, label, mc, top_layer,middle_layer,lower_layer, address, grain, syndrome, driver_detail from mc_event order by id"; + $query_handle = $dbh->prepare($query); ++ if (!$query_handle) { ++ log_error ("mc_event table missing from $dbname. Run 'rasdaemon --record'.\n"); ++ exit -1 ++ } + $query_handle->execute(); + $query_handle->bind_columns(\($id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail)); + $out = ""; +-- +1.7.1 + diff --git a/SOURCES/0028-Make-paths-in-the-systemd-services-configurable.patch b/SOURCES/0028-Make-paths-in-the-systemd-services-configurable.patch new file mode 100644 index 0000000..f751929 --- /dev/null +++ b/SOURCES/0028-Make-paths-in-the-systemd-services-configurable.patch @@ -0,0 +1,114 @@ +From 4bfa45f56e1500f1cfc8de3fd8d1228b11011e95 Mon Sep 17 00:00:00 2001 +From: Jakub Filak <jfilak@redhat.com> +Date: Fri, 21 Feb 2014 15:54:09 +0100 +Subject: [PATCH 28/32] Make paths in the systemd services configurable + +The path to a binary depends on configuration, therefore it is better to +not use hard coded strings. 
+ +Signed-off-by: Jakub Filak <jfilak@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com> +--- + Makefile.am | 15 ++++++++++++++- + misc/ras-mc-ctl.service | 10 ---------- + misc/ras-mc-ctl.service.in | 10 ++++++++++ + misc/rasdaemon.service | 10 ---------- + misc/rasdaemon.service.in | 10 ++++++++++ + 5 files changed, 34 insertions(+), 21 deletions(-) + delete mode 100644 misc/ras-mc-ctl.service + create mode 100644 misc/ras-mc-ctl.service.in + delete mode 100644 misc/rasdaemon.service + create mode 100644 misc/rasdaemon.service.in + +diff --git a/Makefile.am b/Makefile.am +index c1668b4..0fa615f 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -1,6 +1,19 @@ + ACLOCAL_AMFLAGS=-I m4 + SUBDIRS = libtrace util man +-EXTRA_DIST = misc/rasdaemon.service misc/ras-mc-ctl.service ++SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in ++SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service) ++EXTRA_DIST = $(SYSTEMD_SERVICES_IN) ++ ++# This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin ++# during ./configure phase, therefore it is not possible to add .service.in ++# files to AC_CONFIG_FILES in configure.ac ++SUFFIXES = .service.in .service ++.service.in.service: ++ sed -e s,\@sbindir\@,$(sbindir),g $< > $@ ++ ++# This rule is needed because the service files must be generated on target ++# system after ./configure phase ++all-local: $(SYSTEMD_SERVICES) + + sbin_PROGRAMS = rasdaemon + rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \ +diff --git a/misc/ras-mc-ctl.service b/misc/ras-mc-ctl.service +deleted file mode 100644 +index 8a09508..0000000 +--- a/misc/ras-mc-ctl.service ++++ /dev/null +@@ -1,10 +0,0 @@ +-[Unit] +-Description=Initialize EDAC v3.0.0 Drivers For Machine Hardware +- +-[Service] +-Type=oneshot +-ExecStart=/usr/sbin/ras-mc-ctl --register-labels +-RemainAfterExit=yes +- +-[Install] +-WantedBy=multi-user.target +diff --git a/misc/ras-mc-ctl.service.in 
b/misc/ras-mc-ctl.service.in +new file mode 100644 +index 0000000..8cb3651 +--- /dev/null ++++ b/misc/ras-mc-ctl.service.in +@@ -0,0 +1,10 @@ ++[Unit] ++Description=Initialize EDAC v3.0.0 Drivers For Machine Hardware ++ ++[Service] ++Type=oneshot ++ExecStart=@sbindir@/ras-mc-ctl --register-labels ++RemainAfterExit=yes ++ ++[Install] ++WantedBy=multi-user.target +diff --git a/misc/rasdaemon.service b/misc/rasdaemon.service +deleted file mode 100644 +index 36cdef5..0000000 +--- a/misc/rasdaemon.service ++++ /dev/null +@@ -1,10 +0,0 @@ +-[Unit] +-Description=RAS daemon to log the RAS events +-After=syslog.target +- +-[Service] +-ExecStart=/usr/local/sbin/rasdaemon -f +-Restart=on-abort +- +-[Install] +-WantedBy=multi-user.target +diff --git a/misc/rasdaemon.service.in b/misc/rasdaemon.service.in +new file mode 100644 +index 0000000..5e1f375 +--- /dev/null ++++ b/misc/rasdaemon.service.in +@@ -0,0 +1,10 @@ ++[Unit] ++Description=RAS daemon to log the RAS events ++After=syslog.target ++ ++[Service] ++ExecStart=@sbindir@/rasdaemon -f ++Restart=on-abort ++ ++[Install] ++WantedBy=multi-user.target +-- +1.7.1 + diff --git a/SOURCES/0031-Correct-ABRT-report-data.patch b/SOURCES/0031-Correct-ABRT-report-data.patch new file mode 100644 index 0000000..fabb7ac --- /dev/null +++ b/SOURCES/0031-Correct-ABRT-report-data.patch @@ -0,0 +1,118 @@ +From d7453479e96693ebb5e17b285adf915b67095aad Mon Sep 17 00:00:00 2001 +From: Jakub Filak <jfilak@redhat.com> +Date: Wed, 2 Apr 2014 15:03:44 +0200 +Subject: [PATCH 31/32] Correct ABRT report data + +Remove '\0' byte from 'PUT' message because this was superfluous. + +Replaced 'BASENAME' item with 'TYPE' item because the first one is no +longer supported by abrtd and the second one is required. Basically the +latter is a substitute for the first one. + +Removed the closing message which is not supported by abrtd. abrtd +considers that message as a part of the problem report. + +Removed a superfluous space from 'Backtrace'. 
+ +Signed-off-by: Jakub Filak <jfilak@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com> +--- + ras-report.c | 41 ++++------------------------------------- + 1 files changed, 4 insertions(+), 37 deletions(-) + +diff --git a/ras-report.c b/ras-report.c +index d3e4a79..0a05732 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -51,8 +51,8 @@ static int commit_report_basic(int sockfd){ + * ABRT server protocol + */ + sprintf(buf, "PUT / HTTP/1.1\r\n\r\n"); +- rc = write(sockfd, buf, strlen(buf) + 1); +- if(rc < strlen(buf) + 1){ ++ rc = write(sockfd, buf, strlen(buf)); ++ if(rc < strlen(buf)){ + return -1; + } + +@@ -68,7 +68,7 @@ static int commit_report_basic(int sockfd){ + return -1; + } + +- sprintf(buf, "BASENAME=%s", "rasdaemon"); ++ sprintf(buf, "TYPE=%s", "ras"); + rc = write(sockfd, buf, strlen(buf) + 1); + if(rc < strlen(buf) + 1){ + return -1; +@@ -77,31 +77,13 @@ static int commit_report_basic(int sockfd){ + return 0; + } + +-/* +- * add "DONE" string to finish message. 
+- */ +-static int commit_report_done(int sockfd){ +- int rc = -1; +- +- if(sockfd < 0){ +- return -1; +- } +- +- rc = write(sockfd, "DONE\0", strlen("DONE\0")); +- if(rc < strlen("DONE\0")){ +- return -1; +- } +- +- return 0; +-} +- + static int set_mc_event_backtrace(char *buf, struct ras_mc_event *ev){ + char bt_buf[MAX_BACKTRACE_SIZE]; + + if(!buf || !ev) + return -1; + +- sprintf(bt_buf, "BACKTRACE= " \ ++ sprintf(bt_buf, "BACKTRACE=" \ + "timestamp=%s\n" \ + "error_count=%d\n" \ + "error_type=%s\n" \ +@@ -298,11 +280,6 @@ int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev){ + goto mc_fail; + } + +- rc = commit_report_done(sockfd); +- if(rc < 0){ +- goto mc_fail; +- } +- + done = 1; + + mc_fail: +@@ -353,11 +330,6 @@ int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev){ + goto aer_fail; + } + +- rc = commit_report_done(sockfd); +- if(rc < 0){ +- goto aer_fail; +- } +- + done = 1; + + aer_fail: +@@ -408,11 +380,6 @@ int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){ + goto mce_fail; + } + +- rc = commit_report_done(sockfd); +- if(rc < 0){ +- goto mce_fail; +- } +- + done = 1; + + mce_fail: +-- +1.7.1 + diff --git a/SOURCES/0032-rasdaemon-handle-failures-of-snprintf.patch b/SOURCES/0032-rasdaemon-handle-failures-of-snprintf.patch new file mode 100644 index 0000000..d78e521 --- /dev/null +++ b/SOURCES/0032-rasdaemon-handle-failures-of-snprintf.patch @@ -0,0 +1,43 @@ +From 59f6c44864f914a189cb924dd8fea14cc314bf3f Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski <arozansk@redhat.com> +Date: Mon, 23 Jun 2014 15:43:41 -0400 +Subject: [PATCH 1/2] rasdaemon: handle failures of snprintf() + +Florian Weimer found that in bitfield_msg() the return value of +snprintf() is used to calculate length ignoring that it can return a +negative number. This patch makes bitfield_msg() to stop writing in such +case. 
+ +Reference: https://bugzilla.redhat.com/show_bug.cgi?id=1035741 + +Reported-by: Florian Weimer <fweimer@redhat.com> +Signed-off-by: Aristeu Rozanski <arozansk@redhat.com> +--- + bitfield.c | 4 ++++ + 1 files changed, 4 insertions(+), 0 deletions(-) + +diff --git a/bitfield.c b/bitfield.c +index b2895b4..1690f15 100644 +--- a/bitfield.c ++++ b/bitfield.c +@@ -41,6 +41,8 @@ unsigned bitfield_msg(char *buf, size_t len, const char **bitarray, + if (status & (1 << (i + bit_offset))) { + if (p != buf) { + n = snprintf(p, len, ", "); ++ if (n < 0) ++ break; + len -= n; + p += n; + } +@@ -48,6 +50,8 @@ unsigned bitfield_msg(char *buf, size_t len, const char **bitarray, + n = snprintf(p, len, "BIT%d", i + bit_offset); + else + n = snprintf(p, len, "%s", bitarray[i]); ++ if (n < 0) ++ break; + len -= n; + p += n; + } +-- +1.7.1 + diff --git a/SOURCES/0033-rasdaemon-correct-range-while-parsing-top-middle-and.patch b/SOURCES/0033-rasdaemon-correct-range-while-parsing-top-middle-and.patch new file mode 100644 index 0000000..eadb858 --- /dev/null +++ b/SOURCES/0033-rasdaemon-correct-range-while-parsing-top-middle-and.patch @@ -0,0 +1,46 @@ +From 5ba31285710e85c7d3688e536cd54180321964e4 Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski <arozansk@redhat.com> +Date: Mon, 23 Jun 2014 16:31:50 -0400 +Subject: [PATCH 2/2] rasdaemon: correct range while parsing top, middle and lower layers + +{top,middle,lower}_layer are signed char, therefore will never be 255. 
+ +Reference: https://bugzilla.redhat.com/show_bug.cgi?id=1035746 + +Reported-by: Florian Weimer <fweimer@redhat.com> +Signed-off-by: Aristeu Rozanski <arozansk@redhat.com> +--- + ras-mc-handler.c | 14 +++----------- + 1 file changed, 3 insertions(+), 11 deletions(-) + +--- upstream.orig/ras-mc-handler.c 2014-06-26 16:09:30.000000000 -0400 ++++ upstream/ras-mc-handler.c 2014-06-26 16:09:32.000000000 -0400 +@@ -120,25 +120,17 @@ if (pevent_get_field_val(s, event, "mc_ + if (pevent_get_field_val(s, event, "top_layer", record, &val, 1) < 0) + goto parse_error; + parsed_fields++; ++ ev.top_layer = (signed char) val; + +- ev.top_layer = (int) val; + if (pevent_get_field_val(s, event, "middle_layer", record, &val, 1) < 0) + goto parse_error; + parsed_fields++; ++ ev.middle_layer = (signed char) val; + +- ev.middle_layer = (int) val; + if (pevent_get_field_val(s, event, "lower_layer", record, &val, 1) < 0) + goto parse_error; + parsed_fields++; +- +- ev.lower_layer = (int) val; +- +- if (ev.top_layer == 255) +- ev.top_layer = -1; +- if (ev.middle_layer == 255) +- ev.middle_layer = -1; +- if (ev.lower_layer == 255) +- ev.lower_layer = -1; ++ ev.lower_layer = (signed char) val; + + if (ev.top_layer >= 0 || ev.middle_layer >= 0 || ev.lower_layer >= 0) { + if (ev.lower_layer >= 0) diff --git a/SOURCES/0034-rasdaemon-enable-recording-by-default.patch b/SOURCES/0034-rasdaemon-enable-recording-by-default.patch new file mode 100644 index 0000000..6a4c6b7 --- /dev/null +++ b/SOURCES/0034-rasdaemon-enable-recording-by-default.patch @@ -0,0 +1,17 @@ +--- + misc/rasdaemon.service.in | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- upstream.orig/misc/rasdaemon.service.in 2014-06-04 14:25:13.000000000 -0400 ++++ upstream/misc/rasdaemon.service.in 2014-07-08 14:37:26.421395520 -0400 +@@ -3,7 +3,9 @@ Description=RAS daemon to log the RAS ev + After=syslog.target + + [Service] +-ExecStart=@sbindir@/rasdaemon -f ++ExecStart=@sbindir@/rasdaemon -f -r 
++ExecStartPost=@sbindir@/rasdaemon --enable ++ExecStop=@sbindir@/rasdaemon --disable + Restart=on-abort + + [Install] diff --git a/SOURCES/0035-eMCA-support.patch b/SOURCES/0035-eMCA-support.patch new file mode 100644 index 0000000..8d952ef --- /dev/null +++ b/SOURCES/0035-eMCA-support.patch @@ -0,0 +1,807 @@ +commit 38d48ed48f9d0baa20786d98abe2b4085fca7d5d +Author: Luck, Tony <tony.luck@intel.com> +Date: Mon Aug 4 13:29:01 2014 -0700 + + rasdaemon: Add support for extlog trace events + + Linux kernel 3.17 includes a new trace event to pick up extended + error logs produced by BIOS in the Common Platform Error Record + format described in appendix N of the UEFI standard. This patch + adds support to collect that information and log it both in + readable ASCII and into the sqlite3 database that rasdaemon + uses to store all error information. In addition ras-mc-ctl + is updated to query that database for both detailed and summary + reports. + + Big thanks to Aristeu for pretty much all the sqlite3 pieces, + plus testing and fixing miscellaneous issues elsewhere. 
+ + Signed-off-by: Tony Luck <tony.luck@intel.com> + Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com> + +diff --git a/Makefile.am b/Makefile.am +index 0fa615f..117c970 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -30,13 +30,17 @@ if WITH_MCE + mce-intel-dunnington.c mce-intel-tulsa.c \ + mce-intel-sb.c mce-intel-ivb.c + endif ++if WITH_EXTLOG ++ rasdaemon_SOURCES += ras-extlog-handler.c ++endif + if WITH_ABRT_REPORT + rasdaemon_SOURCES += ras-report.c + endif + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a + + include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ +- ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h ++ ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ ++ ras-extlog-handler.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index 64a5b13..9495491 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -53,6 +53,15 @@ AS_IF([test "x$enable_mce" = "xyes"], [ + ]) + AM_CONDITIONAL([WITH_MCE], [test x$enable_mce = xyes]) + ++AC_ARG_ENABLE([extlog], ++ AS_HELP_STRING([--enable-extlog], [enable EXTLOG events (currently experimental)])) ++ ++AS_IF([test "x$enable_extlog" = "xyes"], [ ++ AC_DEFINE(HAVE_EXTLOG,1,"have EXTLOG events collect") ++ AC_SUBST([WITH_EXTLOG]) ++]) ++AM_CONDITIONAL([WITH_EXTLOG], [test x$enable_extlog = xyes]) ++ + AC_ARG_ENABLE([abrt_report], + AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)])) + +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index 50526af..bb7c0b9 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -70,7 +70,7 @@ int ras_aer_event_handler(struct trace_seq *s, + */ + + if (ras->use_uptime) +- now = record->ts/1000000000L + ras->uptime_diff; ++ now = record->ts/user_hz + ras->uptime_diff; + else + now = time(NULL); + +diff --git a/ras-events.c 
b/ras-events.c +index ecbbd3a..0be7c3f 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -30,6 +30,7 @@ + #include "ras-mc-handler.h" + #include "ras-aer-handler.h" + #include "ras-mce-handler.h" ++#include "ras-extlog-handler.h" + #include "ras-record.h" + #include "ras-logger.h" + +@@ -203,6 +204,10 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "mce", "mce_record", enable); + #endif + ++#ifdef HAVE_EXTLOG ++ rc |= __toggle_ras_mc_event(ras, "ras", "extlog_mem_event", enable); ++#endif ++ + free_ras: + free(ras); + return rc; +@@ -688,6 +693,19 @@ int handle_ras_events(int record_events) + "mce", "mce_record"); + } + #endif ++ ++#ifdef HAVE_EXTLOG ++ rc = add_event_handler(ras, pevent, page_size, "ras", "extlog_mem_event", ++ ras_extlog_mem_event_handler); ++ if (!rc) { ++ /* tell kernel we are listening, so don't printk to console */ ++ (void)open("/sys/kernel/debug/ras/daemon_active", 0); ++ num_events++; ++ } else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "ras", "aer_event"); ++#endif ++ + if (!num_events) { + log(ALL, LOG_INFO, + "Failed to trace all supported RAS events. Aborting.\n"); +diff --git a/ras-extlog-handler.c b/ras-extlog-handler.c +new file mode 100644 +index 0000000..5fd3580 +--- /dev/null ++++ b/ras-extlog-handler.c +@@ -0,0 +1,246 @@ ++/* ++ * Copyright (C) 2014 Tony Luck <tony.luck@intel.com> ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++*/ ++#include <ctype.h> ++#include <errno.h> ++#include <stdio.h> ++#include <stdlib.h> ++#include <string.h> ++#include <unistd.h> ++#include <stdint.h> ++#include "libtrace/kbuffer.h" ++#include "ras-extlog-handler.h" ++#include "ras-record.h" ++#include "ras-logger.h" ++#include "ras-report.h" ++ ++static char *err_type(int etype) ++{ ++ switch (etype) { ++ case 0: return "unknown"; ++ case 1: return "no error"; ++ case 2: return "single-bit ECC"; ++ case 3: return "multi-bit ECC"; ++ case 4: return "single-symbol chipkill ECC"; ++ case 5: return "multi-symbol chipkill ECC"; ++ case 6: return "master abort"; ++ case 7: return "target abort"; ++ case 8: return "parity error"; ++ case 9: return "watchdog timeout"; ++ case 10: return "invalid address"; ++ case 11: return "mirror Broken"; ++ case 12: return "memory sparing"; ++ case 13: return "scrub corrected error"; ++ case 14: return "scrub uncorrected error"; ++ case 15: return "physical memory map-out event"; ++ } ++ return "unknown-type"; ++} ++ ++static char *err_severity(int severity) ++{ ++ switch (severity) { ++ case 0: return "recoverable"; ++ case 1: return "fatal"; ++ case 2: return "corrected"; ++ case 3: return "informational"; ++ } ++ return "unknown-severity"; ++} ++ ++static unsigned long long err_mask(int lsb) ++{ ++ if (lsb == 0xff) ++ return ~0ull; ++ return ~((1ull << lsb) - 1); ++} ++ ++#define CPER_MEM_VALID_NODE 0x0008 ++#define CPER_MEM_VALID_CARD 0x0010 ++#define CPER_MEM_VALID_MODULE 0x0020 ++#define CPER_MEM_VALID_BANK 0x0040 ++#define CPER_MEM_VALID_DEVICE 0x0080 ++#define CPER_MEM_VALID_ROW 0x0100 ++#define CPER_MEM_VALID_COLUMN 0x0200 ++#define CPER_MEM_VALID_BIT_POSITION 0x0400 ++#define CPER_MEM_VALID_REQUESTOR_ID 0x0800 ++#define CPER_MEM_VALID_RESPONDER_ID 
0x1000 ++#define CPER_MEM_VALID_TARGET_ID 0x2000 ++#define CPER_MEM_VALID_RANK_NUMBER 0x8000 ++#define CPER_MEM_VALID_CARD_HANDLE 0x10000 ++#define CPER_MEM_VALID_MODULE_HANDLE 0x20000 ++ ++struct cper_mem_err_compact { ++ unsigned long long validation_bits; ++ unsigned short node; ++ unsigned short card; ++ unsigned short module; ++ unsigned short bank; ++ unsigned short device; ++ unsigned short row; ++ unsigned short column; ++ unsigned short bit_pos; ++ unsigned long long requestor_id; ++ unsigned long long responder_id; ++ unsigned long long target_id; ++ unsigned short rank; ++ unsigned short mem_array_handle; ++ unsigned short mem_dev_handle; ++}; ++ ++static char *err_cper_data(const char *c) ++{ ++ const struct cper_mem_err_compact *cpd = (struct cper_mem_err_compact *)c; ++ static char buf[256]; ++ char *p = buf; ++ ++ if (cpd->validation_bits == 0) ++ return ""; ++ p += sprintf(p, " ("); ++ if (cpd->validation_bits & CPER_MEM_VALID_NODE) ++ p += sprintf(p, "node: %d ", cpd->node); ++ if (cpd->validation_bits & CPER_MEM_VALID_CARD) ++ p += sprintf(p, "card: %d ", cpd->card); ++ if (cpd->validation_bits & CPER_MEM_VALID_MODULE) ++ p += sprintf(p, "module: %d ", cpd->module); ++ if (cpd->validation_bits & CPER_MEM_VALID_BANK) ++ p += sprintf(p, "bank: %d ", cpd->bank); ++ if (cpd->validation_bits & CPER_MEM_VALID_DEVICE) ++ p += sprintf(p, "device: %d ", cpd->device); ++ if (cpd->validation_bits & CPER_MEM_VALID_ROW) ++ p += sprintf(p, "row: %d ", cpd->row); ++ if (cpd->validation_bits & CPER_MEM_VALID_COLUMN) ++ p += sprintf(p, "column: %d ", cpd->column); ++ if (cpd->validation_bits & CPER_MEM_VALID_BIT_POSITION) ++ p += sprintf(p, "bit_pos: %d ", cpd->bit_pos); ++ if (cpd->validation_bits & CPER_MEM_VALID_REQUESTOR_ID) ++ p += sprintf(p, "req_id: 0x%llx ", cpd->requestor_id); ++ if (cpd->validation_bits & CPER_MEM_VALID_RESPONDER_ID) ++ p += sprintf(p, "resp_id: 0x%llx ", cpd->responder_id); ++ if (cpd->validation_bits & CPER_MEM_VALID_TARGET_ID) ++ p += 
sprintf(p, "tgt_id: 0x%llx ", cpd->target_id); ++ if (cpd->validation_bits & CPER_MEM_VALID_RANK_NUMBER) ++ p += sprintf(p, "rank: %d ", cpd->rank); ++ if (cpd->validation_bits & CPER_MEM_VALID_CARD_HANDLE) ++ p += sprintf(p, "card_handle: %d ", cpd->mem_array_handle); ++ if (cpd->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) ++ p += sprintf(p, "module_handle: %d ", cpd->mem_dev_handle); ++ p += sprintf(p-1, ")"); ++ ++ return buf; ++} ++ ++static char *uuid_le(const char *uu) ++{ ++ static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; ++ char *p = uuid; ++ int i; ++ static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; ++ ++ for (i = 0; i < 16; i++) { ++ p += sprintf(p, "%.2x", uu[le[i]]); ++ switch (i) { ++ case 3: ++ case 5: ++ case 7: ++ case 9: ++ *p++ = '-'; ++ break; ++ } ++ } ++ ++ *p = 0; ++ ++ return uuid; ++} ++ ++ ++static void report_extlog_mem_event(struct ras_events *ras, ++ struct pevent_record *record, ++ struct trace_seq *s, ++ struct ras_extlog_event *ev) ++{ ++ trace_seq_printf(s, "%d %s error: %s physical addr: 0x%llx mask: 0x%llx%s %s %s", ++ ev->error_seq, err_severity(ev->severity), ++ err_type(ev->etype), ev->address, ++ err_mask(ev->pa_mask_lsb), ++ err_cper_data(ev->cper_data), ++ ev->fru_text, ++ uuid_le(ev->fru_id)); ++} ++ ++int ras_extlog_mem_event_handler(struct trace_seq *s, ++ struct pevent_record *record, ++ struct event_format *event, void *context) ++{ ++ int len; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ time_t now; ++ struct tm *tm; ++ struct ras_extlog_event ev; ++ ++ /* ++ * Newer kernels (3.10-rc1 or upper) provide an uptime clock. ++ * On previous kernels, the way to properly generate an event would ++ * be to inject a fake one, measure its timestamp and diff it against ++ * gettimeofday. We won't do it here. Instead, let's use uptime, ++ * falling-back to the event report's time, if "uptime" clock is ++ * not available (legacy kernels). 
++ */ ++ ++ if (ras->use_uptime) ++ now = record->ts/user_hz + ras->uptime_diff; ++ else ++ now = time(NULL); ++ ++ tm = localtime(&now); ++ if (tm) ++ strftime(ev.timestamp, sizeof(ev.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ trace_seq_printf(s, "%s ", ev.timestamp); ++ ++ if (pevent_get_field_val(s, event, "etype", record, &val, 1) < 0) ++ return -1; ++ ev.etype = val; ++ if (pevent_get_field_val(s, event, "err_seq", record, &val, 1) < 0) ++ return -1; ++ ev.error_seq = val; ++ if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0) ++ return -1; ++ ev.severity = val; ++ if (pevent_get_field_val(s, event, "pa", record, &val, 1) < 0) ++ return -1; ++ ev.address = val; ++ if (pevent_get_field_val(s, event, "pa_mask_lsb", record, &val, 1) < 0) ++ return -1; ++ ev.pa_mask_lsb = val; ++ ++ ev.cper_data = pevent_get_field_raw(s, event, "data", ++ record, &len, 1); ++ ev.cper_data_length = len; ++ ev.fru_text = pevent_get_field_raw(s, event, "fru_text", ++ record, &len, 1); ++ ev.fru_id = pevent_get_field_raw(s, event, "fru_id", ++ record, &len, 1); ++ ++ report_extlog_mem_event(ras, record, s, &ev); ++ ++ ras_store_extlog_mem_record(ras, &ev); ++ ++ return 0; ++} +diff --git a/ras-extlog-handler.h b/ras-extlog-handler.h +new file mode 100644 +index 0000000..54e8cec +--- /dev/null ++++ b/ras-extlog-handler.h +@@ -0,0 +1,31 @@ ++/* ++ * Copyright (C) 2014 Tony Luck <tony.luck@intel.com> ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++*/ ++ ++#ifndef __RAS_EXTLOG_HANDLER_H ++#define __RAS_EXTLOG_HANDLER_H ++ ++#include <stdint.h> ++ ++#include "ras-events.h" ++#include "libtrace/event-parse.h" ++ ++extern int ras_extlog_mem_event_handler(struct trace_seq *s, ++ struct pevent_record *record, ++ struct event_format *event, void *context); ++ ++#endif +diff --git a/ras-mc-handler.c b/ras-mc-handler.c +index ffb3805..704a41c 100644 +--- a/ras-mc-handler.c ++++ b/ras-mc-handler.c +@@ -47,7 +47,7 @@ int ras_mc_event_handler(struct trace_seq *s, + */ + + if (ras->use_uptime) +- now = record->ts/1000000000L + ras->uptime_diff; ++ now = record->ts/user_hz + ras->uptime_diff; + else + now = time(NULL); + +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 1431049..a1d0b5d 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -237,7 +237,7 @@ static void report_mce_event(struct ras_events *ras, + */ + + if (ras->use_uptime) +- now = record->ts/1000000000L + ras->uptime_diff; ++ now = record->ts/user_hz + ras->uptime_diff; + else + now = time(NULL); + +diff --git a/ras-record.c b/ras-record.c +index e5150ad..3dc4493 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -157,6 +157,57 @@ int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev) + } + #endif + ++#ifdef HAVE_EXTLOG ++static const struct db_fields extlog_event_fields[] = { ++ { .name="id", .type="INTEGER PRIMARY KEY" }, ++ { .name="timestamp", .type="TEXT" }, ++ { .name="etype", .type="INTEGER" }, ++ { .name="error_count", .type="INTEGER" }, ++ { .name="severity", .type="INTEGER" }, ++ { .name="address", .type="INTEGER" }, ++ { .name="fru_id", .type="BLOB" }, ++ { .name="fru_text", .type="TEXT" }, ++ { .name="cper_data", .type="BLOB" }, ++}; ++ ++static const struct db_table_descriptor 
extlog_event_tab = { ++ .name = "extlog_event", ++ .fields = extlog_event_fields, ++ .num_fields = ARRAY_SIZE(extlog_event_fields), ++}; ++ ++int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_extlog_record) ++ return 0; ++ log(TERM, LOG_INFO, "extlog_record store: %p\n", priv->stmt_extlog_record); ++ ++ sqlite3_bind_text (priv->stmt_extlog_record, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_int (priv->stmt_extlog_record, 2, ev->etype); ++ sqlite3_bind_int (priv->stmt_extlog_record, 3, ev->error_seq); ++ sqlite3_bind_int (priv->stmt_extlog_record, 4, ev->severity); ++ sqlite3_bind_int64 (priv->stmt_extlog_record, 5, ev->address); ++ sqlite3_bind_blob (priv->stmt_extlog_record, 6, ev->fru_id, 16, NULL); ++ sqlite3_bind_text (priv->stmt_extlog_record, 7, ev->fru_text, -1, NULL); ++ sqlite3_bind_blob (priv->stmt_extlog_record, 8, ev->cper_data, ev->cper_data_length, NULL); ++ ++ rc = sqlite3_step(priv->stmt_extlog_record); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do extlog_mem_record step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_extlog_record); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset extlog_mem_record on sqlite: error = %d\n", ++ rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} ++#endif + + /* + * Table and functions to handle mce:mce_record +@@ -385,6 +436,13 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + &aer_event_tab); + #endif + ++#ifdef HAVE_EXTLOG ++ rc = ras_mc_create_table(priv, &extlog_event_tab); ++ if (rc == SQLITE_OK) ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_extlog_record, ++ &extlog_event_tab); ++#endif ++ + #ifdef HAVE_MCE + rc = ras_mc_create_table(priv, &mce_record_tab); + if (rc == SQLITE_OK) +diff --git a/ras-record.h b/ras-record.h +index 6f146a8..5d84297 100644 
+--- a/ras-record.h ++++ b/ras-record.h +@@ -19,8 +19,11 @@ + #ifndef __RAS_RECORD_H + #define __RAS_RECORD_H + ++#include <stdint.h> + #include "config.h" + ++extern long user_hz; ++ + struct ras_events *ras; + + struct ras_mc_event { +@@ -40,8 +43,22 @@ struct ras_aer_event { + const char *msg; + }; + ++struct ras_extlog_event { ++ char timestamp[64]; ++ int32_t error_seq; ++ int8_t etype; ++ int8_t severity; ++ unsigned long long address; ++ int8_t pa_mask_lsb; ++ const char *fru_id; ++ const char *fru_text; ++ const char *cper_data; ++ unsigned short cper_data_length; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; ++struct ras_extlog_event; + struct mce_event; + + #ifdef HAVE_SQLITE3 +@@ -57,18 +74,23 @@ struct sqlite3_priv { + #ifdef HAVE_MCE + sqlite3_stmt *stmt_mce_record; + #endif ++#ifdef HAVE_EXTLOG ++ sqlite3_stmt *stmt_extlog_record; ++#endif + }; + + int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras); + int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev); + int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev); + int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev); ++int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; + static inline int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev) { return 0; }; + static inline int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev) { return 0; }; + static inline int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev) { return 0; }; ++static inline int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev) { return 0; }; + + #endif + +diff --git a/rasdaemon.c b/rasdaemon.c +index 85ac2d4..41022ef 100644 +--- a/rasdaemon.c ++++ b/rasdaemon.c +@@ -68,6 +68,8 @@ static error_t parse_opt(int k, char *arg, struct argp_state *state) 
+ return 0; + } + ++long user_hz; ++ + int main(int argc, char *argv[]) + { + struct arguments args; +@@ -91,6 +93,8 @@ int main(int argc, char *argv[]) + }; + memset (&args, 0, sizeof(args)); + ++ user_hz = sysconf(_SC_CLK_TCK); ++ + argp_parse(&argp, argc, argv, 0, &idx, &args); + + if (idx < 0) { +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index e9f9c59..110262f 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -842,11 +842,141 @@ sub find_prog + return ""; + } + ++sub get_extlog_type ++{ ++ my @types; ++ ++ if ($_[0] < 0 || $_[0] > 15) { ++ return "unknown-type"; ++ } ++ ++ @types = ("unknown", ++ "no error", ++ "single-bit ECC", ++ "multi-bit ECC", ++ "single-symbol chipkill ECC", ++ "multi-symbol chipkill ECC", ++ "master abort", ++ "target abort", ++ "parity error", ++ "watchdog timeout", ++ "invalid address", ++ "mirror Broken", ++ "memory sparing", ++ "scrub corrected error", ++ "scrub uncorrected error", ++ "physical memory map-out event", ++ "unknown-type"); ++ return $types[$_[0]]; ++} ++ ++sub get_extlog_severity ++{ ++ my @sev; ++ ++ if ($_[0] < 0 || $_[0] > 3) { ++ return "unknown-severity"; ++ } ++ ++ @sev = ("recoverable", ++ "fatal", ++ "corrected", ++ "informational", ++ "unknown-severity"); ++ return $sev[$_[0]]; ++} ++ ++use constant { ++ CPER_MEM_VALID_NODE => 0x0008, ++ CPER_MEM_VALID_CARD => 0x0010, ++ CPER_MEM_VALID_MODULE => 0x0020, ++ CPER_MEM_VALID_BANK => 0x0040, ++ CPER_MEM_VALID_DEVICE => 0x0080, ++ CPER_MEM_VALID_ROW => 0x0100, ++ CPER_MEM_VALID_COLUMN => 0x0200, ++ CPER_MEM_VALID_BIT_POSITION => 0x0400, ++ CPER_MEM_VALID_REQUESTOR_ID => 0x0800, ++ CPER_MEM_VALID_RESPONDER_ID => 0x1000, ++ CPER_MEM_VALID_TARGET_ID => 0x2000, ++ CPER_MEM_VALID_ERROR_TYPE => 0x4000, ++ CPER_MEM_VALID_RANK_NUMBER => 0x8000, ++ CPER_MEM_VALID_CARD_HANDLE => 0x10000, ++ CPER_MEM_VALID_MODULE_HANDLE => 0x20000, ++}; ++ ++sub get_cper_data_text ++{ ++ my $cper_data = $_[0]; ++ my ($validation_bits, $node, $card, $module, $bank, 
$device, $row, $column, $bit_pos, $requestor_id, $responder_id, $target_id, $rank, $mem_array_handle, $mem_dev_handle) = unpack 'QSSSSSSSSQQQSSS', $cper_data; ++ my @out; ++ ++ if ($validation_bits & CPER_MEM_VALID_NODE) { ++ push @out, (sprintf "node=%d", $node); ++ } ++ if ($validation_bits & CPER_MEM_VALID_CARD) { ++ push @out, (sprintf "card=%d", $card); ++ } ++ if ($validation_bits & CPER_MEM_VALID_MODULE) { ++ push @out, (sprintf "module=%d", $module); ++ } ++ if ($validation_bits & CPER_MEM_VALID_BANK) { ++ push @out, (sprintf "bank=%d", $bank); ++ } ++ if ($validation_bits & CPER_MEM_VALID_DEVICE) { ++ push @out, (sprintf "device=%d", $device); ++ } ++ if ($validation_bits & CPER_MEM_VALID_ROW) { ++ push @out, (sprintf "row=%d", $row); ++ } ++ if ($validation_bits & CPER_MEM_VALID_COLUMN) { ++ push @out, (sprintf "column=%d", $column); ++ } ++ if ($validation_bits & CPER_MEM_VALID_BIT_POSITION) { ++ push @out, (sprintf "bit_position=%d", $bit_pos); ++ } ++ if ($validation_bits & CPER_MEM_VALID_REQUESTOR_ID) { ++ push @out, (sprintf "0x%08x", $requestor_id); ++ } ++ if ($validation_bits & CPER_MEM_VALID_RESPONDER_ID) { ++ push @out, (sprintf "0x%08x", $responder_id); ++ } ++ if ($validation_bits & CPER_MEM_VALID_TARGET_ID) { ++ push @out, (sprintf "0x%08x", $target_id); ++ } ++ if ($validation_bits & CPER_MEM_VALID_RANK_NUMBER) { ++ push @out, (sprintf "rank=%d", $rank); ++ } ++ if ($validation_bits & CPER_MEM_VALID_CARD_HANDLE) { ++ push @out, (sprintf "mem_array_handle=%d", $mem_array_handle); ++ } ++ if ($validation_bits & CPER_MEM_VALID_MODULE_HANDLE) { ++ push @out, (sprintf "mem_dev_handle=%d", $mem_dev_handle); ++ } ++ ++ return join (", ", @out); ++} ++ ++sub get_uuid_le ++{ ++ my $out = ""; ++ my @bytes = unpack "C*", $_[0]; ++ my @le16_table = (3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15); ++ ++ for (my $i = 0; $i < 16; $i++) { ++ $out .= sprintf "%.2x", $bytes[$le16_table[$i]]; ++ if ($i == 3 or $i == 5 or $i == 7 or $i == 9) { ++ $out .= 
"-"; ++ } ++ } ++ return $out; ++} ++ + sub summary + { + require DBI; + my ($query, $query_handle, $out); + my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg); ++ my ($etype, $severity, $etype_string, $severity_string); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -882,6 +1012,24 @@ sub summary + } + $query_handle->finish; + ++ # extlog errors ++ $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($etype, $severity, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $etype_string = get_extlog_type($etype); ++ $severity_string = get_extlog_severity($severity); ++ $out .= "\t$count $etype_string $severity_string errors\n"; ++ } ++ if ($out ne "") { ++ print "Extlog records summary:\n$out"; ++ } else { ++ print "No Extlog errors.\n"; ++ } ++ $query_handle->finish; ++ + # MCE mce_record errors + $query = "select error_msg, count(*) from mce_record group by error_msg"; + $query_handle = $dbh->prepare($query); +@@ -906,6 +1054,7 @@ sub errors + require DBI; + my ($query, $query_handle, $id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out); + my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location); ++ my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -945,6 +1094,31 @@ sub errors + } + $query_handle->finish; + ++ # Extlog errors ++ $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $etype, 
$severity, $addr, $fru_id, $fru_text, $cper_data)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $etype_string = get_extlog_type($etype); ++ $severity_string = get_extlog_severity($severity); ++ $out .= "$id $timestamp error: "; ++ $out .= "type=$etype_string, "; ++ $out .= "severity=$severity_string, "; ++ $out .= sprintf "address=0x%08x, ", $addr; ++ $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id); ++ $out .= "fru_text='$fru_text', "; ++ $out .= get_cper_data_text($cper_data) if ($cper_data); ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "Extlog events:\n$out\n"; ++ } else { ++ print "No Extlog errors.\n\n"; ++ } ++ $query_handle->finish; ++ + # MCE mce_record errors + $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id"; + $query_handle = $dbh->prepare($query); diff --git a/SOURCES/0036-rasdaemon-fix-some-errors-in-sqlite.patch b/SOURCES/0036-rasdaemon-fix-some-errors-in-sqlite.patch new file mode 100644 index 0000000..35c3636 --- /dev/null +++ b/SOURCES/0036-rasdaemon-fix-some-errors-in-sqlite.patch @@ -0,0 +1,37 @@ +commit d3d336471119f16368e40b68643d9dd928be5385 +Author: Luck, Tony <tony.luck@intel.com> +Date: Mon Apr 7 12:23:25 2014 -0700 + + rasdaemon: fix some typos and cut/paste errors in sqlite bits + + aer event has the error_type as field 2 and msg as field 3 - but the calls + the sqlite3_bind_text use 3 and 4. 
+ + mce event forgot to declare the "mcastatus_msg" + + Signed-off-by: Tony Luck <tony.luck@intel.com> + Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com> + +diff --git a/ras-record.c b/ras-record.c +index daa3cb1..e602edb 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -139,8 +139,8 @@ int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev) + log(TERM, LOG_INFO, "aer_event store: %p\n", priv->stmt_aer_event); + + sqlite3_bind_text(priv->stmt_aer_event, 1, ev->timestamp, -1, NULL); +- sqlite3_bind_text(priv->stmt_aer_event, 3, ev->error_type, -1, NULL); +- sqlite3_bind_text(priv->stmt_aer_event, 4, ev->msg, -1, NULL); ++ sqlite3_bind_text(priv->stmt_aer_event, 2, ev->error_type, -1, NULL); ++ sqlite3_bind_text(priv->stmt_aer_event, 3, ev->msg, -1, NULL); + + rc = sqlite3_step(priv->stmt_aer_event); + if (rc != SQLITE_OK && rc != SQLITE_DONE) +@@ -189,6 +189,7 @@ static const struct db_fields mce_record_fields[] = { + { .name="error_msg", .type="TEXT" }, + { .name="mcgstatus_msg", .type="TEXT" }, + { .name="mcistatus_msg", .type="TEXT" }, // 20 ++ { .name="mcastatus_msg", .type="TEXT" }, + { .name="user_action", .type="TEXT" }, + { .name="mc_location", .type="TEXT" }, + }; diff --git a/SOURCES/0037-rasdaemon-sqlite-truncates-some-MCE-fields-to-32-bit.patch b/SOURCES/0037-rasdaemon-sqlite-truncates-some-MCE-fields-to-32-bit.patch new file mode 100644 index 0000000..2b30c4f --- /dev/null +++ b/SOURCES/0037-rasdaemon-sqlite-truncates-some-MCE-fields-to-32-bit.patch @@ -0,0 +1,73 @@ +commit 52e60e3050105a55e1ff2382979d5f370f398200 +Author: Luck, Tony <tony.luck@intel.com> +Date: Mon Apr 7 11:27:47 2014 -0700 + + rasdaemon: sqlite truncates some MCE fields to 32-bit + + The sqlite3_bind_int() function takes an "int" as the argument value to + save to the database. But some fields are wider than 32-bits. Use + sqlite3_bind_int64() for the fields where we know values can exceed + 4G. 
+ + Before: + + # ./rasdaemon/util/ras-mc-ctl --errors + ... + MCE events: + 1 2014-04-04 08:50:32 -0700 error: MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error, mcg mcgstatus= 0, mci Corrected_error, mcgcap=0x07000c16, status=0x00010090, addr=0x35fcb9c0, misc=0x5026a686, walltime=0x5342e4f9, cpu=0x0000000e, cpuid=0x000306f1, apicid=0x00000020, socketid=0x00000001, bank=0x00000008 + 2 2014-04-04 08:50:35 -0700 error: MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error, mcg mcgstatus= 0, mci Corrected_error, mcgcap=0x07000c16, status=0x00010090, addr=0x4187adc0, misc=0x4274f486, walltime=0x5342e4fc, cpu=0x0000000e, cpuid=0x000306f1, apicid=0x00000020, socketid=0x00000001, bank=0x00000007 + 3 2014-04-04 08:50:37 -0700 error: MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error, mcg mcgstatus= 0, mci Corrected_error, mcgcap=0x07000c16, status=0x00010090, addr=0x52efc600, misc=0x50028286, walltime=0x5342e4fd, cpu=0x0000000e, cpuid=0x000306f1, apicid=0x00000020, socketid=0x00000001, bank=0x00000008 + + After: + ./rasdaemon/util/ras-mc-ctl --errors + ... 
+ 1 2014-04-04 09:00:07 -0700 error: MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error, mcg mcgstatus= 0, mci Corrected_error, mcgcap=0x07000c16, status=0x8c00004000010090, addr=0x45340a180, misc=0x140686886, walltime=0x5342e736, cpuid=0x000306f1, bank=0x00000008 + 2 2014-04-04 09:00:08 -0700 error: MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error, mcg mcgstatus= 0, mci Corrected_error, mcgcap=0x07000c16, status=0x8c00004000010090, addr=0x44d6e4780, misc=0x15060e086, walltime=0x5342e737, cpuid=0x000306f1, bank=0x00000007 + 3 2014-04-04 09:00:10 -0700 error: MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error, mcg mcgstatus= 0, mci Corrected_error, mcgcap=0x07000c16, status=0x8c00004000010090, addr=0x44cb64640, misc=0x140505086, walltime=0x5342e739, cpuid=0x000306f1, bank=0x00000008 + + Signed-off-by: Tony Luck <tony.luck@intel.com> + Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com> + +diff --git a/ras-record.c b/ras-record.c +index e602edb..e5150ad 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -209,22 +209,22 @@ int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev) + return 0; + log(TERM, LOG_INFO, "mce_record store: %p\n", priv->stmt_mce_record); + +- sqlite3_bind_text(priv->stmt_mce_record, 1, ev->timestamp, -1, NULL); +- sqlite3_bind_int (priv->stmt_mce_record, 2, ev->mcgcap); +- sqlite3_bind_int (priv->stmt_mce_record, 3, ev->mcgstatus); +- sqlite3_bind_int (priv->stmt_mce_record, 4, ev->status); +- sqlite3_bind_int (priv->stmt_mce_record, 5, ev->addr); +- sqlite3_bind_int (priv->stmt_mce_record, 6, ev->misc); +- sqlite3_bind_int (priv->stmt_mce_record, 7, ev->ip); +- sqlite3_bind_int (priv->stmt_mce_record, 8, ev->tsc); +- sqlite3_bind_int (priv->stmt_mce_record, 9, ev->walltime); +- sqlite3_bind_int (priv->stmt_mce_record, 10, ev->cpu); +- sqlite3_bind_int (priv->stmt_mce_record, 11, ev->cpuid); +- sqlite3_bind_int (priv->stmt_mce_record, 12, ev->apicid); +- sqlite3_bind_int 
(priv->stmt_mce_record, 13, ev->socketid); +- sqlite3_bind_int (priv->stmt_mce_record, 14, ev->cs); +- sqlite3_bind_int (priv->stmt_mce_record, 15, ev->bank); +- sqlite3_bind_int (priv->stmt_mce_record, 16, ev->cpuvendor); ++ sqlite3_bind_text (priv->stmt_mce_record, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_int (priv->stmt_mce_record, 2, ev->mcgcap); ++ sqlite3_bind_int (priv->stmt_mce_record, 3, ev->mcgstatus); ++ sqlite3_bind_int64 (priv->stmt_mce_record, 4, ev->status); ++ sqlite3_bind_int64 (priv->stmt_mce_record, 5, ev->addr); ++ sqlite3_bind_int64 (priv->stmt_mce_record, 6, ev->misc); ++ sqlite3_bind_int64 (priv->stmt_mce_record, 7, ev->ip); ++ sqlite3_bind_int64 (priv->stmt_mce_record, 8, ev->tsc); ++ sqlite3_bind_int64 (priv->stmt_mce_record, 9, ev->walltime); ++ sqlite3_bind_int (priv->stmt_mce_record, 10, ev->cpu); ++ sqlite3_bind_int (priv->stmt_mce_record, 11, ev->cpuid); ++ sqlite3_bind_int (priv->stmt_mce_record, 12, ev->apicid); ++ sqlite3_bind_int (priv->stmt_mce_record, 13, ev->socketid); ++ sqlite3_bind_int (priv->stmt_mce_record, 14, ev->cs); ++ sqlite3_bind_int (priv->stmt_mce_record, 15, ev->bank); ++ sqlite3_bind_int (priv->stmt_mce_record, 16, ev->cpuvendor); + + sqlite3_bind_text(priv->stmt_mce_record, 17, ev->bank_name, -1, NULL); + sqlite3_bind_text(priv->stmt_mce_record, 18, ev->error_msg, -1, NULL); diff --git a/SOURCES/0038-rasdaemon-fix-mce-numfield-decoded-error.patch b/SOURCES/0038-rasdaemon-fix-mce-numfield-decoded-error.patch new file mode 100644 index 0000000..6e45309 --- /dev/null +++ b/SOURCES/0038-rasdaemon-fix-mce-numfield-decoded-error.patch @@ -0,0 +1,44 @@ +commit f20a366a9b7a32a1be6fc89e7546cc2b4cb690bf +Author: Xie XiuQi <xiexiuqi@huawei.com> +Date: Thu May 8 20:07:19 2014 +0800 + + rasdaemon: fix mce numfield decoded error + + Some fields are missing in mce decode information, as below: + ... 
+ rasdaemon: register inserted at db + <...>-31568 [000] 4023.214080: mce_record: + 2014-05-07 15:51:16 +0800 bank=2, status= bd000000000000c0, MEMORY + CONTROLLER MS_CHANNEL0_ERR Transaction: Memory scrubbing error %s: %Lu + %s: %Lx + %s: %Lx + %s: %Lu + %s: %Lu + %s: %Lx + , mci=Uncorrected_error Error_enabled SRAO, n_errors=0 channel=0, + dimm=0, cpu_type= Intel Xeon 5500 series / Core i3/5/7 + ("Nehalem/Westmere"), cpu= 0, socketid= 0, ip= 1eadbabe (INEXACT), cs= + 73, misc= 8c, addr= 62b000, mcgstatus= 5 RIPV MCIP, mcgcap= 1c09, + apicid= 0 + + "f->name" & "v" are missed to print in decode_numfield(), so fix it. + + Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com> + Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com> + +diff --git a/bitfield.c b/bitfield.c +index b2895b4..07795a9 100644 +--- a/bitfield.c ++++ b/bitfield.c +@@ -92,8 +92,9 @@ void decode_numfield(struct mce_event *e, uint64_t status, + uint64_t mask = (1ULL << (f->end - f->start + 1)) - 1; + uint64_t v = (status >> f->start) & mask; + if (v > 0 || f->force) { +- mce_snprintf(e->error_msg, "%%s: %s\n", +- f->fmt ? f->fmt : "%Lu"); ++ char fmt[32] = {0}; ++ snprintf(fmt, 32, "%%s: %s\n", f->fmt ? f->fmt : "%Lu"); ++ mce_snprintf(e->error_msg, fmt, f->name, v); + } + } + } diff --git a/SOURCES/0039-rasdaemon-do-not-assume-dimmX-directories-will-be-pr.patch b/SOURCES/0039-rasdaemon-do-not-assume-dimmX-directories-will-be-pr.patch new file mode 100644 index 0000000..c1baea8 --- /dev/null +++ b/SOURCES/0039-rasdaemon-do-not-assume-dimmX-directories-will-be-pr.patch @@ -0,0 +1,84 @@ +From 7e79fa94dc6c294cd731c0c684b277dd4811c5db Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski <aris@redhat.com> +Date: Fri, 15 Aug 2014 13:50:58 -0400 +Subject: [PATCH 3/4] rasdaemon: do not assume dimmX/ directories will be + present + +While finding the labels, size and location, ras-mc-ctl will search /sys for +the files and calculate the location. 
When it uses the location trying to map +back to files to print labels or write labels, it'll just assume dimm* +directories exist which is not correct while using drivers like amd64_edac. +This patch adds two new hashes to store the location and the label file path +so it can be used later. + +Signed-off-by: Aristeu Rozanski <aris@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com> +--- + util/ras-mc-ctl.in | 21 +++++++++++++-------- + 1 file changed, 13 insertions(+), 8 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 110262f..7b6d798 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -45,6 +45,8 @@ my %conf = (); + my %bus = (); + my %dimm_size = (); + my %dimm_node = (); ++my %dimm_label_file = (); ++my %dimm_location = (); + my %csrow_size = (); + my %rank_size = (); + my %csrow_ranks = (); +@@ -278,6 +280,9 @@ sub parse_dimm_nodes + my $str_loc = join(':', $mc, @pos); + $dimm_size{$str_loc} = $size; + $dimm_node{$str_loc} = $dimm; ++ $file =~ s/size/dimm_label/; ++ $dimm_label_file{$str_loc} = $file; ++ $dimm_location{$str_loc} = $location; + + return; + } +@@ -557,12 +562,14 @@ sub read_dimm_label + + my $dimm = $dimm_node{$pos}; + +- my $file = "$sysfs/mc$mc/dimm$dimm/dimm_label"; ++ my $dimm_label_file = $dimm_label_file{$pos}; + +- return ("$pos missing") unless -f $file; ++ my $location = $dimm_location{$pos}; + +- if (!open (LABEL, "$file")) { +- warn "Failed to open $file: $!\n"; ++ return ("label missing", "$pos missing") unless -f $dimm_label_file; ++ ++ if (!open (LABEL, "$dimm_label_file")) { ++ warn "Failed to open $dimm_label_file: $!\n"; + return ("Error"); + } + +@@ -570,7 +577,7 @@ sub read_dimm_label + + close (LABEL); + +- $pos = "mc$mc " . 
qx(cat $sysfs/mc$mc/dimm$dimm/dimm_location); ++ $pos = "mc$mc $location"; + + return ($label, $pos); + } +@@ -587,9 +594,7 @@ sub get_dimm_label_node + + return "" if (!defined($dimm_node{$pos})); + +- my $dimm = $dimm_node{$pos}; +- +- return "$sysfs/mc$mc/dimm$dimm/dimm_label"; ++ return "$dimm_label_file{$pos}"; + } + + +-- +1.8.3.1 + diff --git a/SOURCES/0040-rasdaemon-add-more-dell-labels.patch b/SOURCES/0040-rasdaemon-add-more-dell-labels.patch new file mode 100644 index 0000000..7cd76b8 --- /dev/null +++ b/SOURCES/0040-rasdaemon-add-more-dell-labels.patch @@ -0,0 +1,119 @@ +Hello, + +This patch adds labels for these Dell PowerEdge Servers: + +R610,R/T710, R220, R/T620, R720/xd, R730/xd, M520, M620 and M820. + +The current T610 (0F5XM3) mapping is incorrect. This patch fixes it. + +Reqest review and inclusion to git repo. + +Acked-by: Aristeu Rozanski <aris@redhat.com> +Signed-off-by: Charles Rose <charles.rose.linux@gmail.com> +--- +Changes in v2: +- Include T110 II, T20, R/T320, M420, R/T420, R/T630, FC620, FC420 +- Include additional model numbers for M820 and some 2-socket systems. +- Consolidate systems with similar maps. +--- + labels/dell | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 79 insertions(+), 7 deletions(-) + +diff --git a/labels/dell b/labels/dell +index e1a09a7..d7e797b 100644 +--- a/labels/dell ++++ b/labels/dell +@@ -9,12 +9,84 @@ + # + + Vendor: Dell Inc. 
++#### 11G #### ++# 2-socket ++# PowerEdge R610 ++ Model: 0K399H, 0F0XJ6 ++ DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; DIMM_A3: 0.0.2; ++ DIMM_A4: 0.1.0; DIMM_A5: 0.1.1; DIMM_A6: 0.1.2; + +- Model: 0F5XM3 +- DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; DIMM_A3: 0.0.2; DIMM_A4: 0.0.3; +- DIMM_A5: 0.1.0; DIMM_A6: 0.1.1; DIMM_A7: 0.1.2; DIMM_A8: 0.1.3; +- DIMM_A9: 0.2.0; DIMM_A10: 0.2.1; DIMM_A11: 0.2.2; DIMM_A12: 0.2.3; ++ DIMM_B1: 1.0.0; DIMM_B2: 1.0.1; DIMM_B3: 1.0.2; ++ DIMM_B4: 1.1.0; DIMM_B5: 1.1.1; DIMM_B6: 1.1.2; + +- DIMM_B1: 1.0.0; DIMM_B2: 1.0.1; DIMM_B3: 1.0.2; DIMM_B4: 1.0.3; +- DIMM_B5: 1.1.0; DIMM_B6: 1.1.1; DIMM_B7: 1.1.2; DIMM_B8: 1.1.3; +- DIMM_B9: 1.2.0; DIMM_B10: 1.2.1; DIMM_B11: 1.2.2; DIMM_B12: 1.2.3; ++# PowerEdge T710 R710 ++ Model: 01CTXG, 0N0H4P, 0MD99X, 0N047H, 0PV9DG ++ DIMM_A3: 0.0.0; DIMM_A2: 0.1.0; DIMM_A1: 0.2.0; ++ DIMM_A6: 0.0.1; DIMM_A5: 0.1.1; DIMM_A4: 0.2.1; ++ DIMM_A9: 0.0.2; DIMM_A8: 0.1.2; DIMM_A7: 0.2.2; ++ ++ DIMM_B3: 1.0.0; DIMM_B2: 1.1.0; DIMM_B1: 1.2.0; ++ DIMM_B6: 1.0.1; DIMM_B5: 1.1.1; DIMM_B4: 1.2.1; ++ DIMM_B9: 1.0.2; DIMM_B8: 1.1.2; DIMM_B7: 1.2.2; ++ ++#### 12/13G #### ++# 1-socket ++# PowerEdge R220 ++ Model: 081N4V ++ DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; ++ DIMM_A3: 0.1.0; DIMM_A4: 0.1.1; ++ ++#PowerEdge T110 II, T20 ++ Model: 0PC2WT, 0PM2CW, 015TH9, 0MDHN4, 0VD5HY ++ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; ++ ++ DIMM_B1: 0.0.1; DIMM_B2: 0.1.1; ++ ++#PowerEdge R320 T320 ++ Model: 0YCV59, 0Y97HY, 07DKYR, 0VJ84C, 07MYHN, 04DMNN, 0W7H8C, 0K20G5, 0V719V, 0FDT3J ++ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; ++ DIMM_A4: 0.0.1; DIMM_A5: 0.1.1; DIMM_A6: 0.2.1; ++ ++# 2-socket ++# PowerEdge R620/T620 R720/xd R730/xd T630 R730 R630 T620 M620, FC620 ++ Model: 0VWT90, 07NDJ2, 0F5XM3, 0PXXHP, 0X3D66, 061P35, 0H5J4J, 00W9X3, 0599V5, 0W9WXC, 0599V5, 0H21J3, 0CNCJW, 02CD1V, 0T5TFW, 0F5XM3, 0G1CNH, 05YV77, 0PDCCX, 093MW8, 0NJVT7 ++ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0; ++ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1; ++ 
DIMM_A9: 0.0.2; DIMM_A10: 0.1.2; DIMM_A11: 0.2.2; DIMM_A12: 0.3.2; ++ ++ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0; ++ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1; ++ DIMM_B9: 1.0.2; DIMM_B10: 1.1.2; DIMM_B11: 1.2.2; DIMM_B12: 1.3.2; ++ ++# PowerEdge M520 R420 T420 ++ Model: 0NRG83, 0DW6GX, 03WPHJ, 06HTRX, 0H1Y24, 02T9N6, 0TT5P2, 0CPKXG, 03015M, 061VPC, 0PC9H0, 0K3G34, 0PC0V5, 08NVYK ++ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; ++ DIMM_A4: 0.0.1; DIMM_A5: 0.1.1; DIMM_A6: 0.2.1; ++ ++ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; ++ DIMM_B4: 1.0.1; DIMM_B5: 1.1.1; DIMM_B6: 1.2.1; ++ ++#PowerEdge FC420, M420 ++ Model: 0DPJGD, 068CTP, 0MN3VC, 0417VP ++ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; ++ ++ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; ++ ++# 4-socket ++# # PowerEdge M820 ++ Model: 0RN9TC, 0YWR73, 066N7P, 0PFG1N, 0JC2W3 ++ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0; ++ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1; ++ DIMM_A9: 0.0.2; DIMM_A10: 0.1.2; DIMM_A11: 0.2.2; DIMM_A12: 0.3.2; ++ ++ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0; ++ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1; ++ DIMM_B9: 1.0.2; DIMM_B10: 1.1.2; DIMM_B11: 1.2.2; DIMM_B12: 1.3.2; ++ ++ DIMM_C1: 2.0.0; DIMM_C2: 2.1.0; DIMM_C3: 2.2.0; DIMM_C4: 2.3.0; ++ DIMM_C5: 2.0.1; DIMM_C6: 2.1.1; DIMM_C7: 2.2.1; DIMM_C8: 2.3.1; ++ DIMM_C9: 2.0.2; DIMM_C10: 2.1.2; DIMM_C11: 2.2.2; DIMM_C12: 2.3.2; ++ ++ DIMM_D1: 3.0.0; DIMM_D2: 3.1.0; DIMM_D3: 3.2.0; DIMM_D4: 3.3.0; ++ DIMM_D5: 3.0.1; DIMM_D6: 3.1.1; DIMM_D7: 3.2.1; DIMM_D8: 3.3.1; ++ DIMM_D9: 3.0.2; DIMM_D10: 3.1.2; DIMM_D11: 3.2.2; DIMM_D12: 3.3.2; +-- +1.9.3 diff --git a/SOURCES/0041-rasdaemon-add-support-for-Haswell.patch b/SOURCES/0041-rasdaemon-add-support-for-Haswell.patch new file mode 100644 index 0000000..0344103 --- /dev/null +++ b/SOURCES/0041-rasdaemon-add-support-for-Haswell.patch @@ -0,0 +1,295 @@ +From 
108b124a09512d44cd810d1ef6b823c9d029d5d6 Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski <arozansk@redhat.com> +Date: Mon, 18 May 2015 14:19:28 -0300 +Subject: [PATCH 01/13] rasdaemon: add support for Haswell + +Based on mcelog code. + +Acked-by: Tony Luck <tony.luck@intel,com> +Signed-off-by: Aristeu Rozanski <arozansk@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + Makefile.am | 2 +- + mce-intel-haswell.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++++ + mce-intel.c | 2 + + ras-mce-handler.c | 8 +++ + ras-mce-handler.h | 3 + + 5 files changed, 208 insertions(+), 1 deletion(-) + create mode 100644 mce-intel-haswell.c + +diff --git a/Makefile.am b/Makefile.am +index 9c5f007..a6bf18f 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -28,7 +28,7 @@ if WITH_MCE + rasdaemon_SOURCES += ras-mce-handler.c mce-intel.c mce-amd-k8.c \ + mce-intel-p4-p6.c mce-intel-nehalem.c \ + mce-intel-dunnington.c mce-intel-tulsa.c \ +- mce-intel-sb.c mce-intel-ivb.c ++ mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c + endif + if WITH_EXTLOG + rasdaemon_SOURCES += ras-extlog-handler.c +diff --git a/mce-intel-haswell.c b/mce-intel-haswell.c +new file mode 100644 +index 0000000..c32704c +--- /dev/null ++++ b/mce-intel-haswell.c +@@ -0,0 +1,194 @@ ++/* ++ * The code below came from Tony Luck mcelog code, ++ * released under GNU Public General License, v.2 ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++*/ ++ ++#include <string.h> ++#include <stdio.h> ++ ++#include "ras-mce-handler.h" ++#include "bitfield.h" ++ ++ ++/* See IA32 SDM Vol3B Table 16-20 */ ++ ++static char *pcu_1[] = { ++ [0x00] = "No Error", ++ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT", ++ [0x0D] = "MC_IMC_FORCE_SR_S3_TIMEOUT", ++ [0x0E] = "MC_CPD_UNCPD_SD_TIMEOUT", ++ [0x13] = "MC_DMI_TRAINING_TIMEOUT", ++ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT", ++ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX", ++ [0x25] = "MC_SVID_COMMAN_TIMEOUT", ++ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID", ++ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN", ++ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP", ++ [0x39] = "MC_PKGC_WATCHDOG_HANG_C3_UP_SF", ++ [0x44] = "MC_CRITICAL_VR_FAILED", ++ [0x45] = "MC_ICC_MAX_NOTSUPPORTED", ++ [0x46] = "MC_VID_RAMP_DOWN_FAILED", ++ [0x47] = "MC_EXCL_MODE_NO_PMREQ_CMP", ++ [0x48] = "MC_SVID_READ_REG_ICC_MAX_FAILED", ++ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED", ++ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0", ++ [0x4C] = "MC_BOOT_VID_TIMEOUT_DRAM_1", ++ [0x4D] = "MC_BOOT_VID_TIMEOUT_DRAM_2", ++ [0x4E] = "MC_BOOT_VID_TIMEOUT_DRAM_3", ++ [0x4F] = "MC_SVID_COMMAND_ERROR", ++ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT", ++ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT", ++ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED", ++ [0x58] = "MC_SVID_IMON_REQUEST_FAILED", ++ [0x59] = "MC_SVID_ALERT_REQUEST_FAILED", ++ [0x60] = "MC_INVALID_PKGS_REQ_PCH", ++ [0x61] = "MC_INVALID_PKGS_REQ_QPI", ++ [0x62] = "MC_INVALID_PKGS_RSP_QPI", ++ [0x63] = "MC_INVALID_PKGS_RSP_PCH", ++ [0x64] = "MC_INVALID_PKG_STATE_CONFIG", ++ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT", ++ [0x68] = "MC_IMC_RW_SMBUS_TIMEOUT", ++ [0x69] = "MC_HA_FAILSTS_CHANGE_DETECTED", ++ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT", ++ [0x70] = "MC_WATCHDOG_TIMEOUT_PKGC_SLAVE", ++ 
[0x71] = "MC_WATCHDOG_TIMEOUT_PKGC_MASTER", ++ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER", ++ [0x7C] = "MC_BIOS_RST_CPL_INVALID_SEQ", ++ [0x7D] = "MC_MORE_THAN_ONE_TXT_AGENT", ++ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT" ++}; ++ ++static struct field pcu_mc4[] = { ++ FIELD(24, pcu_1), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-21 */ ++ ++static char *qpi[] = { ++ [0x02] = "Intel QPI physical layer detected drift buffer alarm", ++ [0x03] = "Intel QPI physical layer detected latency buffer rollover", ++ [0x10] = "Intel QPI link layer detected control error from R3QPI", ++ [0x11] = "Rx entered LLR abort state on CRC error", ++ [0x12] = "Unsupported or undefined packet", ++ [0x13] = "Intel QPI link layer control error", ++ [0x15] = "RBT used un-initialized value", ++ [0x20] = "Intel QPI physical layer detected a QPI in-band reset but aborted initialization", ++ [0x21] = "Link failover data self healing", ++ [0x22] = "Phy detected in-band reset (no width change)", ++ [0x23] = "Link failover clock failover", ++ [0x30] = "Rx detected CRC error - successful LLR after Phy re-init", ++ [0x31] = "Rx detected CRC error - successful LLR wihout Phy re-init", ++}; ++ ++static struct field qpi_mc[] = { ++ FIELD(16, qpi), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-22 */ ++ ++static struct field memctrl_mc9[] = { ++ SBITFIELD(16, "DDR3 address parity error"), ++ SBITFIELD(17, "Uncorrected HA write data error"), ++ SBITFIELD(18, "Uncorrected HA data byte enable error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare error"), ++ SBITFIELD(23, "Corrected memory read error"), ++ SBITFIELD(24, "iMC write data buffer parity error"), ++ SBITFIELD(25, "DDR4 command address parity error"), ++ {} ++}; ++ ++void hsw_decode_model(struct ras_events *ras, struct mce_event *e) ++{ ++ uint64_t status = e->status; ++ uint32_t mca = status & 0xffff; ++ 
unsigned rank0 = -1, rank1 = -1, chan; ++ ++ switch (e->bank) { ++ case 4: ++ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) { ++ case 0x402: case 0x403: ++ /* Internal errors */ ++ break; ++ case 0x406: ++ /* Intel TXT errors */ ++ break; ++ case 0x407: ++ /* Other UBOX Internal errors */ ++ break; ++ } ++ if (EXTRACT(status, 16, 19)) ++ /* PCU internal error */ ++ decode_bitfield(e, status, pcu_mc4); ++ break; ++ case 5: ++ case 20: ++ case 21: ++ decode_bitfield(e, status, qpi_mc); ++ break; ++ case 9: case 10: case 11: case 12: ++ case 13: case 14: case 15: case 16: ++ decode_bitfield(e, status, memctrl_mc9); ++ break; ++ } ++ ++ /* ++ * Memory error specific code. Returns if the error is not a MC one ++ */ ++ ++ /* Check if the error is at the memory controller */ ++ if ((mca >> 7) != 1) ++ return; ++ ++ /* Ignore unless this is an corrected extended error from an iMC bank */ ++ if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) || ++ !test_prefix(7, status & 0xefff)) ++ return; ++ ++ /* ++ * Parse the reported channel and ranks ++ */ ++ ++ chan = EXTRACT(status, 0, 3); ++ if (chan == 0xf) ++ return; ++ ++ mce_snprintf(e->mc_location, "memory_channel=%d", chan); ++ ++ if (EXTRACT(e->misc, 62, 62)) ++ rank0 = EXTRACT(e->misc, 46, 50); ++ ++ if (EXTRACT(e->misc, 63, 63)) ++ rank1 = EXTRACT(e->misc, 51, 55); ++ ++ /* ++ * FIXME: The conversion from rank to dimm requires to parse the ++ * DMI tables and call failrank2dimm(). 
++ */ ++ if (rank0 >= 0 && rank1 >= 0) ++ mce_snprintf(e->mc_location, "ranks=%d and %d", ++ rank0, rank1); ++ else if (rank0 >= 0) ++ mce_snprintf(e->mc_location, "rank=%d", rank0); ++ else ++ mce_snprintf(e->mc_location, "rank=%d", rank1); ++} ++ +diff --git a/mce-intel.c b/mce-intel.c +index 427b98e..1546a1d 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -392,6 +392,8 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e) + case CPU_IVY_BRIDGE_EPEX: + ivb_decode_model(ras, e); + break; ++ case CPU_HASWELL_EPEX: ++ hsw_decode_model(ras, e); + default: + break; + } +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index a1d0b5d..d2de096 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -47,6 +47,8 @@ static char *cputype_name[] = { + [CPU_SANDY_BRIDGE_EP] = "Sandy Bridge EP", /* Fill in better name */ + [CPU_IVY_BRIDGE] = "Ivy Bridge", /* Fill in better name */ + [CPU_IVY_BRIDGE_EPEX] = "Ivy Bridge EP/EX", /* Fill in better name */ ++ [CPU_HASWELL] = "Haswell", ++ [CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX", + }; + + static enum cputype select_intel_cputype(struct ras_events *ras) +@@ -81,6 +83,12 @@ static enum cputype select_intel_cputype(struct ras_events *ras) + return CPU_IVY_BRIDGE; + else if (mce->model == 0x3e) + return CPU_IVY_BRIDGE_EPEX; ++ else if (mce->model == 0x3c || mce->model == 0x45 || ++ mce->model == 0x46) ++ return CPU_HASWELL; ++ else if (mce->model == 0x3f) ++ return CPU_HASWELL_EPEX; ++ + if (mce->model > 0x1a) { + log(ALL, LOG_INFO, + "Family 6 Model %x CPU: only decoding architectural errors\n", +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index 80e9769..b8b3d4f 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -42,6 +42,8 @@ enum cputype { + CPU_SANDY_BRIDGE_EP, + CPU_IVY_BRIDGE, + CPU_IVY_BRIDGE_EPEX, ++ CPU_HASWELL, ++ CPU_HASWELL_EPEX, + }; + + struct mce_event { +@@ -114,6 +116,7 @@ void xeon75xx_decode_model(struct mce_event *e); + void dunnington_decode_model(struct 
mce_event *e); + void snb_decode_model(struct ras_events *ras, struct mce_event *e); + void ivb_decode_model(struct ras_events *ras, struct mce_event *e); ++void hsw_decode_model(struct ras_events *ras, struct mce_event *e); + void tulsa_decode_model(struct mce_event *e); + + /* Software defined banks */ +-- +1.8.3.1 + diff --git a/SOURCES/0042-rasdaemon-decode-new-simple-error-code-number-6.patch b/SOURCES/0042-rasdaemon-decode-new-simple-error-code-number-6.patch new file mode 100644 index 0000000..0691768 --- /dev/null +++ b/SOURCES/0042-rasdaemon-decode-new-simple-error-code-number-6.patch @@ -0,0 +1,40 @@ +From 85a2ead8f2d6e380be8d8234ba752a558e8027ed Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski <arozansk@redhat.com> +Date: Mon, 18 May 2015 14:19:29 -0300 +Subject: [PATCH 02/13] rasdaemon: decode new simple error code number 6 + +This patch was based on fa313dd0144596dfa140bd66805367250d6eae9b +(mcelog) + + mcelog: Decode new simple error code number 6 + + Edition 050 of the Intel SDM released in late February 2014 + includes a new simple error code in "Table 15-8. IA32_MCi_Status + [15:0] Simple Error Code Encoding". Code 6 (0000 0000 0000 0110) + has been allocated for the reporting of cases where the BIOS SMM + code attempts to execute code outside of the protected SMRR area. 
+ + Signed-off-by: Tony Luck <tony.luck@intel.com> + Signed-off-by: Andi Kleen <ak@linux.intel.com> + +Signed-off-by: Aristeu Rozanski <arozansk@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + mce-intel.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/mce-intel.c b/mce-intel.c +index 1546a1d..69ea00e 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -115,6 +115,7 @@ static char *mca_msg[] = { + [3] = "External error", + [4] = "FRC error", + [5] = "Internal parity error", ++ [6] = "SMM Handler Code Access Violation", + }; + + static char *tracking_msg[] = { +-- +1.8.3.1 + diff --git a/SOURCES/0043-rasdaemon-Add-missing-entry-to-Ivy-Bridge-memory-con.patch b/SOURCES/0043-rasdaemon-Add-missing-entry-to-Ivy-Bridge-memory-con.patch new file mode 100644 index 0000000..0cb3df2 --- /dev/null +++ b/SOURCES/0043-rasdaemon-Add-missing-entry-to-Ivy-Bridge-memory-con.patch @@ -0,0 +1,38 @@ +From 064a74b1202e529b5e16a54218fc17974906af2d Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski <arozansk@redhat.com> +Date: Mon, 18 May 2015 14:19:30 -0300 +Subject: [PATCH 03/13] rasdaemon: Add missing entry to Ivy Bridge memory + controller decode table + +This patch is based on 2577aeb662374cb87169ee675b2e37c06f1aed99 (mcelog) + + mcelog: Add missing entry to Ivy Bridge memory controller decode table + + September 2013 edition of the software developer manual added an + entry that had been inadvertently omitted from earlier editions. + Add the 0x80 entry for "Corrected memory read error". 
+ + Signed-off-by: Tony Luck <tony.luck@intel.com> + Signed-off-by: Andi Kleen <ak@linux.intel.com> + +Signed-off-by: Aristeu Rozanski <arozansk@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + mce-intel-ivb.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/mce-intel-ivb.c b/mce-intel-ivb.c +index f2a133a..0c5bebc 100644 +--- a/mce-intel-ivb.c ++++ b/mce-intel-ivb.c +@@ -76,6 +76,7 @@ static char *memctrl_1[] = { + [0x010] = "Uncorrected patrol scrub error", + [0x020] = "Corrected spare error", + [0x040] = "Uncorrected spare error", ++ [0x080] = "Corrected memory read error", + [0x100] = "iMC, WDB, parity errors", + }; + +-- +1.8.3.1 + diff --git a/SOURCES/0044-rasdaemon-Identify-Ivy-Bridge-properly.patch b/SOURCES/0044-rasdaemon-Identify-Ivy-Bridge-properly.patch new file mode 100644 index 0000000..27aee96 --- /dev/null +++ b/SOURCES/0044-rasdaemon-Identify-Ivy-Bridge-properly.patch @@ -0,0 +1,38 @@ +From 66021c20c92b5df16b5c8dae4fb664788fa40376 Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski <arozansk@redhat.com> +Date: Mon, 18 May 2015 14:19:31 -0300 +Subject: [PATCH 04/13] rasdaemon: Identify Ivy Bridge properly + +This patch is based on b29cc4d615cead87cbc163ada0645b10c5b1217d (mcelog) + mcelog: Identify Ivy Bridge properly + + Uniquely identify Ivy Bridge even though the machine checks are the same + for Sandy Bridge and Ivy Bridge. This makes the output for the processor + display "Ivy Bridge". 
+ + Signed-off-by: Prarit Bhargava <prarit@redhat.com> + Cc: tony.luck@intel.com + Signed-off-by: Andi Kleen <ak@linux.intel.com> + +Signed-off-by: Aristeu Rozanski <arozansk@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + ras-mce-handler.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index d2de096..07e298f 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -75,7 +75,7 @@ static enum cputype select_intel_cputype(struct ras_events *ras) + return CPU_NEHALEM; + else if (mce->model == 0x2e || mce->model == 0x2f) + return CPU_XEON75XX; +- else if (mce->model == 0x2a || mce->model == 0x3a) ++ else if (mce->model == 0x2a) + return CPU_SANDY_BRIDGE; + else if (mce->model == 0x2d) + return CPU_SANDY_BRIDGE_EP; +-- +1.8.3.1 + diff --git a/SOURCES/0045-rasdaemon-add-support-for-Broadwell.patch b/SOURCES/0045-rasdaemon-add-support-for-Broadwell.patch new file mode 100644 index 0000000..ce568d3 --- /dev/null +++ b/SOURCES/0045-rasdaemon-add-support-for-Broadwell.patch @@ -0,0 +1,52 @@ +From a9810094cf838e03102f95333db7ddfe810ccabd Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski <arozansk@redhat.com> +Date: Mon, 18 May 2015 14:19:32 -0300 +Subject: [PATCH 05/13] rasdaemon: add support for Broadwell + +Only basic support for now. + +Based on mcelog code. 
+ +Signed-off-by: Aristeu Rozanski <arozansk@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + ras-mce-handler.c | 3 +++ + ras-mce-handler.h | 1 + + 2 files changed, 4 insertions(+) + +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 07e298f..e059b92 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -49,6 +49,7 @@ static char *cputype_name[] = { + [CPU_IVY_BRIDGE_EPEX] = "Ivy Bridge EP/EX", /* Fill in better name */ + [CPU_HASWELL] = "Haswell", + [CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX", ++ [CPU_BROADWELL] = "Broadwell", + }; + + static enum cputype select_intel_cputype(struct ras_events *ras) +@@ -88,6 +89,8 @@ static enum cputype select_intel_cputype(struct ras_events *ras) + return CPU_HASWELL; + else if (mce->model == 0x3f) + return CPU_HASWELL_EPEX; ++ else if (mce->model == 0x3d) ++ return CPU_BROADWELL; + + if (mce->model > 0x1a) { + log(ALL, LOG_INFO, +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index b8b3d4f..ba01f55 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -44,6 +44,7 @@ enum cputype { + CPU_IVY_BRIDGE_EPEX, + CPU_HASWELL, + CPU_HASWELL_EPEX, ++ CPU_BROADWELL, + }; + + struct mce_event { +-- +1.8.3.1 + diff --git a/SOURCES/0046-rasdaemon-add-support-for-Knights-Landing.patch b/SOURCES/0046-rasdaemon-add-support-for-Knights-Landing.patch new file mode 100644 index 0000000..a6f4367 --- /dev/null +++ b/SOURCES/0046-rasdaemon-add-support-for-Knights-Landing.patch @@ -0,0 +1,50 @@ +From bd6c78d89f4e934fafb1136a15efc0d6df4635ed Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski <arozansk@redhat.com> +Date: Mon, 18 May 2015 14:19:33 -0300 +Subject: [PATCH 06/13] rasdaemon: add support for Knights Landing + +Patch based on mcelog. 
+ +Signed-off-by: Aristeu Rozanski <arozansk@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + ras-mce-handler.c | 3 +++ + ras-mce-handler.h | 1 + + 2 files changed, 4 insertions(+) + +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index e059b92..63f14fd 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -50,6 +50,7 @@ static char *cputype_name[] = { + [CPU_HASWELL] = "Haswell", + [CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX", + [CPU_BROADWELL] = "Broadwell", ++ [CPU_KNIGHTS_LANDING] = "Knights Landing", + }; + + static enum cputype select_intel_cputype(struct ras_events *ras) +@@ -91,6 +92,8 @@ static enum cputype select_intel_cputype(struct ras_events *ras) + return CPU_HASWELL_EPEX; + else if (mce->model == 0x3d) + return CPU_BROADWELL; ++ else if (mce->model == 0x57) ++ return CPU_KNIGHTS_LANDING; + + if (mce->model > 0x1a) { + log(ALL, LOG_INFO, +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index ba01f55..28aad00 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -45,6 +45,7 @@ enum cputype { + CPU_HASWELL, + CPU_HASWELL_EPEX, + CPU_BROADWELL, ++ CPU_KNIGHTS_LANDING, + }; + + struct mce_event { +-- +1.8.3.1 + diff --git a/SOURCES/0047-rasdaemon-properly-pring-message-strings-in-decode_b.patch b/SOURCES/0047-rasdaemon-properly-pring-message-strings-in-decode_b.patch new file mode 100644 index 0000000..12d58d2 --- /dev/null +++ b/SOURCES/0047-rasdaemon-properly-pring-message-strings-in-decode_b.patch @@ -0,0 +1,33 @@ +From 5dd11c60b84294a3c6ce5ccb0db726b3dce35b10 Mon Sep 17 00:00:00 2001 +From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Date: Tue, 26 May 2015 11:59:36 -0300 +Subject: [PATCH 07/13] rasdaemon: properly pring message strings in + decode_bitfield() + +Fix decode_bitfield() so that it does print message strings from the struct +field table. 
+ +Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Signed-off-by: Aristeu Rozanski <aris@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + bitfield.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/bitfield.c b/bitfield.c +index 1dda30d..d6931c9 100644 +--- a/bitfield.c ++++ b/bitfield.c +@@ -84,7 +84,8 @@ void decode_bitfield(struct mce_event *e, uint64_t status, + continue; + mce_snprintf(e->error_msg, "<%u:%llx>", + f->start_bit, (long long)v); +- } ++ } else ++ mce_snprintf(e->error_msg, "%s", s); + } + } + +-- +1.8.3.1 + diff --git a/SOURCES/0048-rasdaemon-add-missing-semicolon-in-hsw_decode_model.patch b/SOURCES/0048-rasdaemon-add-missing-semicolon-in-hsw_decode_model.patch new file mode 100644 index 0000000..b956655 --- /dev/null +++ b/SOURCES/0048-rasdaemon-add-missing-semicolon-in-hsw_decode_model.patch @@ -0,0 +1,31 @@ +From abf36efe909c4022260cb4016c54d1ec3ec18cb8 Mon Sep 17 00:00:00 2001 +From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Date: Tue, 26 May 2015 11:59:37 -0300 +Subject: [PATCH 08/13] rasdaemon: add missing semicolon in hsw_decode_model() + +hsw_decode_model() tries to skip decode_bitfield() if IA32_MC4_STATUS indicates +some internal errors. Unfortunately, it behaves opposite to the intention +because a semicolon is missing.
+ +Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Signed-off-by: Aristeu Rozanski <aris@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + mce-intel-haswell.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/mce-intel-haswell.c b/mce-intel-haswell.c +index c32704c..3ac12f2 100644 +--- a/mce-intel-haswell.c ++++ b/mce-intel-haswell.c +@@ -137,6 +137,7 @@ void hsw_decode_model(struct ras_events *ras, struct mce_event *e) + } + if (EXTRACT(status, 16, 19)) + /* PCU internal error */ ++ ; + decode_bitfield(e, status, pcu_mc4); + break; + case 5: +-- +1.8.3.1 + diff --git a/SOURCES/0049-rasdaemon-enable-IMC-status-usage-for-Haswell-E.patch b/SOURCES/0049-rasdaemon-enable-IMC-status-usage-for-Haswell-E.patch new file mode 100644 index 0000000..24ec908 --- /dev/null +++ b/SOURCES/0049-rasdaemon-enable-IMC-status-usage-for-Haswell-E.patch @@ -0,0 +1,43 @@ +From f892a390c55c0b350c57cda9d166a9cf331aa36f Mon Sep 17 00:00:00 2001 +From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Date: Tue, 26 May 2015 11:59:38 -0300 +Subject: [PATCH 09/13] rasdaemon: enable IMC status usage for Haswell-E + +Enable IMC status bank for Haswell-E, as described in Intel SDM Vol.3C +Table 35-27. 
+ +Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Signed-off-by: Aristeu Rozanski <aris@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + mce-intel.c | 1 + + ras-mce-handler.c | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/mce-intel.c b/mce-intel.c +index 69ea00e..3684602 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -457,6 +457,7 @@ int set_intel_imc_log(enum cputype cputype, unsigned ncpus) + switch (cputype) { + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: ++ case CPU_HASWELL_EPEX: + msr = 0x17f; /* MSR_ERROR_CONTROL */ + bit = 0x2; /* MemError Log Enable */ + break; +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 63f14fd..fb6db8a 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -221,6 +221,7 @@ int register_mce_handler(struct ras_events *ras, unsigned ncpus) + switch (mce->cputype) { + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: ++ case CPU_HASWELL_EPEX: + set_intel_imc_log(mce->cputype, ncpus); + default: + break; +-- +1.8.3.1 + diff --git a/SOURCES/0050-rasdaemon-make-sure-the-error-is-valid-before-handli.patch b/SOURCES/0050-rasdaemon-make-sure-the-error-is-valid-before-handli.patch new file mode 100644 index 0000000..9c57427 --- /dev/null +++ b/SOURCES/0050-rasdaemon-make-sure-the-error-is-valid-before-handli.patch @@ -0,0 +1,54 @@ +From 56913e2f2a5a6ddf8ab684c8d528e9ef1d55cfba Mon Sep 17 00:00:00 2001 +From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Date: Tue, 26 May 2015 11:59:39 -0300 +Subject: [PATCH 10/13] rasdaemon: make sure the error is valid before handling + ranks + +Fix "rank" handling according to the Bit 63 description in Intel SDM Vol.3C +Table 16-23, that says "... Use this information only after there is valid +first error info indicated by bit 62". +Also fix invalid comparisons of unsigned variables "rank0" and "rank1". 
+ +Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Signed-off-by: Aristeu Rozanski <aris@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + mce-intel-haswell.c | 14 ++++++-------- + 1 file changed, 6 insertions(+), 8 deletions(-) + +diff --git a/mce-intel-haswell.c b/mce-intel-haswell.c +index 3ac12f2..0a817bf 100644 +--- a/mce-intel-haswell.c ++++ b/mce-intel-haswell.c +@@ -174,22 +174,20 @@ void hsw_decode_model(struct ras_events *ras, struct mce_event *e) + + mce_snprintf(e->mc_location, "memory_channel=%d", chan); + +- if (EXTRACT(e->misc, 62, 62)) ++ if (EXTRACT(e->misc, 62, 62)) { + rank0 = EXTRACT(e->misc, 46, 50); +- +- if (EXTRACT(e->misc, 63, 63)) +- rank1 = EXTRACT(e->misc, 51, 55); ++ if (EXTRACT(e->misc, 63, 63)) ++ rank1 = EXTRACT(e->misc, 51, 55); ++ } + + /* + * FIXME: The conversion from rank to dimm requires to parse the + * DMI tables and call failrank2dimm(). + */ +- if (rank0 >= 0 && rank1 >= 0) ++ if (rank0 != -1 && rank1 != -1) + mce_snprintf(e->mc_location, "ranks=%d and %d", + rank0, rank1); +- else if (rank0 >= 0) ++ else if (rank0 != -1) + mce_snprintf(e->mc_location, "rank=%d", rank0); +- else +- mce_snprintf(e->mc_location, "rank=%d", rank1); + } + +-- +1.8.3.1 + diff --git a/SOURCES/0051-rasdaemon-add-support-to-match-the-machine-by-system.patch b/SOURCES/0051-rasdaemon-add-support-to-match-the-machine-by-system.patch new file mode 100644 index 0000000..7237eed --- /dev/null +++ b/SOURCES/0051-rasdaemon-add-support-to-match-the-machine-by-system.patch @@ -0,0 +1,261 @@ +From 3a38f8e66a2aa5c477cea152e1acc9a781834b83 Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski <aris@redhat.com> +Date: Mon, 1 Jun 2015 17:04:00 -0300 +Subject: [PATCH 11/13] rasdaemon: add support to match the machine by system's + product name + +In some cases the motherboard names will change but the mapping won't +across a line of products. 
This patch adds support for "Product:" to be +specified in the label files instead of Model:. + +An example: + Vendor: Dell Inc. + Product: PowerEdge R610 + DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; DIMM_A3: 0.0.2; + DIMM_A4: 0.1.0; DIMM_A5: 0.1.1; DIMM_A6: 0.1.2; + + DIMM_B1: 1.0.0; DIMM_B2: 1.0.1; DIMM_B3: 1.0.2; + DIMM_B4: 1.1.0; DIMM_B5: 1.1.1; DIMM_B6: 1.1.2; + +Would match all 'PowerEdge R610' machines. + +Signed-off-by: Aristeu Rozanski <arozansk@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + util/ras-mc-ctl.in | 127 +++++++++++++++++++++++++++++++++++++++++------------ + 1 file changed, 98 insertions(+), 29 deletions(-) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 7b6d798..6350f62 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -288,8 +288,27 @@ sub parse_dimm_nodes + } + } + ++sub guess_product { ++ my $pvendor = undef; ++ my $pname = undef; ++ ++ if (open (VENDOR, "/sys/class/dmi/id/product_vendor")) { ++ $pvendor = <VENDOR>; ++ close VENDOR; ++ chomp($pvendor); ++ } ++ if (open (NAME, "/sys/class/dmi/id/product_name")) { ++ $pname = <NAME>; ++ close NAME; ++ chomp($pname); ++ } ++ ++ return ($pvendor, $pname); ++} ++ + sub get_mainboard_info { + my ($vendor, $model); ++ my ($pvendor, $pname); + + if ($conf{opt}{mainboard} && $conf{opt}{mainboard} ne "report") { + ($vendor, $model) = split (/[: ]/, $conf{opt}{mainboard}, 2); +@@ -301,6 +320,15 @@ sub get_mainboard_info { + + $conf{mainboard}{vendor} = $vendor; + $conf{mainboard}{model} = $model; ++ ++ ($pvendor, $pname) = guess_product (); ++ # since product vendor is rare, use mainboard's vendor ++ if ($pvendor) { ++ $conf{mainboard}{product_vendor} = $pvendor; ++ } else { ++ $conf{mainboard}{product_vendor} = $vendor; ++ } ++ $conf{mainboard}{product_name} = $pname if $pname; + } + + sub guess_vendor_model_dmidecode { +@@ -449,10 +477,11 @@ sub guess_dimm_label { + + sub parse_dimm_labels_file + { +- my ($lh, $num_layers, $file) = (@_); ++ my 
($lh, $num_layers, $lh_prod, $num_layers_prod, $file) = (@_); + my $line = -1; + my $vendor = ""; + my @models = (); ++ my @products = (); + my $num; + + open (LABELS, "$file") +@@ -469,12 +498,21 @@ sub parse_dimm_labels_file + if (/vendor\s*:\s*(.*\S)\s*/i) { + $vendor = lc $1; + @models = (); ++ @products = (); + $num = 0; + next; + } + if (/(model|board)\s*:\s*(.*)$/i) { + !$vendor && die "$file: line $line: MB model without vendor\n"; + @models = grep { s/\s*(.*)\s*$/$1/ } split(/[,;]+/, $2); ++ @products = (); ++ $num = 0; ++ next; ++ } ++ if (/(product)\s*:\s*(.*)$/i) { ++ !$vendor && die "$file: line $line: product without vendor\n"; ++ @models = (); ++ @products = grep { s/\s*(.*)\s*$/$1/ } split(/[,;]+/, $2); + $num = 0; + next; + } +@@ -513,10 +551,13 @@ sub parse_dimm_labels_file + } + map { $lh->{$vendor}{lc $_}{$mc}{$top}{$mid}{$low} = $label } + @models; ++ map { $lh_prod->{$vendor}{lc $_}{$mc}{$top}{$mid}{$low} = $label } ++ @products; + } + if (!$num) { + $num = $n; + map { $num_layers->{$vendor}{lc $_} = $num } @models; ++ map { $num_layers_prod->{$vendor}{lc $_} = $num } @products; + } elsif ($num != $n) { + die ("Error: Inconsistent number of layers at label db \"$file\"\n"); + } +@@ -531,6 +572,8 @@ sub parse_dimm_labels + { + my %labels = (); + my %num_layers = (); ++ my %labels_prod = (); ++ my %num_layers_prod = (); + + # + # Accrue all DIMM labels from the labels.db file, as +@@ -538,10 +581,10 @@ sub parse_dimm_labels + # + for my $file ($conf{labeldb}, <$conf{labeldir}/*>) { + next unless -r $file; +- parse_dimm_labels_file (\%labels, \%num_layers, $file); ++ parse_dimm_labels_file (\%labels, \%num_layers, \%labels_prod, \%num_layers_prod, $file); + } + +- return (\%labels, \%num_layers); ++ return (\%labels, \%num_layers, \%labels_prod, \%num_layers_prod); + } + + sub read_dimm_label +@@ -598,25 +641,9 @@ sub get_dimm_label_node + } + + +-sub print_dimm_labels ++sub _print_dimm_labels + { +- my $fh = shift || *STDOUT; +- my ($lref, 
$num_layers) = parse_dimm_labels (); +- my $vendor = lc $conf{mainboard}{vendor}; +- my $model = lc $conf{mainboard}{model}; +- my $format = "%-35s %-20s %-20s\n"; +- +- if (!exists $$lref{$vendor}{$model}) { +- log_error ("No dimm labels for $conf{mainboard}{vendor} " . +- "model $conf{mainboard}{model}\n"); +- return; +- } +- +- my $sysfs_dir = "/sys/devices/system/edac/mc"; +- +- find({wanted => \&parse_dimm_nodes, no_chdir => 1}, $sysfs_dir); +- +- printf $fh $format, "LOCATION", "CONFIGURED LABEL", "SYSFS CONTENTS"; ++ my ($lref, $num_layers, $vendor, $model, $fh, $format) = @_; + + for my $mc (sort keys %{$$lref{$vendor}{$model}}) { + for my $top (sort keys %{$$lref{$vendor}{$model}{$mc}}) { +@@ -631,26 +658,40 @@ sub print_dimm_labels + } + } + print $fh "\n"; +- + } + +-sub register_dimm_labels ++sub print_dimm_labels + { +- my ($lref, $num_layers) = parse_dimm_labels (); ++ my $fh = shift || *STDOUT; ++ my ($lref, $num_layers, $lref_prod, $num_layers_prod) = parse_dimm_labels (); + my $vendor = lc $conf{mainboard}{vendor}; + my $model = lc $conf{mainboard}{model}; +- my $sysfs = "/sys/devices/system/edac/mc"; ++ my $pvendor = lc $conf{mainboard}{product_vendor}; ++ my $pname = lc $conf{mainboard}{product_name}; ++ my $format = "%-35s %-20s %-20s\n"; + +- if (!exists $$lref{$vendor}{$model}) { ++ if (!exists $$lref{$vendor}{$model} && !exists $$lref_prod{$pvendor}{$pname}) { + log_error ("No dimm labels for $conf{mainboard}{vendor} " . 
+- "model $conf{mainboard}{model}\n"); +- return 0; ++ "model $conf{mainboard}{model}\n"); ++ return; + } ++ + my $sysfs_dir = "/sys/devices/system/edac/mc"; + + find({wanted => \&parse_dimm_nodes, no_chdir => 1}, $sysfs_dir); + +- select (undef, undef, undef, $conf{opt}{delay}); ++ printf $fh $format, "LOCATION", "CONFIGURED LABEL", "SYSFS CONTENTS"; ++ ++ if (exists $$lref{$vendor}{$model}) { ++ _print_dimm_labels($lref, $num_layers, $vendor, $model, $fh, $format); ++ } elsif (exists $$lref_prod{$pvendor}{$pname}) { ++ _print_dimm_labels($lref_prod, $num_layers_prod, $pvendor, $pname, $fh, $format); ++ } ++} ++ ++sub write_dimm_labels ++{ ++ my ($lref, $num_layers, $vendor, $model) = @_; + + for my $mc (sort keys %{$$lref{$vendor}{$model}}) { + for my $top (sort keys %{$$lref{$vendor}{$model}{$mc}}) { +@@ -675,6 +716,34 @@ sub register_dimm_labels + } + } + } ++} ++ ++sub register_dimm_labels ++{ ++ my ($lref, $num_layers, $lref_prod, $num_layers_prod) = parse_dimm_labels (); ++ my $vendor = lc $conf{mainboard}{vendor}; ++ my $model = lc $conf{mainboard}{model}; ++ my $pvendor = lc $conf{mainboard}{product_vendor}; ++ my $pname = lc $conf{mainboard}{product_name}; ++ my $sysfs = "/sys/devices/system/edac/mc"; ++ ++ if (!exists $$lref{$vendor}{$model} && !exists $$lref_prod{$pvendor}{$pname}) { ++ log_error ("No dimm labels for $conf{mainboard}{vendor} " . 
++ "model $conf{mainboard}{model}\n"); ++ return 0; ++ } ++ my $sysfs_dir = "/sys/devices/system/edac/mc"; ++ ++ find({wanted => \&parse_dimm_nodes, no_chdir => 1}, $sysfs_dir); ++ ++ select (undef, undef, undef, $conf{opt}{delay}); ++ ++ if (exists $$lref{$vendor}{$model}) { ++ write_dimm_labels($lref, $num_layers, $vendor, $model); ++ } else { ++ write_dimm_labels($lref_prod, $num_layers_prod, $pvendor, $pname); ++ } ++ + return 1; + } + +-- +1.8.3.1 + diff --git a/SOURCES/0052-rasdaemon-add-internal-errors-of-IA32_MC4_STATUS-for.patch b/SOURCES/0052-rasdaemon-add-internal-errors-of-IA32_MC4_STATUS-for.patch new file mode 100644 index 0000000..988de6e --- /dev/null +++ b/SOURCES/0052-rasdaemon-add-internal-errors-of-IA32_MC4_STATUS-for.patch @@ -0,0 +1,48 @@ +From a50a2ae341f8821d71a19d9a3c6ca345e1499e25 Mon Sep 17 00:00:00 2001 +From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Date: Wed, 17 Jun 2015 07:56:57 -0300 +Subject: [PATCH 5/5] rasdaemon: add internal errors of IA32_MC4_STATUS for + Haswell + +Now rasdaemon looks purposely omitting internal errors of +IA32_MC4_STATUS for Haswell-family processors, which are described in +Intel SDM vol3 Table 16-20. I think it's better to show these errors +because mcelog does show them. 
+ +Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + mce-intel-haswell.c | 11 +++++------ + 1 file changed, 5 insertions(+), 6 deletions(-) + +diff --git a/mce-intel-haswell.c b/mce-intel-haswell.c +index 0a817bf..b70e399 100644 +--- a/mce-intel-haswell.c ++++ b/mce-intel-haswell.c +@@ -126,18 +126,17 @@ void hsw_decode_model(struct ras_events *ras, struct mce_event *e) + case 4: + switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) { + case 0x402: case 0x403: +- /* Internal errors */ ++ mce_snprintf(e->mcastatus_msg, "PCU Internal Errors"); + break; + case 0x406: +- /* Intel TXT errors */ ++ mce_snprintf(e->mcastatus_msg, "Intel TXT Errors"); + break; + case 0x407: +- /* Other UBOX Internal errors */ ++ mce_snprintf(e->mcastatus_msg, "Other UBOX Internal Errors"); + break; + } +- if (EXTRACT(status, 16, 19)) +- /* PCU internal error */ +- ; ++ if (EXTRACT(status, 16, 17) && !EXTRACT(status, 18, 19)) ++ mce_snprintf(e->error_msg, "PCU Internal error"); + decode_bitfield(e, status, pcu_mc4); + break; + case 5: +-- +1.8.3.1 + diff --git a/SOURCES/0053-rasdaemon-remove-a-space-from-mcgstatus_msg.patch b/SOURCES/0053-rasdaemon-remove-a-space-from-mcgstatus_msg.patch new file mode 100644 index 0000000..5aaae10 --- /dev/null +++ b/SOURCES/0053-rasdaemon-remove-a-space-from-mcgstatus_msg.patch @@ -0,0 +1,34 @@ +From 45b575b791dbd3d5660a0c08065a9fbcb6e21eb9 Mon Sep 17 00:00:00 2001 +From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Date: Wed, 10 Jun 2015 07:29:03 -0300 +Subject: [PATCH 2/5] rasdaemon: remove a space from mcgstatus_msg + +"ras-mc-ctl --errors" shows an unnecessary space character in the +mcgstatus string of MCE event, like below: + +2 2015-04-04 19:57:22 +0900 error: MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT, mcg mcgstatus= 0, mci Corrected_error, mcgcap=0x07000c16, status=0x8000000067000e0b, walltime=0x555da140, cpu=0x00000001, cpuid=0x000306f3, apicid=0x00000002, bank=0x00000004 + +Let's 
remove it. + +Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + mce-intel.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mce-intel.c b/mce-intel.c +index 3503c6a..77b929b 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -176,7 +176,7 @@ static void decode_mcg(struct mce_event *e) + { + uint64_t mcgstatus = e->mcgstatus; + +- mce_snprintf(e->mcgstatus_msg, "mcgstatus= %lld", ++ mce_snprintf(e->mcgstatus_msg, "mcgstatus=%lld", + (long long)e->mcgstatus); + + if (mcgstatus & MCG_STATUS_RIPV) +-- +1.8.3.1 + diff --git a/SOURCES/0054-rasdaemon-unnecessary-comma-for-empty-mc_location-st.patch b/SOURCES/0054-rasdaemon-unnecessary-comma-for-empty-mc_location-st.patch new file mode 100644 index 0000000..958874d --- /dev/null +++ b/SOURCES/0054-rasdaemon-unnecessary-comma-for-empty-mc_location-st.patch @@ -0,0 +1,36 @@ +From 349da4c3d63ec6dceef66a405561984561d31582 Mon Sep 17 00:00:00 2001 +From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Date: Wed, 10 Jun 2015 20:49:55 -0300 +Subject: [PATCH 3/5] rasdaemon: unnecessary comma for empty mc_location string + +Into the /var/log/messages, rasdaemon sometimes prints an unnecessary +comma ", " between mca= and cpu_type= like below: + +Jun 9 02:44:39 localhost rasdaemon: <...>-4585 [1638893312] 1031.109000: mce_record: 2015-06-08 10:07:28 +0900 bank=3, status= 9c0000000000017a, mci=Corrected_error Error_enabled, mca=Generic CACHE Level-2 Eviction Error, , cpu_type= Intel Xeon v3 (Haswell) EP/EX, cpu= 1, socketid= 0, misc= 4004000000000080, addr= 204fffffff, mcgstatus= 0, mcgcap= 7000c16, apicid= 2 + +That's the comma for mc_location which is printed even if mc_location is +empty due to a wrong if condition. 
+ +Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Acked-by: Aristeu Rozanski <aris@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + ras-mce-handler.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index fb6db8a..07252a0 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -278,7 +278,7 @@ static void report_mce_event(struct ras_events *ras, + if (*e->user_action) + trace_seq_printf(s, " %s", e->user_action); + +- if (e->mc_location) ++ if (*e->mc_location) + trace_seq_printf(s, ", %s", e->mc_location); + + #if 0 +-- +1.8.3.1 + diff --git a/SOURCES/0055-rasdaemon-use-MCA-error-msg-as-error_msg.patch b/SOURCES/0055-rasdaemon-use-MCA-error-msg-as-error_msg.patch new file mode 100644 index 0000000..1fbff68 --- /dev/null +++ b/SOURCES/0055-rasdaemon-use-MCA-error-msg-as-error_msg.patch @@ -0,0 +1,57 @@ +From 9136d7422a6b53c50a920f3dd2539bf7fcd4fdf5 Mon Sep 17 00:00:00 2001 +From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Date: Fri, 12 Jun 2015 06:35:37 -0300 +Subject: [PATCH 4/5] rasdaemon: use MCA error msg as error_msg + +In the case of machine-checks which do not have a model-specific MCA +error code but have an architectural code only, mce_event.error_msg +becomes empty then you don't know what happened. + +(snip) +MCE records summary: + 1 errors + ^ + empty! + +(snip) +MCE events: +1 2015-06-12 00:21:46 +0900 error: , mcg mcgstatus= 0, mci Corrected_error + ^ + empty! + +Error_enabled, mcgcap=0x07000c16, status=0x9c0000000000017a, addr=0x204fffffff, misc=0x4004000000000080, walltime=0x557b0db2, cpu=0x00000001, cpuid=0x000306f3, apicid=0x00000002, bank=0x00000003 + +In such a case, let's use the content of mcastatus_msg as error_msg +instead. 
+ +(snip) +MCE records summary: + 1 Generic CACHE Level-2 Eviction Error errors +(snip) +MCE events: +1 2015-06-12 02:39:04 +0900 error: Generic CACHE Level-2 Eviction Error, mcg mcgstatus= 0, mci Corrected_error Error_enabled, mcgcap=0x07000c16, status=0x9c0000000000017a, addr=0x204fffffff, misc=0x4004000000000080, walltime=0x557b1f22, cpu=0x00000001, cpuid=0x000306f3, apicid=0x00000002, bank=0x00000003 + +Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> +Acked-by: Aristeu Rozanski <aris@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + ras-mce-handler.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 07252a0..3976f90 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -411,6 +411,9 @@ int ras_mce_event_handler(struct trace_seq *s, + if (rc) + return rc; + ++ if (!*e.error_msg && *e.mcastatus_msg) ++ mce_snprintf(e.error_msg, "%s", e.mcastatus_msg); ++ + report_mce_event(ras, record, s, &e); + + #ifdef HAVE_SQLITE3 +-- +1.8.3.1 + diff --git a/SOURCES/0056-x86-rasdaemon-Add-support-to-log-Local-Machine-Check.patch b/SOURCES/0056-x86-rasdaemon-Add-support-to-log-Local-Machine-Check.patch new file mode 100644 index 0000000..ee1163a --- /dev/null +++ b/SOURCES/0056-x86-rasdaemon-Add-support-to-log-Local-Machine-Check.patch @@ -0,0 +1,50 @@ +From fa6260eb1304c6c829af177ab4aa1937db36fab1 Mon Sep 17 00:00:00 2001 +From: Ashok Raj <ashok.raj@intel.com> +Date: Fri, 5 Jun 2015 13:32:47 -0300 +Subject: [PATCH 1/5] x86, rasdaemon: Add support to log Local Machine Check + Exception (LMCE) + +Local Machine Check Exception allows certain errors to be signaled to +only the affected logical processor. This change captures them for +rasdaemon. + +log:Changes to rasdaemon to support new architectural changes to MCE + +Changet to rasdaemon to support new architectural extentions in Intel +CPUs. 
+ +Signed-off-by: Ashok Raj <ashok.raj@intel.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + mce-intel.c | 2 ++ + ras-mce-handler.h | 1 + + 2 files changed, 3 insertions(+) + +diff --git a/mce-intel.c b/mce-intel.c +index 3684602..3503c6a 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -185,6 +185,8 @@ static void decode_mcg(struct mce_event *e) + mce_snprintf(e->mcgstatus_msg, "EIPV"); + if (mcgstatus & MCG_STATUS_MCIP) + mce_snprintf(e->mcgstatus_msg, "MCIP"); ++ if (mcgstatus & MCG_STATUS_LMCE) ++ mce_snprintf(e->mcgstatus_msg, "LMCE"); + } + + static void bank_name(struct mce_event *e) +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index 28aad00..13b8f52 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -139,6 +139,7 @@ void tulsa_decode_model(struct mce_event *e); + #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ + #define MCG_STATUS_EIPV (1ULL<<1) /* eip points to correct instruction */ + #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ ++#define MCG_STATUS_LMCE (1ULL<<3) /* local machine check signaled */ + + /* Those functions are defined on per-cpu vendor C files */ + int parse_intel_event(struct ras_events *ras, struct mce_event *e); +-- +1.8.3.1 + diff --git a/SOURCES/0057-rasdaemon-add-support-for-haswell-ex.patch b/SOURCES/0057-rasdaemon-add-support-for-haswell-ex.patch new file mode 100644 index 0000000..9ad3c0f --- /dev/null +++ b/SOURCES/0057-rasdaemon-add-support-for-haswell-ex.patch @@ -0,0 +1,22 @@ +Based on mcelog code. 
+ +Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com> + +--- + ras-mce-handler.c | 3 ++- + 1 files changed, 2 insertions(+), 1 deletions(-) + +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 3976f90..23f2488 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -90,7 +90,8 @@ static enum cputype select_intel_cputype(struct ras_events *ras) + return CPU_HASWELL; + else if (mce->model == 0x3f) + return CPU_HASWELL_EPEX; +- else if (mce->model == 0x3d) ++ else if (mce->model == 0x3d || mce->model == 0x4f || ++ mce->model == 0x56) + return CPU_BROADWELL; + else if (mce->model == 0x57) + return CPU_KNIGHTS_LANDING; diff --git a/SOURCES/0058-rasdaemon-fix-typos-on-ras-mc-ctl-man-page.patch b/SOURCES/0058-rasdaemon-fix-typos-on-ras-mc-ctl-man-page.patch new file mode 100644 index 0000000..2fb8639 --- /dev/null +++ b/SOURCES/0058-rasdaemon-fix-typos-on-ras-mc-ctl-man-page.patch @@ -0,0 +1,43 @@ +From d9fe70fe7db45618f7b46b81ebee85e7a8801870 Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski <aris@redhat.com> +Date: Mon, 10 Aug 2015 14:24:41 -0400 +Subject: [PATCH 1/5] rasdaemon: fix typos on ras-mc-ctl man page + +Fixed two markers and two typos in the documentation. + +Signed-off-by: Aristeu Rozanski <aris@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + man/ras-mc-ctl.8.in | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/man/ras-mc-ctl.8.in b/man/ras-mc-ctl.8.in +index 7441b3a..60997dd 100644 +--- a/man/ras-mc-ctl.8.in ++++ b/man/ras-mc-ctl.8.in +@@ -69,14 +69,14 @@ Display the configured labels for the current hardware, as + well as the current labels registered with EDAC. + .TP + .BI "--guess-labels" +-Print DMI labels, when bank locator is available at the DMI table. ++Print DMI labels, when bank locator is available in the DMI table. + It helps to fill the labels database at @sysconfdir@/ras/dimm_labels.d/. 
+ .TP + .BI "--labeldb="DB + Specify an alternate location for the labels database. + .TP + .BI "--delay="time +-Specify a delay of \ftime\fR seconds before registering dimm labels. ++Specify a delay of \fBtime\fR seconds before registering DIMM labels. + Only meaninful if used together with --register-labels. + .TP + .BI "--layout +@@ -121,4 +121,4 @@ back to parsing output of the \fBdmidecode\fR(8) utility. Use of this + utility will most often require that \fBras-mc-ctl\fR be run as root. + + .SH SEE ALSO +-\f\fBrasdaemon\fR(1) ++\fBrasdaemon\fR(1) +-- +1.8.3.1 + diff --git a/SOURCES/0059-rasdaemon-Add-support-for-Knights-Landing-processor.patch b/SOURCES/0059-rasdaemon-Add-support-for-Knights-Landing-processor.patch new file mode 100644 index 0000000..a0fa572 --- /dev/null +++ b/SOURCES/0059-rasdaemon-Add-support-for-Knights-Landing-processor.patch @@ -0,0 +1,213 @@ +From 2d656c4ec9d5f68ac39b2a8461b0cd4f77dd7c21 Mon Sep 17 00:00:00 2001 +From: Marcin Koss <marcin.koss@intel.com> +Date: Thu, 3 Dec 2015 15:19:47 +0100 +Subject: [PATCH 3/5] rasdaemon: Add support for Knights Landing processor + +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + Makefile.am | 3 +- + mce-intel-knl.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + mce-intel.c | 5 +++ + ras-mce-handler.c | 1 + + ras-mce-handler.h | 1 + + 5 files changed, 137 insertions(+), 1 deletion(-) + create mode 100644 mce-intel-knl.c + +diff --git a/Makefile.am b/Makefile.am +index a6bf18f..a1cb02a 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -28,7 +28,8 @@ if WITH_MCE + rasdaemon_SOURCES += ras-mce-handler.c mce-intel.c mce-amd-k8.c \ + mce-intel-p4-p6.c mce-intel-nehalem.c \ + mce-intel-dunnington.c mce-intel-tulsa.c \ +- mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c ++ mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \ ++ mce-intel-knl.c + endif + if WITH_EXTLOG + rasdaemon_SOURCES += ras-extlog-handler.c +diff --git a/mce-intel-knl.c b/mce-intel-knl.c +new file 
mode 100644 +index 0000000..96b0a59 +--- /dev/null ++++ b/mce-intel-knl.c +@@ -0,0 +1,128 @@ ++/* ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++*/ ++ ++#include <string.h> ++#include <stdio.h> ++ ++#include "ras-mce-handler.h" ++#include "bitfield.h" ++ ++static struct field memctrl_mc7[] = { ++ SBITFIELD(16, "CA Parity error"), ++ SBITFIELD(17, "Internal Parity error except WDB"), ++ SBITFIELD(18, "Internal Parity error from WDB"), ++ SBITFIELD(19, "Correctable Patrol Scrub"), ++ SBITFIELD(20, "Uncorrectable Patrol Scrub"), ++ SBITFIELD(21, "Spare Correctable Error"), ++ SBITFIELD(22, "Spare UC Error"), ++ SBITFIELD(23, "CORR Chip fail even MC only, 4 bit burst error EDC only"), ++ {} ++}; ++ ++void knl_decode_model(struct ras_events *ras, struct mce_event *e) ++{ ++ uint64_t status = e->status; ++ uint32_t mca = status & 0xffff; ++ unsigned rank0 = -1, rank1 = -1, chan = 0; ++ ++ switch (e->bank) { ++ case 5: ++ switch (EXTRACT(status, 0, 15)) { ++ case 0x402: ++ mce_snprintf(e->mcastatus_msg, "PCU Internal Errors"); ++ break; ++ case 0x403: ++ mce_snprintf(e->mcastatus_msg, "VCU Internal Errors"); ++ break; ++ case 0x407: ++ mce_snprintf(e->mcastatus_msg, "Other UBOX Internal Errors"); ++ break; ++ } ++ break; ++ case 7: case 8: case 9: case 10: ++ case 11: case 12: case 13: 
case 14: ++ case 15: case 16: ++ if ((EXTRACT(status, 0, 15)) == 0x5) { ++ mce_snprintf(e->mcastatus_msg, "Internal Parity error"); ++ } else { ++ chan = (EXTRACT(status, 0, 3)) + 3 * (e->bank == 15); ++ switch (EXTRACT(status, 4, 7)) { ++ case 0x0: ++ mce_snprintf(e->mcastatus_msg, "Undefined request on channel %d", chan); ++ break; ++ case 0x1: ++ mce_snprintf(e->mcastatus_msg, "Read on channel %d", chan); ++ break; ++ case 0x2: ++ mce_snprintf(e->mcastatus_msg, "Write on channel %d", chan); ++ break; ++ case 0x3: ++ mce_snprintf(e->mcastatus_msg, "CA error on channel %d", chan); ++ break; ++ case 0x4: ++ mce_snprintf(e->mcastatus_msg, "Scrub error on channel %d", chan); ++ break; ++ } ++ } ++ decode_bitfield(e, status, memctrl_mc7); ++ break; ++ default: ++ break; ++ } ++ ++ /* ++ * Memory error specific code. Returns if the error is not a MC one ++ */ ++ ++ /* Check if the error is at the memory controller */ ++ if ((mca >> 7) != 1) ++ return; ++ ++ /* Ignore unless this is an corrected extended error from an iMC bank */ ++ if (e->bank < 7 || e->bank > 16 || (status & MCI_STATUS_UC) || ++ !test_prefix(7, status & 0xefff)) ++ return; ++ ++ /* ++ * Parse the reported channel and ranks ++ */ ++ ++ chan = EXTRACT(status, 0, 3); ++ if (chan == 0xf) ++ { ++ mce_snprintf(e->mc_location, "memory_channel=unspecified"); ++ } ++ else ++ { ++ chan = chan + 3 * (e->bank == 15); ++ mce_snprintf(e->mc_location, "memory_channel=%d", chan); ++ ++ if (EXTRACT(e->misc, 62, 62)) ++ rank0 = EXTRACT(e->misc, 46, 50); ++ if (EXTRACT(e->misc, 63, 63)) ++ rank1 = EXTRACT(e->misc, 51, 55); ++ ++ /* ++ * FIXME: The conversion from rank to dimm requires to parse the ++ * DMI tables and call failrank2dimm(). 
++ */ ++ if (rank0 != -1 && rank1 != -1) ++ mce_snprintf(e->mc_location, "ranks=%d and %d", ++ rank0, rank1); ++ else if (rank0 != -1) ++ mce_snprintf(e->mc_location, "rank=%d", rank0); ++ } ++} +diff --git a/mce-intel.c b/mce-intel.c +index 77b929b..032f4e0 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -397,6 +397,10 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e) + break; + case CPU_HASWELL_EPEX: + hsw_decode_model(ras, e); ++ break; ++ case CPU_KNIGHTS_LANDING: ++ knl_decode_model(ras, e); ++ break; + default: + break; + } +@@ -460,6 +464,7 @@ int set_intel_imc_log(enum cputype cputype, unsigned ncpus) + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: + case CPU_HASWELL_EPEX: ++ case CPU_KNIGHTS_LANDING: + msr = 0x17f; /* MSR_ERROR_CONTROL */ + bit = 0x2; /* MemError Log Enable */ + break; +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 23f2488..3b0b05b 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -223,6 +223,7 @@ int register_mce_handler(struct ras_events *ras, unsigned ncpus) + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: + case CPU_HASWELL_EPEX: ++ case CPU_KNIGHTS_LANDING: + set_intel_imc_log(mce->cputype, ncpus); + default: + break; +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index 13b8f52..5466743 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -119,6 +119,7 @@ void dunnington_decode_model(struct mce_event *e); + void snb_decode_model(struct ras_events *ras, struct mce_event *e); + void ivb_decode_model(struct ras_events *ras, struct mce_event *e); + void hsw_decode_model(struct ras_events *ras, struct mce_event *e); ++void knl_decode_model(struct ras_events *ras, struct mce_event *e); + void tulsa_decode_model(struct mce_event *e); + + /* Software defined banks */ +-- +1.8.3.1 + diff --git a/SOURCES/0060-mce-intel-knl-Fix-CodingStyle.patch b/SOURCES/0060-mce-intel-knl-Fix-CodingStyle.patch new file mode 100644 index 0000000..3f2da38 --- /dev/null +++ 
b/SOURCES/0060-mce-intel-knl-Fix-CodingStyle.patch @@ -0,0 +1,106 @@ +From 17f4e17d9870fbd35572ae6bf6c227c787b07fe9 Mon Sep 17 00:00:00 2001 +From: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +Date: Fri, 5 Feb 2016 15:15:18 -0200 +Subject: [PATCH 4/5] mce-intel-knl: Fix CodingStyle + +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + mce-intel-knl.c | 43 +++++++++++++++++++++++++++---------------- + 1 file changed, 27 insertions(+), 16 deletions(-) + +diff --git a/mce-intel-knl.c b/mce-intel-knl.c +index 96b0a59..7062fbb 100644 +--- a/mce-intel-knl.c ++++ b/mce-intel-knl.c +@@ -48,32 +48,46 @@ void knl_decode_model(struct ras_events *ras, struct mce_event *e) + mce_snprintf(e->mcastatus_msg, "VCU Internal Errors"); + break; + case 0x407: +- mce_snprintf(e->mcastatus_msg, "Other UBOX Internal Errors"); ++ mce_snprintf(e->mcastatus_msg, ++ "Other UBOX Internal Errors"); + break; + } + break; +- case 7: case 8: case 9: case 10: +- case 11: case 12: case 13: case 14: +- case 15: case 16: ++ case 7: ++ case 8: ++ case 9: ++ case 10: ++ case 11: ++ case 12: ++ case 13: ++ case 14: ++ case 15: ++ case 16: + if ((EXTRACT(status, 0, 15)) == 0x5) { + mce_snprintf(e->mcastatus_msg, "Internal Parity error"); + } else { + chan = (EXTRACT(status, 0, 3)) + 3 * (e->bank == 15); + switch (EXTRACT(status, 4, 7)) { + case 0x0: +- mce_snprintf(e->mcastatus_msg, "Undefined request on channel %d", chan); ++ mce_snprintf(e->mcastatus_msg, ++ "Undefined request on channel %d", ++ chan); + break; + case 0x1: +- mce_snprintf(e->mcastatus_msg, "Read on channel %d", chan); ++ mce_snprintf(e->mcastatus_msg, ++ "Read on channel %d", chan); + break; + case 0x2: +- mce_snprintf(e->mcastatus_msg, "Write on channel %d", chan); ++ mce_snprintf(e->mcastatus_msg, ++ "Write on channel %d", chan); + break; + case 0x3: +- mce_snprintf(e->mcastatus_msg, "CA error on channel %d", chan); ++ mce_snprintf(e->mcastatus_msg, ++ "CA error on channel %d", chan); + break; + case 0x4: +- 
mce_snprintf(e->mcastatus_msg, "Scrub error on channel %d", chan); ++ mce_snprintf(e->mcastatus_msg, ++ "Scrub error on channel %d", chan); + break; + } + } +@@ -93,7 +107,7 @@ void knl_decode_model(struct ras_events *ras, struct mce_event *e) + + /* Ignore unless this is an corrected extended error from an iMC bank */ + if (e->bank < 7 || e->bank > 16 || (status & MCI_STATUS_UC) || +- !test_prefix(7, status & 0xefff)) ++ !test_prefix(7, status & 0xefff)) + return; + + /* +@@ -101,12 +115,9 @@ void knl_decode_model(struct ras_events *ras, struct mce_event *e) + */ + + chan = EXTRACT(status, 0, 3); +- if (chan == 0xf) +- { ++ if (chan == 0xf) { + mce_snprintf(e->mc_location, "memory_channel=unspecified"); +- } +- else +- { ++ } else { + chan = chan + 3 * (e->bank == 15); + mce_snprintf(e->mc_location, "memory_channel=%d", chan); + +@@ -121,7 +132,7 @@ void knl_decode_model(struct ras_events *ras, struct mce_event *e) + */ + if (rank0 != -1 && rank1 != -1) + mce_snprintf(e->mc_location, "ranks=%d and %d", +- rank0, rank1); ++ rank0, rank1); + else if (rank0 != -1) + mce_snprintf(e->mc_location, "rank=%d", rank0); + } +-- +1.8.3.1 + diff --git a/SOURCES/0061-Add-Broadwell-DE-MSCOD-values.patch b/SOURCES/0061-Add-Broadwell-DE-MSCOD-values.patch new file mode 100644 index 0000000..d32380c --- /dev/null +++ b/SOURCES/0061-Add-Broadwell-DE-MSCOD-values.patch @@ -0,0 +1,244 @@ +From e7b88730f8a753a50fa0b8d1f7027f79baa05ca4 Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski <arozansk@redhat.com> +Date: Fri, 8 Apr 2016 15:07:18 -0400 +Subject: [PATCH 1/2] Add Broadwell DE MSCOD values + +Based on mcelog commit id 32252e9c37e97ea5083d90d2cf194bb85a4a0cda. 
+ +Signed-off-by: Aristeu Rozanski <arozansk@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + Makefile.am | 2 +- + mce-intel-broadwell-de.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++ + mce-intel.c | 3 + + ras-mce-handler.c | 6 +- + ras-mce-handler.h | 2 + + 5 files changed, 156 insertions(+), 3 deletions(-) + create mode 100644 mce-intel-broadwell-de.c + +diff --git a/Makefile.am b/Makefile.am +index a1cb02a..a8477d3 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -29,7 +29,7 @@ if WITH_MCE + mce-intel-p4-p6.c mce-intel-nehalem.c \ + mce-intel-dunnington.c mce-intel-tulsa.c \ + mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \ +- mce-intel-knl.c ++ mce-intel-knl.c mce-intel-broadwell-de.c + endif + if WITH_EXTLOG + rasdaemon_SOURCES += ras-extlog-handler.c +diff --git a/mce-intel-broadwell-de.c b/mce-intel-broadwell-de.c +new file mode 100644 +index 0000000..d52c82e +--- /dev/null ++++ b/mce-intel-broadwell-de.c +@@ -0,0 +1,146 @@ ++/* ++ * The code below came from Tony Luck's mcelog code, ++ * released under GNU Public General License, v.2 ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++*/ ++ ++#include <string.h> ++#include <stdio.h> ++ ++#include "ras-mce-handler.h" ++#include "bitfield.h" ++ ++/* See IA32 SDM Vol3B Table 16-24 */ ++ ++static char *pcu_1[] = { ++ [0x00] = "No Error", ++ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT", ++ [0x13] = "MC_DMI_TRAINING_TIMEOUT", ++ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT", ++ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX", ++ [0x25] = "MC_SVID_COMMAN_TIMEOUT", ++ [0x26] = "MCA_PKGC_DIRECT_WAKE_RING_TIMEOUT", ++ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID", ++ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN", ++ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP", ++ [0x44] = "MC_CRITICAL_VR_FAILED", ++ [0x46] = "MC_VID_RAMP_DOWN_FAILED", ++ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED", ++ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0", ++ [0x4F] = "MC_SVID_COMMAND_ERROR", ++ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT", ++ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT", ++ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED", ++ [0x58] = "MC_SVID_IMON_REQUEST_FAILED", ++ [0x59] = "MC_SVID_ALERT_REQUEST_FAILED", ++ [0x62] = "MC_INVALID_PKGS_RSP_QPI", ++ [0x64] = "MC_INVALID_PKG_STATE_CONFIG", ++ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT", ++ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT", ++ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER", ++ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT" ++}; ++ ++static struct field pcu_mc4[] = { ++ FIELD(24, pcu_1), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-18 */ ++ ++static struct field memctrl_mc9[] = { ++ SBITFIELD(16, "Address parity error"), ++ SBITFIELD(17, "HA Wrt buffer Data parity error"), ++ SBITFIELD(18, "HA Wrt byte enable parity error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare 
error"), ++ SBITFIELD(23, "Corrected memory read error"), ++ SBITFIELD(24, "iMC, WDB, parity errors"), ++ {} ++}; ++ ++void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e) ++{ ++ uint64_t status = e->status; ++ uint32_t mca = status & 0xffff; ++ unsigned rank0 = -1, rank1 = -1, chan; ++ ++ switch (e->bank) { ++ case 4: ++ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) { ++ case 0x402: case 0x403: ++ mce_snprintf(e->mcastatus_msg, "Internal errors "); ++ break; ++ case 0x406: ++ mce_snprintf(e->mcastatus_msg, "Intel TXT errors "); ++ break; ++ case 0x407: ++ mce_snprintf(e->mcastatus_msg, "Other UBOX Internal errors "); ++ break; ++ } ++ if (EXTRACT(status, 16, 19) & 3) ++ mce_snprintf(e->mcastatus_msg, "PCU internal error "); ++ if (EXTRACT(status, 20, 23) & 4) ++ mce_snprintf(e->mcastatus_msg, "Ubox error "); ++ decode_bitfield(e, status, pcu_mc4); ++ break; ++ case 9: case 10: ++ mce_snprintf(e->mcastatus_msg, "MemCtrl: "); ++ decode_bitfield(e, status, memctrl_mc9); ++ break; ++ } ++ ++ /* ++ * Memory error specific code. Returns if the error is not a MC one ++ */ ++ ++ /* Check if the error is at the memory controller */ ++ if ((mca >> 7) != 1) ++ return; ++ ++ /* Ignore unless this is an corrected extended error from an iMC bank */ ++ if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) || ++ !test_prefix(7, status & 0xefff)) ++ return; ++ ++ /* ++ * Parse the reported channel and ranks ++ */ ++ ++ chan = EXTRACT(status, 0, 3); ++ if (chan == 0xf) ++ return; ++ ++ mce_snprintf(e->mc_location, "memory_channel=%d", chan); ++ ++ if (EXTRACT(e->misc, 62, 62)) { ++ rank0 = EXTRACT(e->misc, 46, 50); ++ if (EXTRACT(e->misc, 63, 63)) ++ rank1 = EXTRACT(e->misc, 51, 55); ++ } ++ ++ /* ++ * FIXME: The conversion from rank to dimm requires to parse the ++ * DMI tables and call failrank2dimm(). 
++ */ ++ if (rank0 != -1 && rank1 != -1) ++ mce_snprintf(e->mc_location, "ranks=%d and %d", ++ rank0, rank1); ++ else if (rank0 != -1) ++ mce_snprintf(e->mc_location, "rank=%d", rank0); ++} +diff --git a/mce-intel.c b/mce-intel.c +index 032f4e0..b132903 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -401,6 +401,9 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e) + case CPU_KNIGHTS_LANDING: + knl_decode_model(ras, e); + break; ++ case CPU_BROADWELL_DE: ++ broadwell_de_decode_model(ras, e); ++ break; + default: + break; + } +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 3b0b05b..b58d6e0 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -50,6 +50,7 @@ static char *cputype_name[] = { + [CPU_HASWELL] = "Haswell", + [CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX", + [CPU_BROADWELL] = "Broadwell", ++ [CPU_BROADWELL_DE] = "Broadwell DE", + [CPU_KNIGHTS_LANDING] = "Knights Landing", + }; + +@@ -90,8 +91,9 @@ static enum cputype select_intel_cputype(struct ras_events *ras) + return CPU_HASWELL; + else if (mce->model == 0x3f) + return CPU_HASWELL_EPEX; +- else if (mce->model == 0x3d || mce->model == 0x4f || +- mce->model == 0x56) ++ else if (mce->model == 0x56) ++ return CPU_BROADWELL_DE; ++ else if (mce->model == 0x3d || mce->model == 0x4f) + return CPU_BROADWELL; + else if (mce->model == 0x57) + return CPU_KNIGHTS_LANDING; +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index 5466743..2648048 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -45,6 +45,7 @@ enum cputype { + CPU_HASWELL, + CPU_HASWELL_EPEX, + CPU_BROADWELL, ++ CPU_BROADWELL_DE, + CPU_KNIGHTS_LANDING, + }; + +@@ -121,6 +122,7 @@ void ivb_decode_model(struct ras_events *ras, struct mce_event *e); + void hsw_decode_model(struct ras_events *ras, struct mce_event *e); + void knl_decode_model(struct ras_events *ras, struct mce_event *e); + void tulsa_decode_model(struct mce_event *e); ++void broadwell_de_decode_model(struct ras_events *ras, struct 
mce_event *e); + + /* Software defined banks */ + #define MCE_EXTENDED_BANK 128 +-- +1.8.3.1 + diff --git a/SOURCES/0062-Add-Broadwell-EP-EX-MSCOD-values.patch b/SOURCES/0062-Add-Broadwell-EP-EX-MSCOD-values.patch new file mode 100644 index 0000000..23f8f81 --- /dev/null +++ b/SOURCES/0062-Add-Broadwell-EP-EX-MSCOD-values.patch @@ -0,0 +1,289 @@ +From 0dd44fca9d756990acf01cd2cdaa585f369168bc Mon Sep 17 00:00:00 2001 +From: Aristeu Rozanski <arozansk@redhat.com> +Date: Fri, 8 Apr 2016 15:07:19 -0400 +Subject: [PATCH 2/2] Add Broadwell EP/EX MSCOD values + +Based on mcelog commit id 32252e9c37e97ea5083d90d2cf194bb85a4a0cda. + +Signed-off-by: Aristeu Rozanski <arozansk@redhat.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com> +--- + Makefile.am | 3 +- + mce-intel-broadwell-epex.c | 191 +++++++++++++++++++++++++++++++++++++++++++++ + mce-intel.c | 3 + + ras-mce-handler.c | 5 +- + ras-mce-handler.h | 2 + + 5 files changed, 202 insertions(+), 2 deletions(-) + create mode 100644 mce-intel-broadwell-epex.c + +diff --git a/Makefile.am b/Makefile.am +index a8477d3..c9e4481 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -29,7 +29,8 @@ if WITH_MCE + mce-intel-p4-p6.c mce-intel-nehalem.c \ + mce-intel-dunnington.c mce-intel-tulsa.c \ + mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \ +- mce-intel-knl.c mce-intel-broadwell-de.c ++ mce-intel-knl.c mce-intel-broadwell-de.c \ ++ mce-intel-broadwell-epex.c + endif + if WITH_EXTLOG + rasdaemon_SOURCES += ras-extlog-handler.c +diff --git a/mce-intel-broadwell-epex.c b/mce-intel-broadwell-epex.c +new file mode 100644 +index 0000000..f7cd3b6 +--- /dev/null ++++ b/mce-intel-broadwell-epex.c +@@ -0,0 +1,191 @@ ++/* ++ * The code below came from Tony Luck's mcelog code, ++ * released under GNU Public General License, v.2 ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either 
version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++*/ ++ ++#include <string.h> ++#include <stdio.h> ++ ++#include "ras-mce-handler.h" ++#include "bitfield.h" ++ ++/* See IA32 SDM Vol3B Table 16-20 */ ++ ++static char *pcu_1[] = { ++ [0x00] = "No Error", ++ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT", ++ [0x0D] = "MC_IMC_FORCE_SR_S3_TIMEOUT", ++ [0x0E] = "MC_CPD_UNCPD_SD_TIMEOUT", ++ [0x13] = "MC_DMI_TRAINING_TIMEOUT", ++ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT", ++ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX", ++ [0x25] = "MC_SVID_COMMAN_TIMEOUT", ++ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID", ++ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN", ++ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP", ++ [0x39] = "MC_PKGC_WATCHDOG_HANG_C3_UP_SF", ++ [0x44] = "MC_CRITICAL_VR_FAILED", ++ [0x45] = "MC_ICC_MAX_NOTSUPPORTED", ++ [0x46] = "MC_VID_RAMP_DOWN_FAILED", ++ [0x47] = "MC_EXCL_MODE_NO_PMREQ_CMP", ++ [0x48] = "MC_SVID_READ_REG_ICC_MAX_FAILED", ++ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED", ++ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0", ++ [0x4C] = "MC_BOOT_VID_TIMEOUT_DRAM_1", ++ [0x4D] = "MC_BOOT_VID_TIMEOUT_DRAM_2", ++ [0x4E] = "MC_BOOT_VID_TIMEOUT_DRAM_3", ++ [0x4F] = "MC_SVID_COMMAND_ERROR", ++ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT", ++ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT", ++ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED", ++ [0x58] = "MC_SVID_IMON_REQUEST_FAILED", ++ [0x59] = "MC_SVID_ALERT_REQUEST_FAILED", ++ [0x60] = "MC_INVALID_PKGS_REQ_PCH", ++ [0x61] = "MC_INVALID_PKGS_REQ_QPI", ++ [0x62] = 
"MC_INVALID_PKGS_RSP_QPI", ++ [0x63] = "MC_INVALID_PKGS_RSP_PCH", ++ [0x64] = "MC_INVALID_PKG_STATE_CONFIG", ++ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT", ++ [0x68] = "MC_IMC_RW_SMBUS_TIMEOUT", ++ [0x69] = "MC_HA_FAILSTS_CHANGE_DETECTED", ++ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT", ++ [0x70] = "MC_WATCHDOG_TIMEOUT_PKGC_SLAVE", ++ [0x71] = "MC_WATCHDOG_TIMEOUT_PKGC_MASTER", ++ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER", ++ [0x7C] = "MC_BIOS_RST_CPL_INVALID_SEQ", ++ [0x7D] = "MC_MORE_THAN_ONE_TXT_AGENT", ++ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT" ++}; ++ ++static struct field pcu_mc4[] = { ++ FIELD(24, pcu_1), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-21 */ ++ ++static char *qpi[] = { ++ [0x02] = "Intel QPI physical layer detected drift buffer alarm", ++ [0x03] = "Intel QPI physical layer detected latency buffer rollover", ++ [0x10] = "Intel QPI link layer detected control error from R3QPI", ++ [0x11] = "Rx entered LLR abort state on CRC error", ++ [0x12] = "Unsupported or undefined packet", ++ [0x13] = "Intel QPI link layer control error", ++ [0x15] = "RBT used un-initialized value", ++ [0x20] = "Intel QPI physical layer detected a QPI in-band reset but aborted initialization", ++ [0x21] = "Link failover data self healing", ++ [0x22] = "Phy detected in-band reset (no width change)", ++ [0x23] = "Link failover clock failover", ++ [0x30] = "Rx detected CRC error - successful LLR after Phy re-init", ++ [0x31] = "Rx detected CRC error - successful LLR wihout Phy re-init", ++}; ++ ++static struct field qpi_mc[] = { ++ FIELD(16, qpi), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-26 */ ++ ++static struct field memctrl_mc9[] = { ++ SBITFIELD(16, "DDR3 address parity error"), ++ SBITFIELD(17, "Uncorrected HA write data error"), ++ SBITFIELD(18, "Uncorrected HA data byte enable error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare 
error"), ++ SBITFIELD(24, "iMC write data buffer parity error"), ++ SBITFIELD(25, "DDR4 command address parity error"), ++ {} ++}; ++ ++void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e) ++{ ++ uint64_t status = e->status; ++ uint32_t mca = status & 0xffff; ++ unsigned rank0 = -1, rank1 = -1, chan; ++ ++ switch (e->bank) { ++ case 4: ++ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) { ++ case 0x402: case 0x403: ++ mce_snprintf(e->mcastatus_msg, "Internal errors "); ++ break; ++ case 0x406: ++ mce_snprintf(e->mcastatus_msg, "Intel TXT errors "); ++ break; ++ case 0x407: ++ mce_snprintf(e->mcastatus_msg, "Other UBOX Internal errors "); ++ break; ++ } ++ if (EXTRACT(status, 16, 19)) ++ mce_snprintf(e->mcastatus_msg, "PCU internal error "); ++ decode_bitfield(e, status, pcu_mc4); ++ break; ++ case 5: ++ case 20: ++ case 21: ++ mce_snprintf(e->mcastatus_msg, "QPI: "); ++ decode_bitfield(e, status, qpi_mc); ++ break; ++ case 9: case 10: case 11: case 12: ++ case 13: case 14: case 15: case 16: ++ mce_snprintf(e->mcastatus_msg, "MemCtrl: "); ++ decode_bitfield(e, status, memctrl_mc9); ++ break; ++ } ++ ++ /* ++ * Memory error specific code. Returns if the error is not a MC one ++ */ ++ ++ /* Check if the error is at the memory controller */ ++ if ((mca >> 7) != 1) ++ return; ++ ++ /* Ignore unless this is an corrected extended error from an iMC bank */ ++ if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) || ++ !test_prefix(7, status & 0xefff)) ++ return; ++ ++ /* ++ * Parse the reported channel and ranks ++ */ ++ ++ chan = EXTRACT(status, 0, 3); ++ if (chan == 0xf) ++ return; ++ ++ mce_snprintf(e->mc_location, "memory_channel=%d", chan); ++ ++ if (EXTRACT(e->misc, 62, 62)) { ++ rank0 = EXTRACT(e->misc, 46, 50); ++ if (EXTRACT(e->misc, 63, 63)) ++ rank1 = EXTRACT(e->misc, 51, 55); ++ } ++ ++ /* ++ * FIXME: The conversion from rank to dimm requires to parse the ++ * DMI tables and call failrank2dimm(). 
++ */ ++ if (rank0 != -1 && rank1 != -1) ++ mce_snprintf(e->mc_location, "ranks=%d and %d", ++ rank0, rank1); ++ else if (rank0 != -1) ++ mce_snprintf(e->mc_location, "rank=%d", rank0); ++} +diff --git a/mce-intel.c b/mce-intel.c +index b132903..bf68d9b 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -404,6 +404,9 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e) + case CPU_BROADWELL_DE: + broadwell_de_decode_model(ras, e); + break; ++ case CPU_BROADWELL_EPEX: ++ broadwell_epex_decode_model(ras, e); ++ break; + default: + break; + } +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index b58d6e0..b875512 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -51,6 +51,7 @@ static char *cputype_name[] = { + [CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX", + [CPU_BROADWELL] = "Broadwell", + [CPU_BROADWELL_DE] = "Broadwell DE", ++ [CPU_BROADWELL_EPEX] = "Broadwell EP/EX", + [CPU_KNIGHTS_LANDING] = "Knights Landing", + }; + +@@ -93,7 +94,9 @@ static enum cputype select_intel_cputype(struct ras_events *ras) + return CPU_HASWELL_EPEX; + else if (mce->model == 0x56) + return CPU_BROADWELL_DE; +- else if (mce->model == 0x3d || mce->model == 0x4f) ++ else if (mce->model == 0x4f) ++ return CPU_BROADWELL_EPEX; ++ else if (mce->model == 0x3d) + return CPU_BROADWELL; + else if (mce->model == 0x57) + return CPU_KNIGHTS_LANDING; +diff --git a/ras-mce-handler.h b/ras-mce-handler.h +index 2648048..c5a3717 100644 +--- a/ras-mce-handler.h ++++ b/ras-mce-handler.h +@@ -46,6 +46,7 @@ enum cputype { + CPU_HASWELL_EPEX, + CPU_BROADWELL, + CPU_BROADWELL_DE, ++ CPU_BROADWELL_EPEX, + CPU_KNIGHTS_LANDING, + }; + +@@ -123,6 +124,7 @@ void hsw_decode_model(struct ras_events *ras, struct mce_event *e); + void knl_decode_model(struct ras_events *ras, struct mce_event *e); + void tulsa_decode_model(struct mce_event *e); + void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e); ++void broadwell_epex_decode_model(struct ras_events *ras, struct 
mce_event *e); + + /* Software defined banks */ + #define MCE_EXTENDED_BANK 128 +-- +1.8.3.1 + diff --git a/SOURCES/0063-add_support_for_knights_mill.patch b/SOURCES/0063-add_support_for_knights_mill.patch new file mode 100644 index 0000000..5362ea0 --- /dev/null +++ b/SOURCES/0063-add_support_for_knights_mill.patch @@ -0,0 +1,63 @@ +--- + mce-intel.c | 3 +++ + ras-mce-handler.c | 5 +++++ + ras-mce-handler.h | 1 + + 3 files changed, 9 insertions(+) + +--- rasdaemon-0.4.1.orig/mce-intel.c 2017-05-30 12:04:54.440167730 -0400 ++++ rasdaemon-0.4.1/mce-intel.c 2017-05-30 12:06:51.705755469 -0400 +@@ -399,6 +399,7 @@ if (test_prefix(11, (e->status & 0xffffL + hsw_decode_model(ras, e); + break; + case CPU_KNIGHTS_LANDING: ++ case CPU_KNIGHTS_MILL: + knl_decode_model(ras, e); + break; + case CPU_BROADWELL_DE: +@@ -470,6 +471,8 @@ int set_intel_imc_log(enum cputype cputy + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: + case CPU_HASWELL_EPEX: ++ case CPU_KNIGHTS_LANDING: ++ case CPU_KNIGHTS_MILL: + msr = 0x17f; /* MSR_ERROR_CONTROL */ + bit = 0x2; /* MemError Log Enable */ + break; +--- rasdaemon-0.4.1.orig/ras-mce-handler.c 2017-05-30 12:04:54.440167730 -0400 ++++ rasdaemon-0.4.1/ras-mce-handler.c 2017-05-30 12:07:59.850934779 -0400 +@@ -53,6 +53,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series + [CPU_BROADWELL_DE] = "Broadwell DE", + [CPU_BROADWELL_EPEX] = "Broadwell EP/EX", + [CPU_KNIGHTS_LANDING] = "Knights Landing", ++ [CPU_KNIGHTS_MILL] = "Knights Mill", + }; + + static enum cputype select_intel_cputype(struct ras_events *ras) +@@ -100,6 +101,8 @@ else if (mce->model == 0x3d) + return CPU_BROADWELL; + else if (mce->model == 0x57) + return CPU_KNIGHTS_LANDING; ++ else if (mce->model == 0x85) ++ return CPU_KNIGHTS_MILL; + + if (mce->model > 0x1a) { + log(ALL, LOG_INFO, +@@ -228,6 +231,8 @@ int register_mce_handler(struct ras_even + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: + case CPU_HASWELL_EPEX: ++ case CPU_KNIGHTS_LANDING: ++ case CPU_KNIGHTS_MILL: + 
set_intel_imc_log(mce->cputype, ncpus); + default: + break; +--- rasdaemon-0.4.1.orig/ras-mce-handler.h 2017-05-30 12:04:54.440167730 -0400 ++++ rasdaemon-0.4.1/ras-mce-handler.h 2017-05-30 12:04:58.976113103 -0400 +@@ -48,6 +48,7 @@ enum cputype { + CPU_BROADWELL_DE, + CPU_BROADWELL_EPEX, + CPU_KNIGHTS_LANDING, ++ CPU_KNIGHTS_MILL, + }; + + struct mce_event { diff --git a/SOURCES/0064-add_support_for_skylake.patch b/SOURCES/0064-add_support_for_skylake.patch new file mode 100644 index 0000000..d6666c9 --- /dev/null +++ b/SOURCES/0064-add_support_for_skylake.patch @@ -0,0 +1,344 @@ +commit f9a5724021d8bc9f38cee3a0a71eb4032da1ec66 +Author: Aristeu Rozanski <arozansk@redhat.com> +Date: Mon Sep 19 15:28:33 2016 -0400 + + rasdaemon: add support for Skylake client and server + + Base on upstream mcelog commits + 6c07f906dadfe2c4bb7a21e5fc60dc2f34056bf0 + e4aca6312aee03066ab45632a7bee23dc892a425 + + Signed-off-by: Aristeu Rozanski <arozansk@redhat.com> + +--- + Makefile.am | 2 + mce-intel-skx.c | 257 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + mce-intel.c | 3 + ras-mce-handler.c | 6 + + ras-mce-handler.h | 3 + 5 files changed, 270 insertions(+), 1 deletion(-) + +--- rasdaemon-0.4.1.orig/Makefile.am 2017-05-30 12:43:11.975591485 -0400 ++++ rasdaemon-0.4.1/Makefile.am 2017-05-30 12:43:16.948531592 -0400 +@@ -30,7 +30,7 @@ if WITH_MCE + mce-intel-dunnington.c mce-intel-tulsa.c \ + mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \ + mce-intel-knl.c mce-intel-broadwell-de.c \ +- mce-intel-broadwell-epex.c ++ mce-intel-broadwell-epex.c mce-intel-skx.c + endif + if WITH_EXTLOG + rasdaemon_SOURCES += ras-extlog-handler.c +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ rasdaemon-0.4.1/mce-intel-skx.c 2017-05-30 12:43:16.948531592 -0400 +@@ -0,0 +1,257 @@ ++/* ++ * The code below came from Tony Luck mcelog code, ++ * released under GNU Public General License, v.2 ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under 
the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++*/ ++ ++#include <string.h> ++#include <stdio.h> ++ ++#include "ras-mce-handler.h" ++#include "bitfield.h" ++ ++ ++/* See IA32 SDM Vol3B Table 16-27 */ ++ ++static char *pcu_1[] = { ++ [0x00] = "No Error", ++ [0x0d] = "MCA_DMI_TRAINING_TIMEOUT", ++ [0x0f] = "MCA_DMI_CPU_RESET_ACK_TIMEOUT", ++ [0x10] = "MCA_MORE_THAN_ONE_LT_AGENT", ++ [0x1e] = "MCA_BIOS_RST_CPL_INVALID_SEQ", ++ [0x1f] = "MCA_BIOS_INVALID_PKG_STATE_CONFIG", ++ [0x25] = "MCA_MESSAGE_CHANNEL_TIMEOUT", ++ [0x27] = "MCA_MSGCH_PMREQ_CMP_TIMEOUT", ++ [0x30] = "MCA_PKGC_DIRECT_WAKE_RING_TIMEOUT", ++ [0x31] = "MCA_PKGC_INVALID_RSP_PCH", ++ [0x33] = "MCA_PKGC_WATCHDOG_HANG_CBZ_DOWN", ++ [0x34] = "MCA_PKGC_WATCHDOG_HANG_CBZ_UP", ++ [0x38] = "MCA_PKGC_WATCHDOG_HANG_C3_UP_SF", ++ [0x40] = "MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE", ++ [0x41] = "MCA_SVID_COMMAND_TIMEOUT", ++ [0x42] = "MCA_SVID_VCCIN_VR_VOUT_FAILURE", ++ [0x43] = "MCA_SVID_CPU_VR_CAPABILITY_ERROR", ++ [0x44] = "MCA_SVID_CRITICAL_VR_FAILED", ++ [0x45] = "MCA_SVID_SA_ITD_ERROR", ++ [0x46] = "MCA_SVID_READ_REG_FAILED", ++ [0x47] = "MCA_SVID_WRITE_REG_FAILED", ++ [0x48] = "MCA_SVID_PKGC_INIT_FAILED", ++ [0x49] = "MCA_SVID_PKGC_CONFIG_FAILED", ++ [0x4a] = "MCA_SVID_PKGC_REQUEST_FAILED", ++ [0x4b] = "MCA_SVID_IMON_REQUEST_FAILED", ++ [0x4c] = "MCA_SVID_ALERT_REQUEST_FAILED", ++ [0x4d] = 
"MCA_SVID_MCP_VR_ABSENT_OR_RAMP_ERROR", ++ [0x4e] = "MCA_SVID_UNEXPECTED_MCP_VR_DETECTED", ++ [0x51] = "MCA_FIVR_CATAS_OVERVOL_FAULT", ++ [0x52] = "MCA_FIVR_CATAS_OVERCUR_FAULT", ++ [0x58] = "MCA_WATCHDOG_TIMEOUT_PKGC_SLAVE", ++ [0x59] = "MCA_WATCHDOG_TIMEOUT_PKGC_MASTER", ++ [0x5a] = "MCA_WATCHDOG_TIMEOUT_PKGS_MASTER", ++ [0x61] = "MCA_PKGS_CPD_UNCPD_TIMEOUT", ++ [0x63] = "MCA_PKGS_INVALID_REQ_PCH", ++ [0x64] = "MCA_PKGS_INVALID_REQ_INTERNAL", ++ [0x65] = "MCA_PKGS_INVALID_RSP_INTERNAL", ++ [0x6b] = "MCA_PKGS_SMBUS_VPP_PAUSE_TIMEOUT", ++ [0x81] = "MCA_RECOVERABLE_DIE_THERMAL_TOO_HOT", ++}; ++ ++static struct field pcu_mc4[] = { ++ FIELD(24, pcu_1), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-28 */ ++ ++static char *qpi[] = { ++ [0x00] = "UC Phy Initialization Failure", ++ [0x01] = "UC Phy detected drift buffer alarm", ++ [0x02] = "UC Phy detected latency buffer rollover", ++ [0x10] = "UC LL Rx detected CRC error: unsuccessful LLR: entered abort state", ++ [0x11] = "UC LL Rx unsupported or undefined packet", ++ [0x12] = "UC LL or Phy control error", ++ [0x13] = "UC LL Rx parameter exchange exception", ++ [0x1F] = "UC LL detected control error from the link-mesh interface", ++ [0x20] = "COR Phy initialization abort", ++ [0x21] = "COR Phy reset", ++ [0x22] = "COR Phy lane failure, recovery in x8 width", ++ [0x23] = "COR Phy L0c error corrected without Phy reset", ++ [0x24] = "COR Phy L0c error triggering Phy Reset", ++ [0x25] = "COR Phy L0p exit error corrected with Phy reset", ++ [0x30] = "COR LL Rx detected CRC error - successful LLR without Phy Reinit", ++ [0x31] = "COR LL Rx detected CRC error - successful LLR with Phy Reinit", ++}; ++ ++static struct field qpi_mc[] = { ++ FIELD(16, qpi), ++ {} ++}; ++ ++/* These apply to MSCOD 0x12 "UC LL or Phy control error" */ ++static struct field qpi_0x12[] = { ++ SBITFIELD(22, "Phy Control Error"), ++ SBITFIELD(23, "Unexpected Retry.Ack flit"), ++ SBITFIELD(24, "Unexpected Retry.Req flit"), ++ SBITFIELD(25, "RF parity 
error"), ++ SBITFIELD(26, "Routeback Table error"), ++ SBITFIELD(27, "unexpected Tx Protocol flit (EOP, Header or Data)"), ++ SBITFIELD(28, "Rx Header-or-Credit BGF credit overflow/underflow"), ++ SBITFIELD(29, "Link Layer Reset still in progress when Phy enters L0"), ++ SBITFIELD(30, "Link Layer reset initiated while protocol traffic not idle"), ++ SBITFIELD(31, "Link Layer Tx Parity Error"), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-29 */ ++ ++static struct field mc_bits[] = { ++ SBITFIELD(16, "Address parity error"), ++ SBITFIELD(17, "HA write data parity error"), ++ SBITFIELD(18, "HA write byte enable parity error"), ++ SBITFIELD(19, "Corrected patrol scrub error"), ++ SBITFIELD(20, "Uncorrected patrol scrub error"), ++ SBITFIELD(21, "Corrected spare error"), ++ SBITFIELD(22, "Uncorrected spare error"), ++ SBITFIELD(23, "Any HA read error"), ++ SBITFIELD(24, "WDB read parity error"), ++ SBITFIELD(25, "DDR4 command address parity error"), ++ SBITFIELD(26, "Uncorrected address parity error"), ++ {} ++}; ++ ++static char *mc_0x8xx[] = { ++ [0x0] = "Unrecognized request type", ++ [0x1] = "Read response to an invalid scoreboard entry", ++ [0x2] = "Unexpected read response", ++ [0x3] = "DDR4 completion to an invalid scoreboard entry", ++ [0x4] = "Completion to an invalid scoreboard entry", ++ [0x5] = "Completion FIFO overflow", ++ [0x6] = "Correctable parity error", ++ [0x7] = "Uncorrectable error", ++ [0x8] = "Interrupt received while outstanding interrupt was not ACKed", ++ [0x9] = "ERID FIFO overflow", ++ [0xa] = "Error on Write credits", ++ [0xb] = "Error on Read credits", ++ [0xc] = "Scheduler error", ++ [0xd] = "Error event", ++}; ++ ++static struct field memctrl_mc13[] = { ++ FIELD(16, mc_0x8xx), ++ {} ++}; ++ ++/* See IA32 SDM Vol3B Table 16-30 */ ++ ++static struct field m2m[] = { ++ SBITFIELD(16, "MscodDataRdErr"), ++ SBITFIELD(17, "Reserved"), ++ SBITFIELD(18, "MscodPtlWrErr"), ++ SBITFIELD(19, "MscodFullWrErr"), ++ SBITFIELD(20, "MscodBgfErr"), ++ 
SBITFIELD(21, "MscodTimeout"), ++ SBITFIELD(22, "MscodParErr"), ++ SBITFIELD(23, "MscodBucket1Err"), ++ {} ++}; ++ ++void skylake_xeon_decode_model(struct ras_events *ras, struct mce_event *e) ++{ ++ uint64_t status = e->status; ++ uint32_t mca = status & 0xffff; ++ unsigned rank0 = -1, rank1 = -1, chan; ++ ++ switch (e->bank) { ++ case 4: ++ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) { ++ case 0x402: case 0x403: ++ mce_snprintf(e->mcastatus_msg, "Internal errors "); ++ break; ++ case 0x406: ++ mce_snprintf(e->mcastatus_msg, "Intel TXT errors "); ++ break; ++ case 0x407: ++ mce_snprintf(e->mcastatus_msg, "Other UBOX Internal errors "); ++ break; ++ } ++ if (EXTRACT(status, 16, 19)) ++ mce_snprintf(e->mcastatus_msg, "PCU internal error "); ++ decode_bitfield(e, status, pcu_mc4); ++ break; ++ case 5: ++ case 12: ++ case 19: ++ mce_snprintf(e->mcastatus_msg, "QPI: "); ++ decode_bitfield(e, status, qpi_mc); ++ if ((EXTRACT(status, 16, 21) == 0x12)) ++ decode_bitfield(e, status, qpi_0x12); ++ break; ++ case 7: ++ case 8: ++ mce_snprintf(e->mcastatus_msg, "M2M: "); ++ decode_bitfield(e, status, m2m); ++ break; ++ case 13: ++ case 14: ++ case 15: ++ case 16: ++ mce_snprintf(e->mcastatus_msg, "MemCtrl: "); ++ if (EXTRACT(status, 27, 27)) ++ decode_bitfield(e, status, memctrl_mc13); ++ else ++ decode_bitfield(e, status, mc_bits); ++ break; ++ } ++ ++ /* ++ * Memory error specific code. 
Returns if the error is not a MC one ++ */ ++ ++ /* Check if the error is at the memory controller */ ++ if ((mca >> 7) != 1) ++ return; ++ ++ /* Ignore unless this is an corrected extended error from an iMC bank */ ++ if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) || ++ !test_prefix(7, status & 0xefff)) ++ return; ++ ++ /* ++ * Parse the reported channel and ranks ++ */ ++ ++ chan = EXTRACT(status, 0, 3); ++ if (chan == 0xf) ++ return; ++ ++ mce_snprintf(e->mc_location, "memory_channel=%d", chan); ++ ++ if (EXTRACT(e->misc, 62, 62)) { ++ rank0 = EXTRACT(e->misc, 46, 50); ++ if (EXTRACT(e->misc, 63, 63)) ++ rank1 = EXTRACT(e->misc, 51, 55); ++ } ++ ++ /* ++ * FIXME: The conversion from rank to dimm requires to parse the ++ * DMI tables and call failrank2dimm(). ++ */ ++ if (rank0 != -1 && rank1 != -1) ++ mce_snprintf(e->mc_location, "ranks=%d and %d", ++ rank0, rank1); ++ else if (rank0 != -1) ++ mce_snprintf(e->mc_location, "rank=%d", rank0); ++} ++ +--- rasdaemon-0.4.1.orig/mce-intel.c 2017-05-30 12:43:11.975591485 -0400 ++++ rasdaemon-0.4.1/mce-intel.c 2017-05-30 12:43:16.948531592 -0400 +@@ -408,6 +408,9 @@ if (test_prefix(11, (e->status & 0xffffL + case CPU_BROADWELL_EPEX: + broadwell_epex_decode_model(ras, e); + break; ++ case CPU_SKYLAKE_XEON: ++ skylake_xeon_decode_model(ras, e); ++ break; + default: + break; + } +--- rasdaemon-0.4.1.orig/ras-mce-handler.c 2017-05-30 12:43:16.948531592 -0400 ++++ rasdaemon-0.4.1/ras-mce-handler.c 2017-05-30 12:44:00.295009527 -0400 +@@ -54,6 +54,8 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series + [CPU_BROADWELL_EPEX] = "Broadwell EP/EX", + [CPU_KNIGHTS_LANDING] = "Knights Landing", + [CPU_KNIGHTS_MILL] = "Knights Mill", ++ [CPU_SKYLAKE] = "Skylake", ++ [CPU_SKYLAKE_XEON] = "Skylake Xeon", + }; + + static enum cputype select_intel_cputype(struct ras_events *ras) +@@ -103,6 +105,10 @@ else if (mce->model == 0x57) + return CPU_KNIGHTS_LANDING; + else if (mce->model == 0x85) + return CPU_KNIGHTS_MILL; ++ else if 
(mce->model == 0x4e || mce->model == 0x5e) ++ return CPU_SKYLAKE; ++ else if (mce->model == 0x55) ++ return CPU_SKYLAKE_XEON; + + if (mce->model > 0x1a) { + log(ALL, LOG_INFO, +--- rasdaemon-0.4.1.orig/ras-mce-handler.h 2017-05-30 12:43:11.976591473 -0400 ++++ rasdaemon-0.4.1/ras-mce-handler.h 2017-05-30 12:44:25.745703000 -0400 +@@ -49,6 +49,8 @@ enum cputype { + CPU_BROADWELL_EPEX, + CPU_KNIGHTS_LANDING, + CPU_KNIGHTS_MILL, ++ CPU_SKYLAKE, ++ CPU_SKYLAKE_XEON, + }; + + struct mce_event { +@@ -126,6 +128,7 @@ void knl_decode_model(struct ras_events + void tulsa_decode_model(struct mce_event *e); + void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e); + void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e); ++void skylake_xeon_decode_model(struct ras_events *ras, struct mce_event *e); + + /* Software defined banks */ + #define MCE_EXTENDED_BANK 128 diff --git a/SOURCES/0065-rasdaemon-Update-DIMM-labels-for-Dell-Servers.patch b/SOURCES/0065-rasdaemon-Update-DIMM-labels-for-Dell-Servers.patch new file mode 100644 index 0000000..4c08707 --- /dev/null +++ b/SOURCES/0065-rasdaemon-Update-DIMM-labels-for-Dell-Servers.patch @@ -0,0 +1,142 @@ +--- + labels/dell | 96 +++++++++++++++++++++++++++++++++++------------------------- + 1 file changed, 56 insertions(+), 40 deletions(-) + +--- rasdaemon-0.4.1.orig/labels/dell 2017-08-23 16:14:36.086652150 -0400 ++++ rasdaemon-0.4.1/labels/dell 2017-08-23 16:16:59.091057241 -0400 +@@ -4,23 +4,35 @@ + # labels are found from the silk screen on the motherboard. + # + #Vendor: <vendor-name> ++# Product: <product-name> + # Model: <model-name> + # <label>: <mc>.<top>.<mid>.<low> + # + + Vendor: Dell Inc. 
+-#### 11G #### ++# 1-socket ++ Product: PowerEdge R220, PowerEdge R330, PowerEdge T330, PowerEdge R230, PowerEdge T130, PowerEdge T30 ++ DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; ++ DIMM_A3: 0.1.0; DIMM_A4: 0.1.1; ++ ++ Product: PowerEdge T110 II, PowerEdge T20 ++ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; ++ ++ DIMM_B1: 0.0.1; DIMM_B2: 0.1.1; ++ ++ Product: PowerEdge R320, PowerEdge T320 ++ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; ++ DIMM_A4: 0.0.1; DIMM_A5: 0.1.1; DIMM_A6: 0.2.1; ++ + # 2-socket +-# PowerEdge R610 +- Model: 0K399H, 0F0XJ6 ++ Product: PowerEdge R610 + DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; DIMM_A3: 0.0.2; + DIMM_A4: 0.1.0; DIMM_A5: 0.1.1; DIMM_A6: 0.1.2; + + DIMM_B1: 1.0.0; DIMM_B2: 1.0.1; DIMM_B3: 1.0.2; + DIMM_B4: 1.1.0; DIMM_B5: 1.1.1; DIMM_B6: 1.1.2; + +-# PowerEdge T710 R710 +- Model: 01CTXG, 0N0H4P, 0MD99X, 0N047H, 0PV9DG ++ Product: PowerEdge T710, PowerEdge R710 + DIMM_A3: 0.0.0; DIMM_A2: 0.1.0; DIMM_A1: 0.2.0; + DIMM_A6: 0.0.1; DIMM_A5: 0.1.1; DIMM_A4: 0.2.1; + DIMM_A9: 0.0.2; DIMM_A8: 0.1.2; DIMM_A7: 0.2.2; +@@ -29,27 +41,7 @@ DIMM_B3: 1.0.0; DIMM_B2: 1.1.0; DIMM_B1 + DIMM_B6: 1.0.1; DIMM_B5: 1.1.1; DIMM_B4: 1.2.1; + DIMM_B9: 1.0.2; DIMM_B8: 1.1.2; DIMM_B7: 1.2.2; + +-#### 12/13G #### +-# 1-socket +-# PowerEdge R220 +- Model: 081N4V +- DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; +- DIMM_A3: 0.1.0; DIMM_A4: 0.1.1; +- +-#PowerEdge T110 II, T20 +- Model: 0PC2WT, 0PM2CW, 015TH9, 0MDHN4, 0VD5HY +- DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; +- +- DIMM_B1: 0.0.1; DIMM_B2: 0.1.1; +- +-#PowerEdge R320 T320 +- Model: 0YCV59, 0Y97HY, 07DKYR, 0VJ84C, 07MYHN, 04DMNN, 0W7H8C, 0K20G5, 0V719V, 0FDT3J +- DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; +- DIMM_A4: 0.0.1; DIMM_A5: 0.1.1; DIMM_A6: 0.2.1; +- +-# 2-socket +-# PowerEdge R620/T620 R720/xd R730/xd T630 R730 R630 T620 M620, FC620 +- Model: 0VWT90, 07NDJ2, 0F5XM3, 0PXXHP, 0X3D66, 061P35, 0H5J4J, 00W9X3, 0599V5, 0W9WXC, 0599V5, 0H21J3, 0CNCJW, 02CD1V, 0T5TFW, 0F5XM3, 0G1CNH, 05YV77, 0PDCCX, 093MW8, 0NJVT7 ++ Product: PowerEdge R620, 
PowerEdge T620, PowerEdge R720xd, PowerEdge R730xd, PowerEdge T630, PowerEdge R730, PowerEdge R630, PowerEdge T620, PowerEdge M620, PowerEdge FC620, PowerEdge M630, PowerEdge FC630 + DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0; + DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1; + DIMM_A9: 0.0.2; DIMM_A10: 0.1.2; DIMM_A11: 0.2.2; DIMM_A12: 0.3.2; +@@ -58,23 +50,38 @@ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_ + DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1; + DIMM_B9: 1.0.2; DIMM_B10: 1.1.2; DIMM_B11: 1.2.2; DIMM_B12: 1.3.2; + +-# PowerEdge M520 R420 T420 +- Model: 0NRG83, 0DW6GX, 03WPHJ, 06HTRX, 0H1Y24, 02T9N6, 0TT5P2, 0CPKXG, 03015M, 061VPC, 0PC9H0, 0K3G34, 0PC0V5, 08NVYK +- DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; +- DIMM_A4: 0.0.1; DIMM_A5: 0.1.1; DIMM_A6: 0.2.1; +- +- DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; +- DIMM_B4: 1.0.1; DIMM_B5: 1.1.1; DIMM_B6: 1.2.1; +- +-#PowerEdge FC420, M420 +- Model: 0DPJGD, 068CTP, 0MN3VC, 0417VP +- DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; ++ Product: PowerEdge M520, PowerEdge R420, PowerEdge T420 ++ DIMM_A1: 0.1.0; DIMM_A2: 0.2.0; DIMM_A3: 0.3.0; ++ DIMM_A4: 0.1.1; DIMM_A5: 0.2.1; DIMM_A6: 0.3.1; ++ ++ DIMM_B1: 1.1.0; DIMM_B2: 1.2.0; DIMM_B3: 1.3.0; ++ DIMM_B4: 1.1.1; DIMM_B5: 1.2.1; DIMM_B6: 1.3.1; + +- DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; ++ Product: PowerEdge FC420, PowerEdge M420 ++ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; ++ ++ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; ++ ++ Product: PowerEdge C6320, PowerEdge C4130 ++ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0; ++ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1; ++ ++ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0; ++ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1; ++ ++ Product: PowerEdge R430, PowerEdge T430, PowerEdge R530 ++ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0; ++ DIMM_A5: 0.0.1; DIMM_A6: 
0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1; ++ ++ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0; ++ ++ Product: PowerEdge FC430 ++ DIMM_A1: 0.1.0; DIMM_A2: 0.0.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0; ++ ++ DIMM_B1: 1.1.0; DIMM_B2: 1.0.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0; + + # 4-socket +-# # PowerEdge M820 +- Model: 0RN9TC, 0YWR73, 066N7P, 0PFG1N, 0JC2W3 ++ Product: PowerEdge M820, PowerEdge R830, PowerEdge M830, PowerEdge R930, PowerEdge FC830 + DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0; + DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1; + DIMM_A9: 0.0.2; DIMM_A10: 0.1.2; DIMM_A11: 0.2.2; DIMM_A12: 0.3.2; +@@ -90,3 +97,12 @@ DIMM_C9: 2.0.2; DIMM_C10: 2.1.2; + DIMM_D1: 3.0.0; DIMM_D2: 3.1.0; DIMM_D3: 3.2.0; DIMM_D4: 3.3.0; + DIMM_D5: 3.0.1; DIMM_D6: 3.1.1; DIMM_D7: 3.2.1; DIMM_D8: 3.3.1; + DIMM_D9: 3.0.2; DIMM_D10: 3.1.2; DIMM_D11: 3.2.2; DIMM_D12: 3.3.2; ++ ++ Product: PowerEdge FM120x4 ++ DIMM_A_A1: 0.1.0; DIMM_A_A2: 0.2.0; ++ ++ DIMM_B_A1: 1.1.0; DIMM_B_A2: 1.2.0; ++ ++ DIMM_C_A1: 2.1.0; DIMM_C_A2: 2.2.0; ++ ++ DIMM_D_A1: 3.1.0; DIMM_D_A2: 3.2.0; diff --git a/SOURCES/0066-rasdaemon-Update-DIMM-labels-for-Intel-Skylake-serve.patch b/SOURCES/0066-rasdaemon-Update-DIMM-labels-for-Intel-Skylake-serve.patch new file mode 100644 index 0000000..73a73b5 --- /dev/null +++ b/SOURCES/0066-rasdaemon-Update-DIMM-labels-for-Intel-Skylake-serve.patch @@ -0,0 +1,69 @@ +From 993b8c40bd0c09a177d52c4f41b09ef2c969fa8d Mon Sep 17 00:00:00 2001 +From: "Charles.Rose@dell.com" <Charles.Rose@dell.com> +Date: Fri, 11 Aug 2017 20:09:10 +0000 +Subject: [PATCH] rasdaemon: Update DIMM labels for Intel Skylake servers + +Update labels for Intel Skylake based Dell PowerEdge servers. 
+ +Signed-off-by: Charles Rose <charles_rose@dell.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com> +--- + labels/dell | 31 +++++++++++++++++++++++++++++++ + 1 file changed, 31 insertions(+) + +diff --git a/labels/dell b/labels/dell +index 5abcd90..58455df 100644 +--- a/labels/dell ++++ b/labels/dell +@@ -50,6 +50,13 @@ Vendor: Dell Inc. + DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1; + DIMM_B9: 1.0.2; DIMM_B10: 1.1.2; DIMM_B11: 1.2.2; DIMM_B12: 1.3.2; + ++ Product: PowerEdge R640, PowerEdge R740, PowerEdge R740xd ++ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0; ++ A7: 0.0.1; A8: 0.1.1; A9: 0.2.1; A10: 1.0.1; A11: 1.1.1; A12: 1.2.1; ++ ++ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0; ++ B7: 2.0.1; B8: 2.1.1; B9: 2.2.1; B10: 3.0.1; B11: 3.1.1; B12: 3.2.1; ++ + Product: PowerEdge M520, PowerEdge R420, PowerEdge T420 + DIMM_A1: 0.1.0; DIMM_A2: 0.2.0; DIMM_A3: 0.3.0; + DIMM_A4: 0.1.1; DIMM_A5: 0.2.1; DIMM_A6: 0.3.1; +@@ -69,6 +76,17 @@ Vendor: Dell Inc. + DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0; + DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1; + ++ Product: PowerEdge C6320p ++ A1: 0.0.0; B1: 0.1.0; C1: 0.2.0; ++ D1: 1.0.0; E1: 1.1.0; F1: 1.2.0; ++ ++ Product: PowerEdge C6420 ++ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0; ++ A7: 0.0.1; A8: 1.0.1; ++ ++ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0; ++ B7: 2.0.1; B8: 3.0.1; ++ + Product: PowerEdge R430, PowerEdge T430, PowerEdge R530 + DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0; + DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1; +@@ -106,3 +124,16 @@ Vendor: Dell Inc. 
+ DIMM_C_A1: 2.1.0; DIMM_C_A2: 2.2.0; + + DIMM_D_A1: 3.1.0; DIMM_D_A2: 3.2.0; ++ ++ Product: PowerEdge R940 ++ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0; ++ A7: 0.0.1; A8: 0.1.1; A9: 0.2.1; A10: 1.0.1; A11: 1.1.1; A12: 1.2.1; ++ ++ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0; ++ B7: 2.0.1; B8: 2.1.1; B9: 2.2.1; B10: 3.0.1; B11: 3.1.1; B12: 3.2.1; ++ ++ C1: 4.0.0; C2: 4.1.0; C3: 4.2.0; C4: 5.0.0; C5: 5.1.0; C6: 5.2.0; ++ C7: 4.0.1; C8: 4.1.1; C9: 4.2.1; C10: 5.0.1; C11: 5.1.1; C12: 5.2.1; ++ ++ D1: 6.0.0; D2: 6.1.0; D3: 6.2.0; D4: 7.0.0; D5: 7.1.0; D6: 7.2.0; ++ D7: 6.0.1; D8: 6.1.1; D9: 6.2.1; D10: 7.0.1; D11: 7.1.1; D12: 7.2.1; +-- +1.8.3.1 + diff --git a/SOURCES/0067-rasdaemon-add-support-for-non-standard-CPER-section-.patch b/SOURCES/0067-rasdaemon-add-support-for-non-standard-CPER-section-.patch new file mode 100644 index 0000000..088d0f3 --- /dev/null +++ b/SOURCES/0067-rasdaemon-add-support-for-non-standard-CPER-section-.patch @@ -0,0 +1,601 @@ +From 624d8a1d99a2f3bd06cbc537aff3cc30201ba7c2 Mon Sep 17 00:00:00 2001 +From: Tyler Baicar <tbaicar@codeaurora.org> +Date: Mon, 12 Jun 2017 16:16:04 -0600 +Subject: [PATCH 1/2] rasdaemon: add support for non standard CPER section + events + +Add support to handle the non standard CPER section kernel trace +events which cover RAS errors whose section type is unknown. 
+ +Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org> +Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com> +--- + Makefile.am | 3 + + configure.ac | 9 +++ + ras-events.c | 15 +++++ + ras-events.h | 8 +++ + ras-non-standard-handler.c | 147 +++++++++++++++++++++++++++++++++++++++++++++ + ras-non-standard-handler.h | 26 ++++++++ + ras-record.c | 59 ++++++++++++++++++ + ras-record.h | 15 +++++ + ras-report.c | 80 ++++++++++++++++++++++++ + ras-report.h | 18 +++++- + 10 files changed, 379 insertions(+), 1 deletion(-) + create mode 100644 ras-non-standard-handler.c + create mode 100644 ras-non-standard-handler.h + +diff --git a/Makefile.am b/Makefile.am +index a10e4b3..c5811e8 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -24,6 +24,9 @@ endif + if WITH_AER + rasdaemon_SOURCES += ras-aer-handler.c + endif ++if WITH_NON_STANDARD ++ rasdaemon_SOURCES += ras-non-standard-handler.c ++endif + if WITH_MCE + rasdaemon_SOURCES += ras-mce-handler.c mce-intel.c mce-amd-k8.c \ + mce-intel-p4-p6.c mce-intel-nehalem.c \ +diff --git a/configure.ac b/configure.ac +index 5af5227..31bf6bd 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -44,6 +44,15 @@ AS_IF([test "x$enable_aer" = "xyes"], [ + ]) + AM_CONDITIONAL([WITH_AER], [test x$enable_aer = xyes]) + ++AC_ARG_ENABLE([non_standard], ++ AS_HELP_STRING([--enable-non-standard], [enable NON_STANDARD events (currently experimental)])) ++ ++AS_IF([test "x$enable_non_standard" = "xyes"], [ ++ AC_DEFINE(HAVE_NON_STANDARD,1,"have UNKNOWN_SEC events collect") ++ AC_SUBST([WITH_NON_STANDARD]) ++]) ++AM_CONDITIONAL([WITH_NON_STANDARD], [test x$enable_non_standard = xyes]) ++ + AC_ARG_ENABLE([mce], + AS_HELP_STRING([--enable-mce], [enable MCE events (currently experimental)])) + +diff --git a/ras-events.c b/ras-events.c +index 0be7c3f..96aa6f1 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -29,6 +29,7 @@ + #include "libtrace/event-parse.h" + #include "ras-mc-handler.h" + #include "ras-aer-handler.h" ++#include 
"ras-non-standard-handler.h" + #include "ras-mce-handler.h" + #include "ras-extlog-handler.h" + #include "ras-record.h" +@@ -208,6 +209,10 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "ras", "extlog_mem_event", enable); + #endif + ++#ifdef HAVE_NON_STANDARD ++ rc |= __toggle_ras_mc_event(ras, "ras", "non_standard_event", enable); ++#endif ++ + free_ras: + free(ras); + return rc; +@@ -676,6 +681,16 @@ int handle_ras_events(int record_events) + "ras", "aer_event"); + #endif + ++#ifdef HAVE_NON_STANDARD ++ rc = add_event_handler(ras, pevent, page_size, "ras", "non_standard_event", ++ ras_non_standard_event_handler); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "ras", "non_standard_event"); ++#endif ++ + cpus = get_num_cpus(ras); + + #ifdef HAVE_MCE +diff --git a/ras-events.h b/ras-events.h +index 64e045a..3e1008f 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -68,6 +68,14 @@ enum hw_event_mc_err_type { + HW_EVENT_ERR_INFO, + }; + ++/* Should match the code at Kernel's include/acpi/ghes.h */ ++enum ghes_severity { ++ GHES_SEV_NO, ++ GHES_SEV_CORRECTED, ++ GHES_SEV_RECOVERABLE, ++ GHES_SEV_PANIC, ++}; ++ + /* Function prototypes */ + int toggle_ras_mc_event(int enable); + int handle_ras_events(int record_events); +diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c +new file mode 100644 +index 0000000..4c154e5 +--- /dev/null ++++ b/ras-non-standard-handler.c +@@ -0,0 +1,147 @@ ++/* ++ * Copyright (c) 2016, The Linux Foundation. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 and ++ * only version 2 as published by the Free Software Foundation. ++ ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++ * GNU General Public License for more details. ++ */ ++ ++#include <stdio.h> ++#include <stdlib.h> ++#include <string.h> ++#include <unistd.h> ++#include "libtrace/kbuffer.h" ++#include "ras-non-standard-handler.h" ++#include "ras-record.h" ++#include "ras-logger.h" ++#include "ras-report.h" ++ ++void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index) { ++ trace_seq_printf(s, "%02x%02x%02x%02x", buf[index+3], buf[index+2], buf[index+1], buf[index]); ++} ++ ++static char *uuid_le(const char *uu) /* formats 16 raw bytes as GUID text in a static buffer */ ++{ ++ static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; ++ char *p = uuid; ++ int i; ++ static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; ++ ++ for (i = 0; i < 16; i++) { ++ p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); /* cast: a negative char would print as 8 hex digits and overrun uuid[] */ ++ switch (i) { ++ case 3: ++ case 5: ++ case 7: ++ case 9: ++ *p++ = '-'; ++ break; ++ } ++ } ++ ++ *p = 0; ++ ++ return uuid; ++} ++ ++int ras_non_standard_event_handler(struct trace_seq *s, ++ struct pevent_record *record, ++ struct event_format *event, void *context) ++{ ++ int len, i, line_count; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ time_t now; ++ struct tm *tm; ++ struct ras_non_standard_event ev; ++ ++ /* ++ * Newer kernels (3.10-rc1 or upper) provide an uptime clock. ++ * On previous kernels, the way to properly generate an event would ++ * be to inject a fake one, measure its timestamp and diff it against ++ * gettimeofday. We won't do it here. Instead, let's use uptime, ++ * falling-back to the event report's time, if "uptime" clock is ++ * not available (legacy kernels). 
++ */ ++ ++ if (ras->use_uptime) ++ now = record->ts/user_hz + ras->uptime_diff; ++ else ++ now = time(NULL); ++ ++ tm = localtime(&now); ++ if (tm) ++ strftime(ev.timestamp, sizeof(ev.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ trace_seq_printf(s, "%s ", ev.timestamp); ++ ++ if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0) ++ return -1; ++ switch (val) { ++ case GHES_SEV_NO: ++ ev.severity = "Informational"; ++ break; ++ case GHES_SEV_CORRECTED: ++ ev.severity = "Corrected"; ++ break; ++ case GHES_SEV_RECOVERABLE: ++ ev.severity = "Recoverable"; ++ break; ++ default: ++ case GHES_SEV_PANIC: ++ ev.severity = "Fatal"; ++ } ++ trace_seq_printf(s, "\n %s", ev.severity); ++ ++ ev.sec_type = pevent_get_field_raw(s, event, "sec_type", record, &len, 1); ++ if(!ev.sec_type) ++ return -1; ++ trace_seq_printf(s, "\n section type: %s", uuid_le(ev.sec_type)); ++ ev.fru_text = pevent_get_field_raw(s, event, "fru_text", ++ record, &len, 1); ++ ev.fru_id = pevent_get_field_raw(s, event, "fru_id", ++ record, &len, 1); ++ trace_seq_printf(s, " fru text: %s fru id: %s ", ++ ev.fru_text, ++ uuid_le(ev.fru_id)); ++ ++ if (pevent_get_field_val(s, event, "len", record, &val, 1) < 0) ++ return -1; ++ ev.length = val; ++ trace_seq_printf(s, "\n length: %d\n", ev.length); ++ ++ ev.error = pevent_get_field_raw(s, event, "buf", record, &len, 1); ++ if(!ev.error) ++ return -1; ++ len = ev.length; ++ i = 0; ++ line_count = 0; ++ trace_seq_printf(s, " error:\n %08x: ", i); ++ while(len >= 4) { ++ print_le_hex(s, ev.error, i); ++ i+=4; ++ len-=4; ++ if(++line_count == 4) { ++ trace_seq_printf(s, "\n %08x: ", i); ++ line_count = 0; ++ } else ++ trace_seq_printf(s, " "); ++ } ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_non_standard_record(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_non_standard_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-non-standard-handler.h 
b/ras-non-standard-handler.h +new file mode 100644 +index 0000000..2b5ac35 +--- /dev/null ++++ b/ras-non-standard-handler.h +@@ -0,0 +1,26 @@ ++/* ++ * Copyright (c) 2016, The Linux Foundation. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 and ++ * only version 2 as published by the Free Software Foundation. ++ ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ */ ++ ++#ifndef __RAS_NON_STANDARD_HANDLER_H ++#define __RAS_NON_STANDARD_HANDLER_H ++ ++#include "ras-events.h" ++#include "libtrace/event-parse.h" ++ ++int ras_non_standard_event_handler(struct trace_seq *s, ++ struct pevent_record *record, ++ struct event_format *event, void *context); ++ ++void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index); ++ ++#endif +diff --git a/ras-record.c b/ras-record.c +index 3dc4493..357ab61 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -1,5 +1,6 @@ + /* + * Copyright (C) 2013 Mauro Carvalho Chehab <mchehab@redhat.com> ++ * Copyright (c) 2016, The Linux Foundation. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by +@@ -157,6 +158,57 @@ int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev) + } + #endif + ++/* ++ * Table and functions to handle ras:non standard ++ */ ++ ++#ifdef HAVE_NON_STANDARD ++static const struct db_fields non_standard_event_fields[] = { ++ { .name="id", .type="INTEGER PRIMARY KEY" }, ++ { .name="timestamp", .type="TEXT" }, ++ { .name="sec_type", .type="BLOB" }, ++ { .name="fru_id", .type="BLOB" }, ++ { .name="fru_text", .type="TEXT" }, ++ { .name="severity", .type="TEXT" }, ++ { .name="error", .type="BLOB" }, ++}; ++ ++static const struct db_table_descriptor non_standard_event_tab = { ++ .name = "non_standard_event", ++ .fields = non_standard_event_fields, ++ .num_fields = ARRAY_SIZE(non_standard_event_fields), ++}; ++ ++int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_non_standard_record) ++ return 0; ++ log(TERM, LOG_INFO, "non_standard_event store: %p\n", priv->stmt_non_standard_record); ++ ++ sqlite3_bind_text (priv->stmt_non_standard_record, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_blob (priv->stmt_non_standard_record, 2, ev->sec_type, -1, NULL); ++ sqlite3_bind_blob (priv->stmt_non_standard_record, 3, ev->fru_id, 16, NULL); ++ sqlite3_bind_text (priv->stmt_non_standard_record, 4, ev->fru_text, -1, NULL); ++ sqlite3_bind_text (priv->stmt_non_standard_record, 5, ev->severity, -1, NULL); ++ sqlite3_bind_blob (priv->stmt_non_standard_record, 6, ev->error, ev->length, NULL); ++ ++ rc = sqlite3_step(priv->stmt_non_standard_record); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do non_standard_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_non_standard_record); ++ if (rc != SQLITE_OK && rc 
!= SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset non_standard_event on sqlite: error = %d\n", rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} ++#endif ++ + #ifdef HAVE_EXTLOG + static const struct db_fields extlog_event_fields[] = { + { .name="id", .type="INTEGER PRIMARY KEY" }, +@@ -450,6 +502,13 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + &mce_record_tab); + #endif + ++#ifdef HAVE_NON_STANDARD ++ rc = ras_mc_create_table(priv, &non_standard_event_tab); ++ if (rc == SQLITE_OK) ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_non_standard_record, ++ &non_standard_event_tab); ++#endif ++ + ras->db_priv = priv; + return 0; + } +diff --git a/ras-record.h b/ras-record.h +index 5d84297..473ae40 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -1,5 +1,6 @@ + /* + * Copyright (C) 2013 Mauro Carvalho Chehab <mchehab@redhat.com> ++ * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by +@@ -56,9 +57,18 @@ struct ras_extlog_event { + unsigned short cper_data_length; + }; + ++struct ras_non_standard_event { ++ char timestamp[64]; ++ const char *sec_type, *fru_id, *fru_text; ++ const char *severity; ++ const uint8_t *error; ++ uint32_t length; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; ++struct ras_non_standard_event; + struct mce_event; + + #ifdef HAVE_SQLITE3 +@@ -77,6 +87,9 @@ struct sqlite3_priv { + #ifdef HAVE_EXTLOG + sqlite3_stmt *stmt_extlog_record; + #endif ++#ifdef HAVE_NON_STANDARD ++ sqlite3_stmt *stmt_non_standard_record; ++#endif + }; + + int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras); +@@ -84,6 +97,7 @@ int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev); + int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev); + int ras_store_mce_record(struct ras_events 
*ras, struct mce_event *ev); + int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev); ++int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -91,6 +105,7 @@ static inline int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event + static inline int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev) { return 0; }; + static inline int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev) { return 0; }; + static inline int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev) { return 0; }; ++static inline int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 0a05732..1eb9f79 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -1,3 +1,16 @@ ++/* ++ * Copyright (c) 2016, The Linux Foundation. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 and ++ * only version 2 as published by the Free Software Foundation. ++ ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ */ ++ + #include <stdio.h> + #include <string.h> + #include <unistd.h> +@@ -196,6 +209,25 @@ static int set_aer_event_backtrace(char *buf, struct ras_aer_event *ev){ + return 0; + } + ++static int set_non_standard_event_backtrace(char *buf, struct ras_non_standard_event *ev){ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if(!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "severity=%s\n" \ ++ "length=%d\n", \ ++ ev->timestamp, \ ++ ev->severity, \ ++ ev->length); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -218,6 +250,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case MCE_EVENT: + rc = set_mce_event_backtrace(buf, (struct mce_event *)ev); + break; ++ case NON_STANDARD_EVENT: ++ rc = set_non_standard_event_backtrace(buf, (struct ras_non_standard_event *)ev); ++ break; + default: + return -1; + } +@@ -345,6 +380,51 @@ aer_fail: + } + } + ++int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev){ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if(sockfd < 0){ ++ return rc; ++ } ++ ++ rc = commit_report_basic(sockfd); ++ if(rc < 0){ ++ goto non_standard_fail; ++ } ++ ++ rc = commit_report_backtrace(sockfd, NON_STANDARD_EVENT, ev); ++ if(rc < 0){ ++ goto non_standard_fail; ++ } ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-non-standard"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if(rc < strlen(buf) + 1){ ++ goto non_standard_fail; ++ } ++ ++ sprintf(buf, "REASON=%s", "Unknown CPER section problem"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if(rc < strlen(buf) + 1){ ++ goto non_standard_fail; ++ } ++ ++ rc = 0; ++ ++non_standard_fail: ++ ++ if(sockfd > 0){ ++ close(sockfd); ++ } ++ ++ return rc; ++} ++ + int 
ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){ + char buf[MAX_MESSAGE_SIZE]; + int sockfd = 0; +diff --git a/ras-report.h b/ras-report.h +index 7920cdf..c2fcf42 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -1,3 +1,16 @@ ++/* ++ * Copyright (c) 2016, The Linux Foundation. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 and ++ * only version 2 as published by the Free Software Foundation. ++ ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ */ ++ + #ifndef __RAS_REPORT_H + #define __RAS_REPORT_H + +@@ -19,7 +32,8 @@ + enum { + MC_EVENT, + MCE_EVENT, +- AER_EVENT ++ AER_EVENT, ++ NON_STANDARD_EVENT + }; + + #ifdef HAVE_ABRT_REPORT +@@ -27,12 +41,14 @@ enum { + int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev); + int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev); + int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev); ++int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev); + + #else + + static inline int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev) { return 0; }; + static inline int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev) { return 0; }; + static inline int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev) { return 0; }; ++static inline int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; }; + + #endif + +-- +1.8.3.1 + diff --git a/SOURCES/0068-rasdaemon-add-support-for-non-standard-error-decoder.patch b/SOURCES/0068-rasdaemon-add-support-for-non-standard-error-decoder.patch new file mode 100644 
index 0000000..91a6cc2 --- /dev/null +++ b/SOURCES/0068-rasdaemon-add-support-for-non-standard-error-decoder.patch @@ -0,0 +1,150 @@ +From 873e88d6ba1ce5ec97f5cc0f4f0b45dfd2026b9f Mon Sep 17 00:00:00 2001 +From: "shiju.jose@huawei.com" <shiju.jose@huawei.com> +Date: Wed, 4 Oct 2017 10:11:08 +0100 +Subject: [PATCH] rasdaemon:add support for non-standard error decoder + +This patch add support to decode the non-standard +error information. + +Signed-off-by: Shiju Jose <shiju.jose@huawei.com> +Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com> +--- + ras-non-standard-handler.c | 62 +++++++++++++++++++++++++++++++++++++++++++++- + ras-non-standard-handler.h | 10 ++++++++ + 2 files changed, 71 insertions(+), 1 deletion(-) + +diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c +index 4c154e5..21e6a76 100644 +--- a/ras-non-standard-handler.c ++++ b/ras-non-standard-handler.c +@@ -13,6 +13,7 @@ + + #include <stdio.h> + #include <stdlib.h> ++#include <stdbool.h> + #include <string.h> + #include <unistd.h> + #include "libtrace/kbuffer.h" +@@ -21,6 +22,31 @@ + #include "ras-logger.h" + #include "ras-report.h" + ++static p_ns_dec_tab * ns_dec_tab; ++static size_t dec_tab_count; ++ ++int register_ns_dec_tab(const p_ns_dec_tab tab) ++{ ++ ns_dec_tab = (p_ns_dec_tab *)realloc(ns_dec_tab, ++ (dec_tab_count + 1) * sizeof(tab)); ++ if (ns_dec_tab == NULL) { ++ printf("%s p_ns_dec_tab malloc failed", __func__); ++ return -1; ++ } ++ ns_dec_tab[dec_tab_count] = tab; ++ dec_tab_count++; ++ return 0; ++} ++ ++void unregister_ns_dec_tab(void) ++{ ++ if (ns_dec_tab) { ++ free(ns_dec_tab); ++ ns_dec_tab = NULL; ++ dec_tab_count = 0; ++ } ++} ++ + void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index) { + trace_seq_printf(s, "%02x%02x%02x%02x", buf[index+3], buf[index+2], buf[index+1], buf[index]); + } +@@ -49,16 +75,32 @@ static char *uuid_le(const char *uu) + return uuid; + } + ++static int uuid_le_cmp(const char *sec_type, const char *uuid2) 
++{ ++ static char uuid1[32]; ++ char *p = uuid1; ++ int i; ++ static const unsigned char le[16] = { ++ 3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15}; ++ ++ for (i = 0; i < 16; i++) ++ p += sprintf(p, "%.2x", sec_type[le[i]]); ++ *p = 0; ++ return strncmp(uuid1, uuid2, 32); ++} ++ + int ras_non_standard_event_handler(struct trace_seq *s, + struct pevent_record *record, + struct event_format *event, void *context) + { +- int len, i, line_count; ++ int len, i, line_count, count; + unsigned long long val; + struct ras_events *ras = context; + time_t now; + struct tm *tm; + struct ras_non_standard_event ev; ++ p_ns_dec_tab dec_tab; ++ bool dec_done = false; + + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. +@@ -133,6 +175,18 @@ int ras_non_standard_event_handler(struct trace_seq *s, + trace_seq_printf(s, " "); + } + ++ for (count = 0; count < dec_tab_count && !dec_done; count++) { ++ dec_tab = ns_dec_tab[count]; ++ for (i = 0; i < dec_tab[0].len; i++) { ++ if (uuid_le_cmp(ev.sec_type, ++ dec_tab[i].sec_type) == 0) { ++ dec_tab[i].decode(s, ev.error); ++ dec_done = true; ++ break; ++ } ++ } ++ } ++ + /* Insert data into the SGBD */ + #ifdef HAVE_SQLITE3 + ras_store_non_standard_record(ras, &ev); +@@ -145,3 +199,9 @@ int ras_non_standard_event_handler(struct trace_seq *s, + + return 0; + } ++ ++__attribute__((destructor)) ++static void ns_exit(void) ++{ ++ unregister_ns_dec_tab(); ++} +diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h +index 2b5ac35..a183d1a 100644 +--- a/ras-non-standard-handler.h ++++ b/ras-non-standard-handler.h +@@ -17,10 +17,20 @@ + #include "ras-events.h" + #include "libtrace/event-parse.h" + ++typedef struct ras_ns_dec_tab { ++ const char *sec_type; ++ int (*decode)(struct trace_seq *s, const void *err); ++ size_t len; ++} *p_ns_dec_tab; ++ + int ras_non_standard_event_handler(struct trace_seq *s, + struct pevent_record *record, + struct event_format *event, void *context); + + void 
print_le_hex(struct trace_seq *s, const uint8_t *buf, int index); + ++int register_ns_dec_tab(const p_ns_dec_tab tab); ++ ++void unregister_ns_dec_tab(void); ++ + #endif +-- +1.8.3.1 + diff --git a/SOURCES/0069-rasdaemon-add-support-for-ARM-events.patch b/SOURCES/0069-rasdaemon-add-support-for-ARM-events.patch new file mode 100644 index 0000000..c2c915b --- /dev/null +++ b/SOURCES/0069-rasdaemon-add-support-for-ARM-events.patch @@ -0,0 +1,489 @@ +From 5662e5376adcc45da43d7818c8ac1882883c18ac Mon Sep 17 00:00:00 2001 +From: Tyler Baicar <tbaicar@codeaurora.org> +Date: Tue, 12 Sep 2017 14:58:25 -0600 +Subject: [PATCH 1/2] rasdaemon: add support for ARM events + +Add support to handle the ARM kernel trace events +which cover RAS ARM processor errors. + +[V4]: fix arm_event_tab usage + +Change-Id: Ife99c97042498d5fad4d9b8e873ecfba6a47947d +Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org> +Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com> +--- + Makefile.am | 3 ++ + configure.ac | 9 ++++++ + ras-arm-handler.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + ras-arm-handler.h | 24 +++++++++++++++ + ras-events.c | 15 ++++++++++ + ras-record.c | 59 ++++++++++++++++++++++++++++++++++++ + ras-record.h | 16 ++++++++++ + ras-report.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++ + ras-report.h | 5 +++- + 9 files changed, 295 insertions(+), 1 deletion(-) + create mode 100644 ras-arm-handler.c + create mode 100644 ras-arm-handler.h + +diff --git a/Makefile.am b/Makefile.am +index 2500772..4aa5543 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -27,6 +27,9 @@ endif + if WITH_NON_STANDARD + rasdaemon_SOURCES += ras-non-standard-handler.c + endif ++if WITH_ARM ++ rasdaemon_SOURCES += ras-arm-handler.c ++endif + if WITH_MCE + rasdaemon_SOURCES += ras-mce-handler.c mce-intel.c mce-amd-k8.c \ + mce-intel-p4-p6.c mce-intel-nehalem.c \ +diff --git a/configure.ac b/configure.ac +index ecd4b2f..14fc2f2 100644 +--- a/configure.ac ++++ b/configure.ac 
+@@ -53,6 +53,15 @@ AS_IF([test "x$enable_non_standard" = "xyes"], [ + ]) + AM_CONDITIONAL([WITH_NON_STANDARD], [test x$enable_non_standard = xyes]) + ++AC_ARG_ENABLE([arm], ++ AS_HELP_STRING([--enable-arm], [enable ARM events (currently experimental)])) ++ ++AS_IF([test "x$enable_arm" = "xyes"], [ ++ AC_DEFINE(HAVE_ARM,1,"have ARM events collect") ++ AC_SUBST([WITH_ARM]) ++]) ++AM_CONDITIONAL([WITH_ARM], [test x$enable_arm = xyes]) ++ + AC_ARG_ENABLE([mce], + AS_HELP_STRING([--enable-mce], [enable MCE events (currently experimental)])) + +diff --git a/ras-arm-handler.c b/ras-arm-handler.c +new file mode 100644 +index 0000000..a76470d +--- /dev/null ++++ b/ras-arm-handler.c +@@ -0,0 +1,90 @@ ++/* ++ * Copyright (c) 2016, The Linux Foundation. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 and ++ * only version 2 as published by the Free Software Foundation. ++ ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ */ ++ ++#include <stdio.h> ++#include <stdlib.h> ++#include <string.h> ++#include <unistd.h> ++#include "libtrace/kbuffer.h" ++#include "ras-arm-handler.h" ++#include "ras-record.h" ++#include "ras-logger.h" ++#include "ras-report.h" ++ ++int ras_arm_event_handler(struct trace_seq *s, ++ struct pevent_record *record, ++ struct event_format *event, void *context) ++{ ++ unsigned long long val; ++ struct ras_events *ras = context; ++ time_t now; ++ struct tm *tm; ++ struct ras_arm_event ev; ++ ++ /* ++ * Newer kernels (3.10-rc1 or upper) provide an uptime clock. ++ * On previous kernels, the way to properly generate an event would ++ * be to inject a fake one, measure its timestamp and diff it against ++ * gettimeofday. We won't do it here. 
Instead, let's use uptime, ++ * falling-back to the event report's time, if "uptime" clock is ++ * not available (legacy kernels). ++ */ ++ ++ if (ras->use_uptime) ++ now = record->ts/user_hz + ras->uptime_diff; ++ else ++ now = time(NULL); ++ ++ tm = localtime(&now); ++ if (tm) ++ strftime(ev.timestamp, sizeof(ev.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ trace_seq_printf(s, "%s\n", ev.timestamp); ++ ++ if (pevent_get_field_val(s, event, "affinity", record, &val, 1) < 0) ++ return -1; ++ ev.affinity = val; ++ trace_seq_printf(s, " affinity: %d", ev.affinity); ++ ++ if (pevent_get_field_val(s, event, "mpidr", record, &val, 1) < 0) ++ return -1; ++ ev.mpidr = val; ++ trace_seq_printf(s, "\n MPIDR: 0x%llx", (unsigned long long)ev.mpidr); ++ ++ if (pevent_get_field_val(s, event, "midr", record, &val, 1) < 0) ++ return -1; ++ ev.midr = val; ++ trace_seq_printf(s, "\n MIDR: 0x%llx", (unsigned long long)ev.midr); ++ ++ if (pevent_get_field_val(s, event, "running_state", record, &val, 1) < 0) ++ return -1; ++ ev.running_state = val; ++ trace_seq_printf(s, "\n running_state: %d", ev.running_state); ++ ++ if (pevent_get_field_val(s, event, "psci_state", record, &val, 1) < 0) ++ return -1; ++ ev.psci_state = val; ++ trace_seq_printf(s, "\n psci_state: %d", ev.psci_state); ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_arm_record(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_arm_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-arm-handler.h b/ras-arm-handler.h +new file mode 100644 +index 0000000..eae10ec +--- /dev/null ++++ b/ras-arm-handler.h +@@ -0,0 +1,24 @@ ++/* ++ * Copyright (c) 2016, The Linux Foundation. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 and ++ * only version 2 as published by the Free Software Foundation. 
++ ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ */ ++ ++#ifndef __RAS_ARM_HANDLER_H ++#define __RAS_ARM_HANDLER_H ++ ++#include "ras-events.h" ++#include "libtrace/event-parse.h" ++ ++int ras_arm_event_handler(struct trace_seq *s, ++ struct pevent_record *record, ++ struct event_format *event, void *context); ++ ++#endif +diff --git a/ras-events.c b/ras-events.c +index 96aa6f1..812d712 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -30,6 +30,7 @@ + #include "ras-mc-handler.h" + #include "ras-aer-handler.h" + #include "ras-non-standard-handler.h" ++#include "ras-arm-handler.h" + #include "ras-mce-handler.h" + #include "ras-extlog-handler.h" + #include "ras-record.h" +@@ -213,6 +214,10 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "ras", "non_standard_event", enable); + #endif + ++#ifdef HAVE_ARM ++ rc |= __toggle_ras_mc_event(ras, "ras", "arm_event", enable); ++#endif ++ + free_ras: + free(ras); + return rc; +@@ -691,6 +696,16 @@ int handle_ras_events(int record_events) + "ras", "non_standard_event"); + #endif + ++#ifdef HAVE_ARM ++ rc = add_event_handler(ras, pevent, page_size, "ras", "arm_event", ++ ras_arm_event_handler); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "ras", "arm_event"); ++#endif ++ + cpus = get_num_cpus(ras); + + #ifdef HAVE_MCE +diff --git a/ras-record.c b/ras-record.c +index 357ab61..c3644cb 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -209,6 +209,58 @@ int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standar + } + #endif + ++/* ++ * Table and functions to handle ras:arm ++ */ ++ ++#ifdef HAVE_ARM ++static const struct db_fields arm_event_fields[] = { ++ { .name="id", .type="INTEGER PRIMARY KEY" }, ++ { .name="timestamp", 
.type="TEXT" }, ++ { .name="error_count", .type="INTEGER" }, ++ { .name="affinity", .type="INTEGER" }, ++ { .name="mpidr", .type="INTEGER" }, ++ { .name="running_state", .type="INTEGER" }, ++ { .name="psci_state", .type="INTEGER" }, ++}; ++ ++static const struct db_table_descriptor arm_event_tab = { ++ .name = "arm_event", ++ .fields = arm_event_fields, ++ .num_fields = ARRAY_SIZE(arm_event_fields), ++}; ++ ++int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_arm_record) ++ return 0; ++ log(TERM, LOG_INFO, "arm_event store: %p\n", priv->stmt_arm_record); ++ ++ sqlite3_bind_text (priv->stmt_arm_record, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_int (priv->stmt_arm_record, 2, ev->error_count); ++ sqlite3_bind_int (priv->stmt_arm_record, 3, ev->affinity); ++ sqlite3_bind_int (priv->stmt_arm_record, 4, ev->mpidr); ++ sqlite3_bind_int (priv->stmt_arm_record, 5, ev->running_state); ++ sqlite3_bind_int (priv->stmt_arm_record, 6, ev->psci_state); ++ ++ rc = sqlite3_step(priv->stmt_arm_record); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do arm_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_arm_record); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset arm_event on sqlite: error = %d\n", ++ rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} ++#endif ++ + #ifdef HAVE_EXTLOG + static const struct db_fields extlog_event_fields[] = { + { .name="id", .type="INTEGER PRIMARY KEY" }, +@@ -509,6 +561,13 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + &non_standard_event_tab); + #endif + ++#ifdef HAVE_ARM ++ rc = ras_mc_create_table(priv, &arm_event_tab); ++ if (rc == SQLITE_OK) ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_arm_record, ++ &arm_event_tab); ++#endif ++ + ras->db_priv = priv; + return 0; + } +diff --git 
a/ras-record.h b/ras-record.h +index 473ae40..12c2218 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -65,10 +65,21 @@ struct ras_non_standard_event { + uint32_t length; + }; + ++struct ras_arm_event { ++ char timestamp[64]; ++ int32_t error_count; ++ int8_t affinity; ++ int64_t mpidr; ++ int64_t midr; ++ int32_t running_state; ++ int32_t psci_state; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; + struct ras_non_standard_event; ++struct ras_arm_event; + struct mce_event; + + #ifdef HAVE_SQLITE3 +@@ -90,6 +101,9 @@ struct sqlite3_priv { + #ifdef HAVE_NON_STANDARD + sqlite3_stmt *stmt_non_standard_record; + #endif ++#ifdef HAVE_ARM ++ sqlite3_stmt *stmt_arm_record; ++#endif + }; + + int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras); +@@ -98,6 +112,7 @@ int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev); + int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev); + int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev); + int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev); ++int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -106,6 +121,7 @@ static inline int ras_store_aer_event(struct ras_events *ras, struct ras_aer_eve + static inline int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev) { return 0; }; + static inline int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev) { return 0; }; + static inline int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; }; ++static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 1eb9f79..d4beee0 100644 +--- a/ras-report.c ++++ 
b/ras-report.c +@@ -228,6 +228,33 @@ static int set_non_standard_event_backtrace(char *buf, struct ras_non_standard_e + return 0; + } + ++static int set_arm_event_backtrace(char *buf, struct ras_arm_event *ev){ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if(!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "error_count=%d\n" \ ++ "affinity=%d\n" \ ++ "mpidr=0x%lx\n" \ ++ "midr=0x%lx\n" \ ++ "running_state=%d\n" \ ++ "psci_state=%d\n", \ ++ ev->timestamp, \ ++ ev->error_count, \ ++ ev->affinity, \ ++ ev->mpidr, \ ++ ev->midr, \ ++ ev->running_state, \ ++ ev->psci_state); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -253,6 +280,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case NON_STANDARD_EVENT: + rc = set_non_standard_event_backtrace(buf, (struct ras_non_standard_event *)ev); + break; ++ case ARM_EVENT: ++ rc = set_arm_event_backtrace(buf, (struct ras_arm_event *)ev); ++ break; + default: + return -1; + } +@@ -425,6 +455,51 @@ non_standard_fail: + return rc; + } + ++int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev){ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if(sockfd < 0){ ++ return rc; ++ } ++ ++ rc = commit_report_basic(sockfd); ++ if(rc < 0){ ++ goto arm_fail; ++ } ++ ++ rc = commit_report_backtrace(sockfd, ARM_EVENT, ev); ++ if(rc < 0){ ++ goto arm_fail; ++ } ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-arm"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if(rc < strlen(buf) + 1){ ++ goto arm_fail; ++ } ++ ++ sprintf(buf, "REASON=%s", "ARM CPU report problem"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if(rc < strlen(buf) + 1){ ++ goto arm_fail; ++ } ++ ++ rc = 0; ++ ++arm_fail: ++ ++ if(sockfd > 0){ ++ close(sockfd); ++ } ++ ++ return 
rc; ++} ++ + int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){ + char buf[MAX_MESSAGE_SIZE]; + int sockfd = 0; +diff --git a/ras-report.h b/ras-report.h +index c2fcf42..6c466f5 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -33,7 +33,8 @@ enum { + MC_EVENT, + MCE_EVENT, + AER_EVENT, +- NON_STANDARD_EVENT ++ NON_STANDARD_EVENT, ++ ARM_EVENT + }; + + #ifdef HAVE_ABRT_REPORT +@@ -42,6 +43,7 @@ int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev); + int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev); + int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev); + int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev); ++int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev); + + #else + +@@ -49,6 +51,7 @@ static inline int ras_report_mc_event(struct ras_events *ras, struct ras_mc_even + static inline int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev) { return 0; }; + static inline int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev) { return 0; }; + static inline int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; }; ++static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev) { return 0; }; + + #endif + +-- +1.8.3.1 + diff --git a/SOURCES/0070-rasdaemon-ARM-fully-initialize-ras_arm_event.patch b/SOURCES/0070-rasdaemon-ARM-fully-initialize-ras_arm_event.patch new file mode 100644 index 0000000..34a7d34 --- /dev/null +++ b/SOURCES/0070-rasdaemon-ARM-fully-initialize-ras_arm_event.patch @@ -0,0 +1,27 @@ +commit 1b23bf7d97bacd1d00adb4404dfc5004df394358 +Author: Aristeu Rozanski <arozansk@redhat.com> +Date: Fri Feb 2 10:15:25 2018 -0500 + + ARM: initialize with 0 unused ras_arm_event members + + Issue found by covscan: + + 1. 
rasdaemon-0.4.1/ras-arm-handler.c:32: var_decl: Declaring variable "ev" without initializer. + 16. rasdaemon-0.4.1/ras-arm-handler.c:81: uninit_use_in_call: Using uninitialized value "ev.error_count" when calling "ras_store_arm_record". + 23. rasdaemon-0.4.1/ras-record.c:243:2: read_parm_fld: Reading a parameter field. + + Signed-off-by: Aristeu Rozanski <arozansk@redhat.com> + +diff --git a/ras-arm-handler.c b/ras-arm-handler.c +index a76470d..2f170e2 100644 +--- a/ras-arm-handler.c ++++ b/ras-arm-handler.c +@@ -31,6 +31,8 @@ int ras_arm_event_handler(struct trace_seq *s, + struct tm *tm; + struct ras_arm_event ev; + ++ memset(&ev, 0, sizeof(ev)); ++ + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. + * On previous kernels, the way to properly generate an event would diff --git a/SOURCES/rasdaemon-dont_use_memerror_log_enable_on_knl.patch b/SOURCES/rasdaemon-dont_use_memerror_log_enable_on_knl.patch new file mode 100644 index 0000000..da6cadc --- /dev/null +++ b/SOURCES/rasdaemon-dont_use_memerror_log_enable_on_knl.patch @@ -0,0 +1,24 @@ +diff --git a/mce-intel.c b/mce-intel.c +index bf68d9b..80e4b6f 100644 +--- a/mce-intel.c ++++ b/mce-intel.c +@@ -470,7 +470,6 @@ int set_intel_imc_log(enum cputype cputype, unsigned ncpus) + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: + case CPU_HASWELL_EPEX: +- case CPU_KNIGHTS_LANDING: + msr = 0x17f; /* MSR_ERROR_CONTROL */ + bit = 0x2; /* MemError Log Enable */ + break; +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index b875512..f930fd1 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -228,7 +228,6 @@ int register_mce_handler(struct ras_events *ras, unsigned ncpus) + case CPU_SANDY_BRIDGE_EP: + case CPU_IVY_BRIDGE_EPEX: + case CPU_HASWELL_EPEX: +- case CPU_KNIGHTS_LANDING: + set_intel_imc_log(mce->cputype, ncpus); + default: + break; diff --git a/SPECS/rasdaemon.spec b/SPECS/rasdaemon.spec new file mode 100644 index 0000000..6a72dfb --- /dev/null +++ b/SPECS/rasdaemon.spec @@ 
-0,0 +1,302 @@ +%define _hardened_build 1 + +Name: rasdaemon +Version: 0.4.1 +Release: 32%{?dist} +Summary: Utility to receive RAS error tracings +Group: Applications/System +License: GPLv2 +URL: https://pagure.io/rasdaemon +Source0: http://mchehab.fedorapeople.org/%{name}-%{version}.tar.bz2 + +ExclusiveArch: %{ix86} x86_64 aarch64 %{power64} +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) +BuildRequires: autoconf, automake, gettext-devel, libtool, sqlite-devel +Requires: hwdata, perl-DBD-SQLite +%ifnarch %{arm} +%ifnarch %{power64} +Requires: dmidecode +%endif +%endif + +Requires(post): systemd-units +Requires(preun): systemd-units +Requires(postun): systemd-units + +Patch1: 0001-ras-mc-ctl-Improve-error-summary-to-show-label-and-m.patch +Patch2: 0002-ras-record-make-the-code-more-generic.patch +Patch3: 0003-ras-record-rename-stmt-to-stmt_mc_event.patch +Patch4: 0004-ras-record-reorder-functions.patch +Patch5: 0005-ras-record-Make-the-code-easier-to-add-support-for-o.patch +Patch6: 0006-Add-support-to-record-AER-events.patch +Patch7: 0007-Add-support-to-store-MCE-events-at-the-database.patch +Patch8: 0008-ras-mc-ctl-add-summary-for-MCE-and-PCIe-AER-errors.patch +Patch9: 0009-ras-mc-ctl-report-errors-also-for-PCIe-AER-and-MCE.patch +Patch10: 0010-ras-mc-ctl-Fix-the-name-of-the-error-table-data.patch +Patch11: 0013-ras-mc-ctl-Improve-parser.patch +Patch12: 0014-ras-mc-ctl-Fix-label-register-with-2-layers.patch +Patch13: 0015-Add-an-example-of-labels-file.patch +Patch14: 0017-ras-mc-ctl-Fix-the-DIMM-layout-display.patch +Patch15: 0019-ras-mc-ctl-remove-completely-use-of-modprobe.patch +Patch16: 0022-mce-amd-k8.c-fix-a-warning.patch +Patch17: 0023-add-abrt-suppport-for-rasdaemon.patch +Patch18: 0026-rasdaemon-Add-record-option-to-rasdaemon-man-page.patch +Patch19: 0027-ras-mc-ctl-Print-useful-message-when-run-without-ras.patch +Patch20: 0028-Make-paths-in-the-systemd-services-configurable.patch +Patch21: 0031-Correct-ABRT-report-data.patch 
+Patch22: 0032-rasdaemon-handle-failures-of-snprintf.patch +Patch23: 0033-rasdaemon-correct-range-while-parsing-top-middle-and.patch +Patch24: 0034-rasdaemon-enable-recording-by-default.patch +Patch25: 0035-eMCA-support.patch +Patch26: 0036-rasdaemon-fix-some-errors-in-sqlite.patch +Patch27: 0037-rasdaemon-sqlite-truncates-some-MCE-fields-to-32-bit.patch +Patch28: 0038-rasdaemon-fix-mce-numfield-decoded-error.patch +Patch29: 0039-rasdaemon-do-not-assume-dimmX-directories-will-be-pr.patch +Patch30: 0040-rasdaemon-add-more-dell-labels.patch +Patch31: 0041-rasdaemon-add-support-for-Haswell.patch +Patch32: 0042-rasdaemon-decode-new-simple-error-code-number-6.patch +Patch33: 0043-rasdaemon-Add-missing-entry-to-Ivy-Bridge-memory-con.patch +Patch34: 0044-rasdaemon-Identify-Ivy-Bridge-properly.patch +Patch35: 0045-rasdaemon-add-support-for-Broadwell.patch +Patch36: 0046-rasdaemon-add-support-for-Knights-Landing.patch +Patch37: 0047-rasdaemon-properly-pring-message-strings-in-decode_b.patch +Patch38: 0048-rasdaemon-add-missing-semicolon-in-hsw_decode_model.patch +Patch39: 0049-rasdaemon-enable-IMC-status-usage-for-Haswell-E.patch +Patch40: 0050-rasdaemon-make-sure-the-error-is-valid-before-handli.patch +Patch41: 0051-rasdaemon-add-support-to-match-the-machine-by-system.patch +Patch42: 0052-rasdaemon-add-internal-errors-of-IA32_MC4_STATUS-for.patch +Patch43: 0053-rasdaemon-remove-a-space-from-mcgstatus_msg.patch +Patch44: 0054-rasdaemon-unnecessary-comma-for-empty-mc_location-st.patch +Patch45: 0055-rasdaemon-use-MCA-error-msg-as-error_msg.patch +Patch46: 0056-x86-rasdaemon-Add-support-to-log-Local-Machine-Check.patch +Patch47: 0057-rasdaemon-add-support-for-haswell-ex.patch +Patch48: 0058-rasdaemon-fix-typos-on-ras-mc-ctl-man-page.patch +Patch49: 0059-rasdaemon-Add-support-for-Knights-Landing-processor.patch +Patch50: 0060-mce-intel-knl-Fix-CodingStyle.patch +Patch51: 0061-Add-Broadwell-DE-MSCOD-values.patch +Patch52: 0062-Add-Broadwell-EP-EX-MSCOD-values.patch +# Patch53 
was submitted upstream but not merged yet +Patch53: rasdaemon-dont_use_memerror_log_enable_on_knl.patch +Patch54: 0063-add_support_for_knights_mill.patch +Patch55: 0064-add_support_for_skylake.patch +Patch56: 0065-rasdaemon-Update-DIMM-labels-for-Dell-Servers.patch +Patch57: 0066-rasdaemon-Update-DIMM-labels-for-Intel-Skylake-serve.patch +Patch58: 0067-rasdaemon-add-support-for-non-standard-CPER-section-.patch +Patch59: 0068-rasdaemon-add-support-for-non-standard-error-decoder.patch +Patch60: 0069-rasdaemon-add-support-for-ARM-events.patch +Patch61: 0070-rasdaemon-ARM-fully-initialize-ras_arm_event.patch + +%description +%{name} is a RAS (Reliability, Availability and Serviceability) logging tool. +It currently records memory errors, using the EDAC tracing events. +EDAC is a set of drivers in the Linux kernel that handle detection of ECC errors +from memory controllers for most chipsets on i386 and x86_64 architectures. +EDAC drivers for other architectures like arm also exist. +This userspace component consists of an init script which makes sure +EDAC drivers and DIMM labels are loaded at system startup, as well as +a utility for reporting current error counts from the EDAC sysfs files. 
+ +%prep +%setup -q +%patch1 -p1 +%patch2 -p1 +%patch3 -p1 +%patch4 -p1 +%patch5 -p1 +%patch6 -p1 +%patch7 -p1 +%patch8 -p1 +%patch9 -p1 +%patch10 -p1 +%patch11 -p1 +%patch12 -p1 +%patch13 -p1 +%patch14 -p1 +%patch15 -p1 +%patch16 -p1 +%patch17 -p1 +%patch18 -p1 +%patch19 -p1 +%patch20 -p1 +%patch21 -p1 +%patch22 -p1 +%patch23 -p1 +%patch24 -p1 +%patch25 -p1 +%patch26 -p1 +%patch27 -p1 +%patch28 -p1 +%patch29 -p1 +%patch30 -p1 +%patch31 -p1 +%patch32 -p1 +%patch33 -p1 +%patch34 -p1 +%patch35 -p1 +%patch36 -p1 +%patch37 -p1 +%patch38 -p1 +%patch39 -p1 +%patch40 -p1 +%patch41 -p1 +%patch42 -p1 +%patch43 -p1 +%patch44 -p1 +%patch45 -p1 +%patch46 -p1 +%patch47 -p1 +%patch48 -p1 +%patch49 -p1 +%patch50 -p1 +%patch51 -p1 +%patch52 -p1 +%patch53 -p1 +%patch54 -p1 +%patch55 -p1 +%patch56 -p1 +%patch57 -p1 +%patch58 -p1 +%patch59 -p1 +%patch60 -p1 +%patch61 -p1 + +%build +autoreconf -vfi +%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-arm +make %{?_smp_mflags} + +%install +make install DESTDIR=%{buildroot} +install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service +install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service +install -D -p -m 0655 labels/* %{buildroot}%{_sysconfdir}/ras/dimm_labels.d +rm INSTALL %{buildroot}/usr/include/*.h + +%clean +rm -rf %{buildroot} + +%files +%defattr(-,root,root) +%doc AUTHORS ChangeLog COPYING README TODO +%{_sbindir}/rasdaemon +%{_sbindir}/ras-mc-ctl +%{_mandir}/*/* +%{_unitdir}/*.service +%{_sharedstatedir}/rasdaemon +%{_sysconfdir}/ras/dimm_labels.d + +%changelog +* Fri Feb 02 2018 Aristeu Rozanski <aris@redhat.com> 0.4.1-32.el7 +- Fixed covscan error [1520602] + +* Wed Jan 24 2018 Aristeu Rozanski <aris@redhat.com> 0.4.1-31.el7 +- Added ARM support [1520602] + +* Thu Oct 19 2017 Aristeu Rozanski <aris@redhat.com> 0.4.1-30.el7 +- Updated project url [1502400] + +* Wed Aug 23 2017 Aristeu Rozanski <aris@redhat.com> 0.4.1-29.el7 +- Updating 
Dell labels [1458938] + +* Tue May 30 2017 Aristeu Rozanski <aris@redhat.com> 0.4.1-28.el7 +- Bump release [1448113] + +* Tue May 30 2017 Aristeu Rozanski <aris@redhat.com> 0.4.1-28.el7 +- Identify Knights Mill systems as such [1448113] + +* Mon May 8 2017 Aristeu Rozanski <aris@redhat.com> 0.4.1-27.el7 +- Fixed error found by covscan in the last patch [1377467] + +* Tue Apr 11 2017 Aristeu Rozanski <aris@redhat.com> 0.4.1-26.el7 +- add support for Skylake client and server [1377467] + +* Wed Mar 22 2017 Aristeu Rozanski <aris@redhat.com> 0.4.1-25.el7 +- add support for Knights Mill [1433862] + +* Wed Aug 24 2016 Aristeu Rozanski <aris@redhat.com> 0.4.1-24.el7 +- don't use MemError Log Enable on Knights Landing [1273326] + +* Fri Apr 15 2016 Aristeu Rozanski <aris@redhat.com> 0.4.1-23.el7 +- add Broadwell DE/EP/EX MSCOD values [1299512] + +* Mon Feb 08 2016 Aristeu Rozanski <aris@redhat.com> 0.4.1-22.el7 +- add missing upstream patches for Knights Landing [1273326] +- fix documentation typos [1247562] + +* Thu Dec 03 2015 Aristeu Rozanski <aris@redhat.com> 0.4.1-21.el7 +- add support for Knights Landing [1273326] + +* Wed Sep 30 2015 Aristeu Rozanski <aris@redhat.com> 0.4.1-20.el7 +- add support for Haswell EP/EX [1267137] + +* Mon Jul 27 2015 Aristeu Rozanski <aris@redhat.com> 0.4.1-19.el7 +- pull latest fixes by Seiichi Ikarashi from upstream [1243941] + +* Mon Jul 27 2015 Aristeu Rozanski <aris@redhat.com> 0.4.1-18.el7 +- don't depend on dmidecode on ppc64, fix typo [1244593] + +* Wed Jul 22 2015 Aristeu Rozanski <aris@redhat.com> 0.4.1-17.el7 +- don't depend on dmidecode on ppc64 [1244593] + +* Wed Jul 08 2015 Aristeu Rozanski <aris@redhat.com> 0.4.1-16.el7 +- allow label files to specify by system product name [1168340] + +* Wed Jun 03 2015 Aristeu Rozanski <aris@redhat.com> 0.4.1-15.el7 +- add support to Haswell and newer processors [1221912] + +* Tue Dec 16 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-14.el7 +- properly install the labels so it can be 
packaged [1073090] + +* Tue Dec 02 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-13.el7 +- updated labels patch to the new version submitted upstream [1073090] + +* Tue Nov 25 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-12.el7 +- fix on how sysfs tree is parsed for DIMMs [1073090] +- include new Dell labels [1073090] + +* Fri Oct 10 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-11.el7 +- don't require dmidecode for ppc64le [1151385] + +* Fri Aug 22 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-10.el7 +- use power64 macro instead, we have a driver enabled on ppc64 [1125663] + +* Mon Aug 18 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-9.el7 +- eMCA support [1085519] +- enable ppc64le [1125663] + +* Mon Jun 09 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-8.el7 +- Revert patch in 0.4.1-7.el7, replaced by a better patch +- Fix sizeof() usage on pointer [1035742] +- Added macro to build the package with stack protector [1092558] +- Handle failures of snprintf() [1035741] +- Fix range checking on signed char variables [1035746] +- Added aarch64 as architecture [1070973] +- Start recording by default [1117366] [1117367] + +* Fri Jan 17 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-7.el7 +- Fixed rasdaemon.service executable path [1043478] + +* Fri Dec 27 2013 Daniel Mach <dmach@redhat.com> - 0.4.1-6 +- Mass rebuild 2013-12-27 + +* Tue Aug 20 2013 Aristeu Rozanski <aris@redhat.com> 0.4.1-5.el7 +- Applied Jarod Wilson fixes required to pass rpmlint tests + +* Thu Aug 15 2013 Aristeu Rozanski <aris@redhat.com> 0.4.1-4.el7 +- Rebuild + +* Sun Jun 2 2013 Peter Robinson <pbrobinson@fedoraproject.org> 0.4.1-3 +- ARM has EDMA drivers (currently supported in Calxeda highbank) + +* Tue May 28 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.1-2 +- Fix the name of perl-DBD-SQLite package + +* Tue May 28 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.1-1 +- Updated to version 0.4.1 which contains some bug fixes + +* Mon May 27 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 
0.4.0-1 +- Updated to version 0.4.0 and added support for mce, aer and sqlite3 storage + +* Mon May 20 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.3.0-1 +- Package created +