From e4bdcb77308e795a58ba030a7c27d11a62e98515 Mon Sep 17 00:00:00 2001 From: Alaa Hleihel Date: Tue, 19 May 2020 07:48:36 -0400 Subject: [PATCH 215/312] [netdrv] net/mlx5: E-Switch, Increase number of chains and priorities Message-id: <20200519074934.6303-6-ahleihel@redhat.com> Patchwork-id: 310507 Patchwork-instance: patchwork O-Subject: [RHEL8.3 BZ 1663246 05/63] net/mlx5: E-Switch, Increase number of chains and priorities Bugzilla: 1663246 RH-Acked-by: Marcelo Leitner RH-Acked-by: Jarod Wilson RH-Acked-by: John Linville RH-Acked-by: Ivan Vecera RH-Acked-by: Tony Camuso RH-Acked-by: Kamal Heib Bugzilla: http://bugzilla.redhat.com/1663246 Upstream: v5.6-rc1 commit 278d51f24330718aefd7fe86996a6da66fd345e7 Author: Paul Blakey Date: Wed Nov 20 15:06:19 2019 +0200 net/mlx5: E-Switch, Increase number of chains and priorities Increase the number of chains and priorities to support the whole range available in tc. We use unmanaged tables and ignore flow level to create more tables than what we declared to fs_core steering, and we manage the connections between the tables themselves. To support that we need FW with ignore_flow_level capability. Otherwise the old behaviour will be used, where we are limited by the number of levels we declared (4 chains, 16 prios). Signed-off-by: Paul Blakey Reviewed-by: Roi Dayan Reviewed-by: Oz Shlomo Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed Signed-off-by: Alaa Hleihel Signed-off-by: Frantisek Hrbata --- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 +- .../mellanox/mlx5/core/eswitch_offloads_chains.c | 238 ++++++++++++++++++++- .../mellanox/mlx5/core/eswitch_offloads_chains.h | 3 + 3 files changed, 232 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index b8db12635730..7c33ce7ec074 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -151,7 +151,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, if (attr->flags & MLX5_ESW_ATTR_FLAG_SLOW_PATH) { flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; - dest[i].ft = esw->fdb_table.offloads.slow_fdb; + dest[i].ft = mlx5_esw_chains_get_tc_end_ft(esw); i++; } else if (attr->dest_chain) { flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; @@ -275,6 +275,7 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw, if (attr->outer_match_level != MLX5_MATCH_NONE) spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS; + flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; rule = mlx5_add_flow_rules(fast_fdb, spec, &flow_act, dest, i); if (IS_ERR(rule)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c index 589b94df252a..d569969afd9d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c @@ -16,6 +16,10 @@ #define esw_chains_ht(esw) (esw_chains_priv(esw)->chains_ht) #define esw_prios_ht(esw) (esw_chains_priv(esw)->prios_ht) #define fdb_pool_left(esw) (esw_chains_priv(esw)->fdb_left) +#define tc_slow_fdb(esw) ((esw)->fdb_table.offloads.slow_fdb) +#define tc_end_fdb(esw) (esw_chains_priv(esw)->tc_end_fdb) +#define fdb_ignore_flow_level_supported(esw) \ + (MLX5_CAP_ESW_FLOWTABLE_FDB((esw)->dev, ignore_flow_level)) #define ESW_OFFLOADS_NUM_GROUPS 4 @@ -39,6 +43,8 @@ struct mlx5_esw_chains_priv { /* Protects above chains_ht and prios_ht */ struct mutex lock; + struct mlx5_flow_table *tc_end_fdb; + int fdb_left[ARRAY_SIZE(ESW_POOLS)]; }; @@ -50,6 +56,7 @@ struct fdb_chain { int ref; struct mlx5_eswitch *esw; + struct list_head prios_list; }; struct fdb_prio_key { @@ -60,6 +67,7 @@ struct fdb_prio_key { struct fdb_prio { struct rhash_head node; + struct list_head list; struct fdb_prio_key key; @@ -67,6 +75,9 @@ struct fdb_prio { struct fdb_chain *fdb_chain; struct mlx5_flow_table *fdb; + struct mlx5_flow_table *next_fdb; + struct mlx5_flow_group *miss_group; + struct mlx5_flow_handle *miss_rule; }; static const struct rhashtable_params chain_params = { @@ -93,6 +104,9 @@ u32 mlx5_esw_chains_get_chain_range(struct mlx5_eswitch *esw) if (!mlx5_esw_chains_prios_supported(esw)) return 1; + if (fdb_ignore_flow_level_supported(esw)) + return UINT_MAX - 1; + return FDB_TC_MAX_CHAIN; } @@ -106,11 +120,17 @@ u32 mlx5_esw_chains_get_prio_range(struct mlx5_eswitch *esw) if (!mlx5_esw_chains_prios_supported(esw)) return 1; + if (fdb_ignore_flow_level_supported(esw)) + return UINT_MAX; + return FDB_TC_MAX_PRIO; } static unsigned int mlx5_esw_chains_get_level_range(struct mlx5_eswitch *esw) { + if (fdb_ignore_flow_level_supported(esw)) + return UINT_MAX; + return FDB_TC_LEVELS_PER_PRIO; } @@ -181,13 +201,40 @@ mlx5_esw_chains_create_fdb_table(struct mlx5_eswitch *esw, sz = mlx5_esw_chains_get_avail_sz_from_pool(esw, POOL_NEXT_SIZE); if (!sz) return ERR_PTR(-ENOSPC); - ft_attr.max_fte = sz; - ft_attr.level = level; - ft_attr.prio = prio - 1; - ft_attr.autogroup.max_num_groups = ESW_OFFLOADS_NUM_GROUPS; - ns = mlx5_get_fdb_sub_ns(esw->dev, chain); + /* We use tc_slow_fdb(esw) as the table's next_ft till + * ignore_flow_level is allowed on FT creation and not just for FTEs. + * Instead caller should add an explicit miss rule if needed. + */ + ft_attr.next_ft = tc_slow_fdb(esw); + + /* The root table(chain 0, prio 1, level 0) is required to be + * connected to the previous prio (FDB_BYPASS_PATH if exists). + * We always create it, as a managed table, in order to align with + * fs_core logic. + */ + if (!fdb_ignore_flow_level_supported(esw) || + (chain == 0 && prio == 1 && level == 0)) { + ft_attr.level = level; + ft_attr.prio = prio - 1; + ns = mlx5_get_fdb_sub_ns(esw->dev, chain); + } else { + ft_attr.flags |= MLX5_FLOW_TABLE_UNMANAGED; + ft_attr.prio = FDB_TC_OFFLOAD; + /* Firmware doesn't allow us to create another level 0 table, + * so we create all unmanaged tables as level 1. + * + * To connect them, we use explicit miss rules with + * ignore_flow_level. Caller is responsible to create + * these rules (if needed). + */ + ft_attr.level = 1; + ns = mlx5_get_flow_namespace(esw->dev, MLX5_FLOW_NAMESPACE_FDB); + } + + ft_attr.autogroup.num_reserved_entries = 2; + ft_attr.autogroup.max_num_groups = ESW_OFFLOADS_NUM_GROUPS; fdb = mlx5_create_auto_grouped_flow_table_attr_(ns, &ft_attr); if (IS_ERR(fdb)) { esw_warn(esw->dev, @@ -220,6 +267,7 @@ mlx5_esw_chains_create_fdb_chain(struct mlx5_eswitch *esw, u32 chain) fdb_chain->esw = esw; fdb_chain->chain = chain; + INIT_LIST_HEAD(&fdb_chain->prios_list); err = rhashtable_insert_fast(&esw_chains_ht(esw), &fdb_chain->node, chain_params); @@ -261,6 +309,79 @@ mlx5_esw_chains_get_fdb_chain(struct mlx5_eswitch *esw, u32 chain) return fdb_chain; } +static struct mlx5_flow_handle * +mlx5_esw_chains_add_miss_rule(struct mlx5_flow_table *fdb, + struct mlx5_flow_table *next_fdb) +{ + static const struct mlx5_flow_spec spec = {}; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act act = {}; + + act.flags = FLOW_ACT_IGNORE_FLOW_LEVEL | FLOW_ACT_NO_APPEND; + act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = next_fdb; + + return mlx5_add_flow_rules(fdb, &spec, &act, &dest, 1); +} + +static int +mlx5_esw_chains_update_prio_prevs(struct fdb_prio *fdb_prio, + struct mlx5_flow_table *next_fdb) +{ + struct mlx5_flow_handle *miss_rules[FDB_TC_LEVELS_PER_PRIO + 1] = {}; + struct fdb_chain *fdb_chain = fdb_prio->fdb_chain; + struct fdb_prio *pos; + int n = 0, err; + + if (fdb_prio->key.level) + return 0; + + /* Iterate in reverse order until reaching the level 0 rule of + * the previous priority, adding all the miss rules first, so we can + * revert them if any of them fails. + */ + pos = fdb_prio; + list_for_each_entry_continue_reverse(pos, + &fdb_chain->prios_list, + list) { + miss_rules[n] = mlx5_esw_chains_add_miss_rule(pos->fdb, + next_fdb); + if (IS_ERR(miss_rules[n])) { + err = PTR_ERR(miss_rules[n]); + goto err_prev_rule; + } + + n++; + if (!pos->key.level) + break; + } + + /* Success, delete old miss rules, and update the pointers. */ + n = 0; + pos = fdb_prio; + list_for_each_entry_continue_reverse(pos, + &fdb_chain->prios_list, + list) { + mlx5_del_flow_rules(pos->miss_rule); + + pos->miss_rule = miss_rules[n]; + pos->next_fdb = next_fdb; + + n++; + if (!pos->key.level) + break; + } + + return 0; + +err_prev_rule: + while (--n >= 0) + mlx5_del_flow_rules(miss_rules[n]); + + return err; +} + static void mlx5_esw_chains_put_fdb_chain(struct fdb_chain *fdb_chain) { @@ -272,9 +393,15 @@ static struct fdb_prio * mlx5_esw_chains_create_fdb_prio(struct mlx5_eswitch *esw, u32 chain, u32 prio, u32 level) { + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_handle *miss_rule = NULL; + struct mlx5_flow_group *miss_group; struct fdb_prio *fdb_prio = NULL; + struct mlx5_flow_table *next_fdb; struct fdb_chain *fdb_chain; struct mlx5_flow_table *fdb; + struct list_head *pos; + u32 *flow_group_in; int err; fdb_chain = mlx5_esw_chains_get_fdb_chain(esw, chain); @@ -282,18 +409,65 @@ mlx5_esw_chains_create_fdb_prio(struct mlx5_eswitch *esw, return ERR_CAST(fdb_chain); fdb_prio = kvzalloc(sizeof(*fdb_prio), GFP_KERNEL); - if (!fdb_prio) { + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!fdb_prio || !flow_group_in) { err = -ENOMEM; goto err_alloc; } - fdb = mlx5_esw_chains_create_fdb_table(esw, fdb_chain->chain, prio, - level); + /* Chain's prio list is sorted by prio and level. + * And all levels of some prio point to the next prio's level 0. + * Example list (prio, level): + * (3,0)->(3,1)->(5,0)->(5,1)->(6,1)->(7,0) + * In hardware, we will we have the following pointers: + * (3,0) -> (5,0) -> (7,0) -> Slow path + * (3,1) -> (5,0) + * (5,1) -> (7,0) + * (6,1) -> (7,0) + */ + + /* Default miss for each chain: */ + next_fdb = (chain == mlx5_esw_chains_get_ft_chain(esw)) ? + tc_slow_fdb(esw) : + tc_end_fdb(esw); + list_for_each(pos, &fdb_chain->prios_list) { + struct fdb_prio *p = list_entry(pos, struct fdb_prio, list); + + /* exit on first pos that is larger */ + if (prio < p->key.prio || (prio == p->key.prio && + level < p->key.level)) { + /* Get next level 0 table */ + next_fdb = p->key.level == 0 ? p->fdb : p->next_fdb; + break; + } + } + + fdb = mlx5_esw_chains_create_fdb_table(esw, chain, prio, level); if (IS_ERR(fdb)) { err = PTR_ERR(fdb); goto err_create; } + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, + fdb->max_fte - 2); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, + fdb->max_fte - 1); + miss_group = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(miss_group)) { + err = PTR_ERR(miss_group); + goto err_group; + } + + /* Add miss rule to next_fdb */ + miss_rule = mlx5_esw_chains_add_miss_rule(fdb, next_fdb); + if (IS_ERR(miss_rule)) { + err = PTR_ERR(miss_rule); + goto err_miss_rule; + } + + fdb_prio->miss_group = miss_group; + fdb_prio->miss_rule = miss_rule; + fdb_prio->next_fdb = next_fdb; fdb_prio->fdb_chain = fdb_chain; fdb_prio->key.chain = chain; fdb_prio->key.prio = prio; @@ -305,13 +479,30 @@ mlx5_esw_chains_create_fdb_prio(struct mlx5_eswitch *esw, if (err) goto err_insert; + list_add(&fdb_prio->list, pos->prev); + + /* Table is ready, connect it */ + err = mlx5_esw_chains_update_prio_prevs(fdb_prio, fdb); + if (err) + goto err_update; + + kvfree(flow_group_in); return fdb_prio; +err_update: + list_del(&fdb_prio->list); + rhashtable_remove_fast(&esw_prios_ht(esw), &fdb_prio->node, + prio_params); err_insert: + mlx5_del_flow_rules(miss_rule); +err_miss_rule: + mlx5_destroy_flow_group(miss_group); +err_group: mlx5_esw_chains_destroy_fdb_table(esw, fdb); err_create: - kvfree(fdb_prio); err_alloc: + kvfree(fdb_prio); + kvfree(flow_group_in); mlx5_esw_chains_put_fdb_chain(fdb_chain); return ERR_PTR(err); } @@ -322,8 +513,14 @@ mlx5_esw_chains_destroy_fdb_prio(struct mlx5_eswitch *esw, { struct fdb_chain *fdb_chain = fdb_prio->fdb_chain; + WARN_ON(mlx5_esw_chains_update_prio_prevs(fdb_prio, + fdb_prio->next_fdb)); + + list_del(&fdb_prio->list); rhashtable_remove_fast(&esw_prios_ht(esw), &fdb_prio->node, prio_params); + mlx5_del_flow_rules(fdb_prio->miss_rule); + mlx5_destroy_flow_group(fdb_prio->miss_group); mlx5_esw_chains_destroy_fdb_table(esw, fdb_prio->fdb); mlx5_esw_chains_put_fdb_chain(fdb_chain); kvfree(fdb_prio); @@ -415,6 +612,12 @@ mlx5_esw_chains_put_table(struct mlx5_eswitch *esw, u32 chain, u32 prio, chain, prio, level); } +struct mlx5_flow_table * +mlx5_esw_chains_get_tc_end_ft(struct mlx5_eswitch *esw) +{ + return tc_end_fdb(esw); +} + static int mlx5_esw_chains_init(struct mlx5_eswitch *esw) { @@ -484,11 +687,21 @@ mlx5_esw_chains_open(struct mlx5_eswitch *esw) struct mlx5_flow_table *ft; int err; - /* Always open the root for fast path */ - ft = mlx5_esw_chains_get_table(esw, 0, 1, 0); + /* Create tc_end_fdb(esw) which is the always created ft chain */ + ft = mlx5_esw_chains_get_table(esw, mlx5_esw_chains_get_ft_chain(esw), + 1, 0); if (IS_ERR(ft)) return PTR_ERR(ft); + tc_end_fdb(esw) = ft; + + /* Always open the root for fast path */ + ft = mlx5_esw_chains_get_table(esw, 0, 1, 0); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + goto level_0_err; + } + /* Open level 1 for split rules now if prios isn't supported */ if (!mlx5_esw_chains_prios_supported(esw)) { ft = mlx5_esw_chains_get_table(esw, 0, 1, 1); @@ -503,6 +716,8 @@ mlx5_esw_chains_open(struct mlx5_eswitch *esw) level_1_err: mlx5_esw_chains_put_table(esw, 0, 1, 0); +level_0_err: + mlx5_esw_chains_put_table(esw, mlx5_esw_chains_get_ft_chain(esw), 1, 0); return err; } @@ -512,6 +727,7 @@ mlx5_esw_chains_close(struct mlx5_eswitch *esw) if (!mlx5_esw_chains_prios_supported(esw)) mlx5_esw_chains_put_table(esw, 0, 1, 1); mlx5_esw_chains_put_table(esw, 0, 1, 0); + mlx5_esw_chains_put_table(esw, mlx5_esw_chains_get_ft_chain(esw), 1, 0); } int diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.h index 52fadacab84d..2e13097fe348 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.h @@ -20,6 +20,9 @@ void mlx5_esw_chains_put_table(struct mlx5_eswitch *esw, u32 chain, u32 prio, u32 level); +struct mlx5_flow_table * +mlx5_esw_chains_get_tc_end_ft(struct mlx5_eswitch *esw); + int mlx5_esw_chains_create(struct mlx5_eswitch *esw); void mlx5_esw_chains_destroy(struct mlx5_eswitch *esw); -- 2.13.6