Blame SOURCES/rhbz1643997.0002-stapbpf-assembler-WIP-1-basic-parser-and-control-flo.patch

132810
From 17d4495bef5c3878bb38730ff0d849415b52641a Mon Sep 17 00:00:00 2001
132810
From: Serhei Makarov <smakarov@redhat.com>
132810
Date: Mon, 1 Oct 2018 15:38:16 -0400
132810
Subject: [PATCH 02/32] stapbpf assembler WIP #1 :: basic parser and control
132810
 flow
132810
132810
---
132810
 bpf-internal.h    |   7 +-
132810
 bpf-opt.cxx       |   2 +-
132810
 bpf-translate.cxx | 745 +++++++++++++++++++++++++++++++++++++++++++-----------
132810
 parse.h           |  14 +
132810
 4 files changed, 619 insertions(+), 149 deletions(-)
132810
132810
diff --git a/bpf-internal.h b/bpf-internal.h
132810
index 17a033533..719446db8 100644
132810
--- a/bpf-internal.h
132810
+++ b/bpf-internal.h
132810
@@ -261,9 +261,10 @@ struct program
132810
   void print(std::ostream &) const;
132810
 };
132810
 
132810
-// ??? Properly belongs to bpf_unparser but must be accessible from bpf-opt.cxx:
132810
-value *emit_literal_str(program &this_prog, insn_inserter &this_ins,
132810
-                        value *dest, int ofs, std::string &src, bool zero_pad = false);
132810
+// ??? Properly belongs to bpf_unparser but must be visible from bpf-opt.cxx:
132810
+value *emit_simple_literal_str(program &this_prog, insn_inserter &this_ins,
132810
+                               value *dest, int ofs, std::string &src,
132810
+                               bool zero_pad = false);
132810
 
132810
 inline std::ostream&
132810
 operator<< (std::ostream &o, const program &c)
132810
diff --git a/bpf-opt.cxx b/bpf-opt.cxx
132810
index 0f64d826d..c2e30a690 100644
132810
--- a/bpf-opt.cxx
132810
+++ b/bpf-opt.cxx
132810
@@ -41,7 +41,7 @@ alloc_literal_str(program &p, insn_inserter &ins, std::string &str)
132810
   int ofs = -tmp_space;
132810
 
132810
   value *frame = p.lookup_reg(BPF_REG_10);
132810
-  value *out = emit_literal_str(p, ins, frame, ofs, str, false /* don't zero pad */);
132810
+  value *out = emit_simple_literal_str(p, ins, frame, ofs, str, false /* don't zero pad */);
132810
   return out;
132810
 }
132810
 
132810
diff --git a/bpf-translate.cxx b/bpf-translate.cxx
132810
index d848c9f16..023ac6ce7 100644
132810
--- a/bpf-translate.cxx
132810
+++ b/bpf-translate.cxx
132810
@@ -8,6 +8,7 @@
132810
 
132810
 #include "config.h"
132810
 #include "bpf-internal.h"
132810
+#include "parse.h"
132810
 #include "staptree.h"
132810
 #include "elaborate.h"
132810
 #include "session.h"
132810
@@ -134,6 +135,9 @@ has_side_effects (expression *e)
132810
   return t.side_effects;
132810
 }
132810
 
132810
+/* forward declaration */
132810
+struct asm_stmt;
132810
+
132810
 struct bpf_unparser : public throwing_visitor
132810
 {
132810
   // The visitor class isn't as helpful as it might be.  As a consequence,
132810
@@ -233,10 +237,19 @@ struct bpf_unparser : public throwing_visitor
132810
   value *emit_expr(expression *e);
132810
   value *emit_bool(expression *e);
132810
   value *emit_context_var(bpf_context_vardecl *v);
132810
-  value *parse_reg(const std::string &str, embeddedcode *s);
132810
 
132810
-  // Used for copying string data:
132810
-  value *emit_copied_str(value *dest, int ofs, value *src, bool zero_pad = false);
132810
+  // Used for the embedded-code assembler:
132810
+  size_t parse_asm_stmt (embeddedcode *s, size_t start,
132810
+                           /*OUT*/asm_stmt &stmt);
132810
+  value *emit_asm_arg(const asm_stmt &stmt, const std::string &reg,
132810
+                      bool allow_imm = true);
132810
+  value *emit_asm_reg(const asm_stmt &stmt, const std::string &reg);
132810
+  void emit_asm_opcode(const asm_stmt &stmt,
132810
+                       std::map<std::string, block *> label_map);
132810
+
132810
+  // Used for string data:
132810
+  value *emit_literal_string(const std::string &str, const token *tok);
132810
+  value *emit_string_copy(value *dest, int ofs, value *src, bool zero_pad = false);
132810
 
132810
   // Used for passing long and string arguments on the stack where an address is expected:
132810
   void emit_long_arg(value *arg, int ofs, value *val);
132810
@@ -552,172 +565,604 @@ bpf_unparser::visit_block (::block *s)
132810
     emit_stmt (s->statements[i]);
132810
 }
132810
 
132810
+/* WORK IN PROGRESS: A simple eBPF assembler.
132810
+
132810
+   In order to effectively write eBPF tapset functions, we want to use
132810
+   embedded-code assembly rather than compile from SystemTap code. At
132810
+   the same time, we want to hook into stapbpf functionality to
132810
+   reserve stack memory, allocate virtual registers or signal errors.
132810
+
132810
+   The assembler syntax will probably take a couple of attempts to get
132810
+   just right. This attempt keeps things as close as possible to the
132810
+   first embedded-code assembler, with a few more features and a
132810
+   disgustingly lenient parser that allows things like
132810
+     $ this is        all one "**identifier**" believe-it!-or-not
132810
+
132810
+   Ahh for the days of 1960s FORTRAN.
132810
+
132810
+   TODO: It might make more sense to implement an assembler based on
132810
+   the syntax used in official eBPF subsystem docs. */
132810
+
132810
+/* Possible assembly statement types include:
132810
+
132810
+   <stmt> ::= label, <dest=label>;
132810
+   <stmt> ::= <code=integer opcode>, <dest=reg>, <src1=reg>,
132810
+              <off/jmp_target=off>, <imm=imm>;
132810
+
132810
+   Possible argument types include:
132810
+
132810
+   <reg> ::= <register index> | r<register index> |
132810
+             $<identifier> | $<integer constant> | $$ | <string constant>
132810
+   <imm> ::= <integer constant> | BPF_MAXSTRINGLEN
132810
+   <off> ::= <imm> | <jump label>
132810
+
132810
+*/
132810
+
132810
+struct asm_stmt {
132810
+  std::string kind;
132810
+
132810
+  unsigned code;
132810
+  std::string dest, src1;
132810
+  int64_t off, imm;
132810
+
132810
+  // metadata for jmp instructions
132810
+  bool has_fallthrough = false;
132810
+  std::string jmp_target, fallthrough;
132810
+
132810
+  token *tok;
132810
+  bool deallocate_tok = false;
132810
+  ~asm_stmt() { if (deallocate_tok) delete tok; }
132810
+};
132810
+
132810
+std::ostream&
132810
+operator << (std::ostream& o, const asm_stmt& stmt)
132810
+{
132810
+  if (stmt.kind == "label")
132810
+    o << "label, " << stmt.dest << ";";
132810
+  else if (stmt.kind == "opcode")
132810
+    {
132810
+      o << std::hex << stmt.code << ", "
132810
+        << stmt.dest << ", "
132810
+        << stmt.src1 << ", ";
132810
+      if (stmt.off != 0 || stmt.jmp_target == "")
132810
+        o << stmt.off;
132810
+      else if (stmt.off != 0) // && stmt.jmp_target != ""
132810
+        o << stmt.off << "/";
132810
+      if (stmt.jmp_target != "")
132810
+        o << "label:" << stmt.jmp_target;
132810
+      o << ", "
132810
+        << stmt.imm << ";"
132810
+        << (stmt.has_fallthrough ? " +FALLTHROUGH " + stmt.fallthrough : "");
132810
+    }
132810
+  else
132810
+    o << "<unknown asm_stmt kind '" << stmt.kind << "'>";
132810
+  return o;
132810
+}
132810
+
132810
+bool
132810
+is_numeric (const std::string &str)
132810
+{
132810
+  size_t pos = 0;
132810
+  try {
132810
+    stol(str, &pos, 0);
132810
+  } catch (std::invalid_argument &e) {
132810
+    return false;
132810
+  }
132810
+  return (pos == str.size());
132810
+}
132810
+
132810
+/* Parse an assembly statement starting from position start in code,
132810
+   then write the output in stmt. Returns a position immediately after
132810
+   the parsed statement. */
132810
+size_t
132810
+bpf_unparser::parse_asm_stmt (embeddedcode *s, size_t start,
132810
+                              /*OUT*/asm_stmt &stmt)
132810
+{
132810
+  const interned_string &code = s->code;
132810
+
132810
+ retry:
132810
+  std::vector<std::string> args;
132810
+  unsigned n = code.size();
132810
+  bool in_comment = false;
132810
+  bool in_string = false;
132810
+
132810
+  // compute token with adjusted source location for diagnostics
132810
+  source_loc adjusted_loc; // TODO: ought to create a proper copy constructor for source_loc
132810
+  adjusted_loc.file = s->tok->location.file;
132810
+  adjusted_loc.line = s->tok->location.line;
132810
+  adjusted_loc.column = s->tok->location.column;
132810
+  for (size_t pos = 0; pos < start && pos < n; pos++)
132810
+    {
132810
+      // TODO: should save adjusted_loc state between parse_asm_stmt invocations; add field?
132810
+      char c = code[pos];
132810
+      if (c == '\n')
132810
+        {
132810
+          adjusted_loc.line++;
132810
+          adjusted_loc.column = 1;
132810
+        }
132810
+      else
132810
+        adjusted_loc.column++;
132810
+    }
132810
+
132810
+  // TODO: As before, parser is extremely non-rigorous and could do
132810
+  // with some tightening in terms of the inputs it accepts.
132810
+  size_t pos;
132810
+  std::string arg = "";
132810
+  for (pos = start; pos < n; pos++)
132810
+  {
132810
+    char c = code[pos];
132810
+    char c2 = pos + 1 < n ? code [pos + 1] : 0;
132810
+    if (isspace(c))
132810
+      continue; // skip
132810
+    else if (in_comment)
132810
+      {
132810
+        if (c == '*' && c2 == '/')
132810
+          ++pos, in_comment = false;
132810
+        // else skip
132810
+      }
132810
+    else if (in_string)
132810
+      {
132810
+        // resulting string will be processed by translate_escapes()
132810
+        if (c == '"')
132810
+          arg.push_back(c), in_string = false; // include quote
132810
+        else if (c == '\\' && c2 == '"')
132810
+          ++pos, arg.push_back(c), arg.push_back(c2);
132810
+        else // accept any char, including whitespace
132810
+          arg.push_back(c);
132810
+      }
132810
+    else if (c == '/' && c2 == '*')
132810
+      ++pos, in_comment = true;
132810
+    else if (c == '"') // found a literal string
132810
+      {
132810
+        // XXX: This allows '"' inside an arg and will treat the
132810
+        // string as a sequence of weird identifier characters.  A
132810
+        // more rigorous parser would error on mixing strings and
132810
+        // regular chars.
132810
+        arg.push_back(c); // include quote
132810
+        in_string = true;
132810
+      }
132810
+    else if (c == ',') // reached end of argument
132810
+      {
132810
+        // XXX: This strips out empty args. A more rigorous parser would error.
132810
+        if (arg != "")
132810
+          args.push_back(arg);
132810
+        arg = "";
132810
+      }
132810
+    else if (c == ';') // reached end of statement
132810
+      {
132810
+        // XXX: This strips out empty args. A more rigorous parser would error.
132810
+        if (arg != "")
132810
+          args.push_back(arg);
132810
+        arg = "";
132810
+        pos++; break;
132810
+      }
132810
+    else // found (we assume) a regular char
132810
+      {
132810
+        // XXX: As before, this strips whitespace within args
132810
+        // (so '$ab', '$ a b' and '$a b' are equivalent).
132810
+        //
132810
+        // A more rigorous parser would track in_arg
132810
+        // and after_arg states and error on whitespace within args.
132810
+        arg.push_back(c);
132810
+      }
132810
+  }
132810
+  // final ';' is optional, so we watch for a trailing arg:
132810
+  if (arg != "") args.push_back(arg);
132810
+
132810
+  // handle the case with no args
132810
+  if (args.empty() && pos >= n)
132810
+    return std::string::npos; // finished parsing
132810
+  else if (args.empty())
132810
+    {
132810
+      // XXX: This skips an empty statement.
132810
+      // A more rigorous parser would error.
132810
+      start = pos;
132810
+      goto retry;
132810
+    }
132810
+
132810
+  // set token with adjusted source location
132810
+  //stmt.tok = (token *)s->tok;
132810
+  // TODO this segfaults for some reason, some data not copied?
132810
+  stmt.tok = s->tok->adjust_location(adjusted_loc);
132810
+  stmt.deallocate_tok = false; // TODO must avoid destroy-on-copy
132810
+
132810
+  std::cerr << "DEBUG GOT stmt "; // TODO
132810
+  for (unsigned k = 0; k < args.size(); k++) std::cerr << args[k] << " / ";
132810
+  std::cerr << std::endl; // TODO
132810
+  if (args[0] == "label")
132810
+    {
132810
+      if (args.size() != 2)
132810
+        throw SEMANTIC_ERROR (_("invalid bpf embeddedcode syntax"), stmt.tok);
132810
+      stmt.kind = args[0];
132810
+      stmt.dest = args[1];
132810
+    }
132810
+  else if (is_numeric(args[0]))
132810
+    {
132810
+      if (args.size() != 5) // TODO change to 4 to test err+tok
132810
+        throw SEMANTIC_ERROR (_("invalid bpf embeddedcode syntax"), stmt.tok);
132810
+      stmt.kind = "opcode";
132810
+      stmt.code = stoul(args[0], 0, 0); // TODO signal error
132810
+      stmt.dest = args[1];
132810
+      stmt.src1 = args[2];
132810
+
132810
+      bool has_jmp_target =
132810
+        BPF_CLASS(stmt.code) == BPF_JMP
132810
+        && BPF_OP(stmt.code) != BPF_EXIT
132810
+        && BPF_OP(stmt.code) != BPF_CALL;
132810
+      stmt.has_fallthrough = // only for jcond
132810
+        has_jmp_target
132810
+        && BPF_OP(stmt.code) != BPF_JA;
132810
+      // XXX: stmt.fallthrough is computed by visit_embeddedcode
132810
+
132810
+      if (has_jmp_target)
132810
+        {
132810
+          stmt.off = 0;
132810
+          stmt.jmp_target = args[3];
132810
+        }
132810
+      else if (args[3] == "BPF_MAXSTRINGLEN")
132810
+        stmt.off = BPF_MAXSTRINGLEN;
132810
+      else if (args[3] == "-")
132810
+        stmt.off = 0;
132810
+      else
132810
+        stmt.off = stol(args[3]); // TODO signal error
132810
+
132810
+      if (args[4] == "BPF_MAXSTRINGLEN")
132810
+        stmt.imm = BPF_MAXSTRINGLEN;
132810
+      else if (args[4] == "-")
132810
+        stmt.imm = 0;
132810
+      else
132810
+        stmt.imm = stol(args[4]); // TODO signal error
132810
+    }
132810
+  else
132810
+    throw SEMANTIC_ERROR (_F("unknown bpf embeddedcode operator '%s'",
132810
+                             args[0].c_str()), stmt.tok);
132810
+
132810
+  // we returned a statement, so there's more parsing to be done
132810
+  return pos;
132810
+}
132810
+
132810
+/* forward declaration */
132810
+std::string translate_escapes (const interned_string &str);
132810
+
132810
+/* Convert a <reg> or <imm> operand to a value.
132810
+   May emit code to store a string constant on the stack. */
132810
 value *
132810
-bpf_unparser::parse_reg(const std::string &str, embeddedcode *s)
132810
+bpf_unparser::emit_asm_arg (const asm_stmt &stmt, const std::string &arg,
132810
+                            bool allow_imm)
132810
 {
132810
-  if (str == "$$")
132810
+  if (arg == "$$")
132810
     {
132810
-      if (func_return.empty ())
132810
-	throw SEMANTIC_ERROR (_("no return value outside function"), s->tok);
132810
+      /* arg is a return value */
132810
+      if (func_return.empty())
132810
+        throw SEMANTIC_ERROR (_("no return value outside function"), stmt.tok);
132810
       return func_return_val.back();
132810
     }
132810
-  else if (str[0] == '$')
132810
+  else if (arg[0] == '$')
132810
     {
132810
-      std::string var = str.substr(1);
132810
+      /* assume arg is a variable */
132810
+      std::string var = arg.substr(1);
132810
       for (auto i = this_locals->begin(); i != this_locals->end(); ++i)
132810
 	{
132810
 	  vardecl *v = i->first;
132810
 	  if (var == v->unmangled_name)
132810
 	    return i->second;
132810
 	}
132810
-      throw SEMANTIC_ERROR (_("unknown variable"), s->tok);
132810
+
132810
+      /* if it's an unknown variable, allocate a temporary */
132810
+      struct vardecl *vd = new vardecl;
132810
+      vd->name = "__bpfasm__local_" + var;
132810
+      vd->unmangled_name = var;
132810
+      vd->type = pe_long;
132810
+      vd->arity = 0;
132810
+      value *reg = this_prog.new_reg();
132810
+      const locals_map::value_type v (vd, reg);
132810
+      auto ok = this_locals->insert (v);
132810
+      assert (ok.second);
132810
+      return reg;
132810
+      // TODO write a testcase
132810
     }
132810
-  else
132810
+  else if (is_numeric(arg) && allow_imm)
132810
     {
132810
-      unsigned long num = stoul(str, 0, 0);
132810
+      /* arg is an immediate constant */
132810
+      long imm = stol(arg, 0, 0);
132810
+      return this_prog.new_imm(imm);
132810
+    }
132810
+  else if (is_numeric(arg) || arg[0] == 'r')
132810
+    {
132810
+      /* arg is a register number */
132810
+      std::string reg = arg[0] == 'r' ? arg.substr(1) : arg;
132810
+      unsigned long num = stoul(reg, 0, 0);
132810
       if (num > 10)
132810
-	throw SEMANTIC_ERROR (_("invalid bpf register"), s->tok);
132810
+	throw SEMANTIC_ERROR (_F("invalid bpf register '%s'",
132810
+                                 arg.c_str()), stmt.tok);
132810
       return this_prog.lookup_reg(num);
132810
     }
132810
+  else if (arg[0] == '"')
132810
+    {
132810
+      // TODO verify correctness
132810
+      /* arg is a string constant */
132810
+      if (arg[arg.size() - 1] != '"')
132810
+        throw SEMANTIC_ERROR (_F("BUG: improper string %s",
132810
+                                 arg.c_str()), stmt.tok);
132810
+      std::string escaped_str = arg.substr(1,arg.size()-2); /* strip quotes */
132810
+      std::string str = translate_escapes(escaped_str); // TODO interned_str?
132810
+      return emit_literal_string(str, stmt.tok);
132810
+    }
132810
+  else if (arg == "BPF_MAXSTRINGLEN")
132810
+    {
132810
+      /* arg is BPF_MAXSTRINGLEN */
132810
+      if (!allow_imm)
132810
+        throw SEMANTIC_ERROR (_F("invalid bpf register '%s'",
132810
+                                 arg.c_str()), stmt.tok);
132810
+      return this_prog.new_imm(BPF_MAXSTRINGLEN);
132810
+    }
132810
+  else if (arg == "-")
132810
+    {
132810
+      /* arg is null a.k.a '0' */
132810
+      if (!allow_imm)
132810
+        throw SEMANTIC_ERROR (_F("invalid bpf register '%s'",
132810
+                                 arg.c_str()), stmt.tok);
132810
+      return this_prog.new_imm(0);
132810
+    }
132810
+  else if (allow_imm)
132810
+    throw SEMANTIC_ERROR (_F("invalid bpf argument '%s'",
132810
+                             arg.c_str()), stmt.tok);
132810
+  else
132810
+    throw SEMANTIC_ERROR (_F("invalid bpf register '%s'",
132810
+                             arg.c_str()), stmt.tok);
132810
+  
132810
+}
132810
+
132810
+value *
132810
+bpf_unparser::emit_asm_reg (const asm_stmt &stmt, const std::string &reg)
132810
+{
132810
+  return emit_asm_arg(stmt, reg, /*allow_imm=*/false);
132810
 }
132810
 
132810
 void
132810
-bpf_unparser::visit_embeddedcode (embeddedcode *s)
132810
+bpf_unparser::emit_asm_opcode (const asm_stmt &stmt,
132810
+                               std::map<std::string, block *> label_map)
132810
 {
132810
-  std::string strip;
132810
-  {
132810
-    const interned_string &code = s->code;
132810
-    unsigned n = code.size();
132810
-    bool in_comment = false;
132810
+  if (stmt.code > 0xff && stmt.code != BPF_LD_MAP)
132810
+    throw SEMANTIC_ERROR (_("invalid bpf code"), stmt.tok);
132810
 
132810
-    for (unsigned i = 0; i < n; ++i)
132810
-      {
132810
-	char c = code[i];
132810
-	if (isspace(c))
132810
-	  continue;
132810
-	if (in_comment)
132810
-	  {
132810
-	    if (c == '*' && code[i + 1] == '/')
132810
-	      ++i, in_comment = false;
132810
-	  }
132810
-	else if (c == '/' && code[i + 1] == '*')
132810
-	  ++i, in_comment = true;
132810
-	else
132810
-	  strip += c;
132810
-      }
132810
-  }
132810
+  bool r_dest = false, r_src0 = false, r_src1 = false, i_src1 = false;
132810
+  bool op_jmp = false, op_jcond = false; condition c;
132810
+  switch (BPF_CLASS (stmt.code))
132810
+    {
132810
+    case BPF_LDX:
132810
+      r_dest = r_src1 = true;
132810
+      break;
132810
+    case BPF_STX:
132810
+      r_src0 = r_src1 = true;
132810
+      break;
132810
+    case BPF_ST:
132810
+      r_src0 = i_src1 = true;
132810
+      break;
132810
+
132810
+    case BPF_ALU:
132810
+    case BPF_ALU64:
132810
+      r_dest = true;
132810
+      if (stmt.code & BPF_X)
132810
+        r_src1 = true;
132810
+      else
132810
+        i_src1 = true;
132810
+      switch (BPF_OP (stmt.code))
132810
+        {
132810
+        case BPF_NEG:
132810
+        case BPF_MOV:
132810
+          break;
132810
+        case BPF_END:
132810
+          /* X/K bit repurposed as LE/BE.  */
132810
+          i_src1 = false, r_src1 = true;
132810
+          break;
132810
+        default:
132810
+          r_src0 = true;
132810
+        }
132810
+      break;
132810
+
132810
+    case BPF_JMP:
132810
+      switch (BPF_OP (stmt.code))
132810
+        {
132810
+        case BPF_EXIT:
132810
+          // no special treatment needed
132810
+          break;
132810
+        case BPF_CALL:
132810
+          i_src1 = true;
132810
+          break;
132810
+        case BPF_JA:
132810
+          op_jmp = true;
132810
+          break;
132810
+        default:
132810
+          // XXX: assume this is a jcond op
132810
+          op_jcond = true;
132810
+          r_src0 = true;
132810
+          if (stmt.code & BPF_X)
132810
+            r_src1 = true;
132810
+          else
132810
+            i_src1 = true;
132810
+        }
132810
+
132810
+      // compute jump condition c
132810
+      switch (BPF_OP (stmt.code))
132810
+        {
132810
+        case BPF_JEQ: c = EQ; break;
132810
+        case BPF_JNE: c = NE; break;
132810
+        case BPF_JGT: c = GTU; break;
132810
+        case BPF_JGE: c = GEU; break;
132810
+        case BPF_JLT: c = LTU; break;
132810
+        case BPF_JLE: c = LEU; break;
132810
+        case BPF_JSGT: c = GT; break;
132810
+        case BPF_JSGE: c = GE; break;
132810
+        case BPF_JSLT: c = LT; break;
132810
+        case BPF_JSLE: c = LE; break;
132810
+        case BPF_JSET: c = TEST; break;
132810
+        default:
132810
+          if (op_jcond)
132810
+            throw SEMANTIC_ERROR (_("invalid branch in bpf code"), stmt.tok);
132810
+        }
132810
+      break;
132810
+
132810
+    default:
132810
+      if (stmt.code == BPF_LD_MAP)
132810
+        r_dest = true, i_src1 = true;
132810
+      else
132810
+        throw SEMANTIC_ERROR (_F("unknown opcode '%d' in bpf code",
132810
+                                stmt.code), stmt.tok);
132810
+    }
132810
 
132810
-  std::istringstream ii (strip);
132810
-  ii >> std::setbase(0);
132810
+  value *v_dest = NULL;
132810
+  if (r_dest || r_src0)
132810
+    v_dest = emit_asm_reg(stmt, stmt.dest);
132810
+  else if (stmt.dest != "0" && stmt.dest != "-")
132810
+    throw SEMANTIC_ERROR (_F("invalid register field '%s' in bpf code",
132810
+                             stmt.dest.c_str()), stmt.tok);
132810
 
132810
-  while (true)
132810
+  value *v_src1 = NULL;
132810
+  if (r_src1)
132810
+    v_src1 = emit_asm_reg(stmt, stmt.src1);
132810
+  else
132810
     {
132810
-      unsigned code;
132810
-      char s1, s2, s3, s4;
132810
-      char dest_b[256], src1_b[256];
132810
-      int64_t off, imm;
132810
+      if (stmt.src1 != "0" && stmt.src1 != "-")
132810
+        throw SEMANTIC_ERROR (_F("invalid register field '%s' in bpf code",
132810
+                                 stmt.src1.c_str()), stmt.tok);
132810
+      if (i_src1)
132810
+        v_src1 = this_prog.new_imm(stmt.imm);
132810
+      else if (stmt.imm != 0)
132810
+        throw SEMANTIC_ERROR (_("invalid immediate field in bpf code"), stmt.tok);
132810
+    }
132810
 
132810
-      ii >> code >> s1;
132810
-      ii.get(dest_b, sizeof(dest_b), ',') >> s2;
132810
-      ii.get(src1_b, sizeof(src1_b), ',') >> s3;
132810
-      ii >> off >> s4 >> imm;
132810
+  if (stmt.off != (int16_t)stmt.off)
132810
+    throw SEMANTIC_ERROR (_F("offset field '%ld' out of range in bpf code", stmt.off), stmt.tok);
132810
 
132810
-      if (ii.fail() || s1 != ',' || s2 != ',' || s3 != ',' || s4 != ',')
132810
-	throw SEMANTIC_ERROR (_("invalid bpf embeddedcode syntax"), s->tok);
132810
+  if (op_jmp)
132810
+    {
132810
+      block *target = label_map[stmt.jmp_target];
132810
+      this_prog.mk_jmp(this_ins, target);
132810
+    }
132810
+  else if (op_jcond)
132810
+    {
132810
+      if (label_map.count(stmt.jmp_target) == 0)
132810
+        throw SEMANTIC_ERROR(_F("undefined jump target '%s' in bpf code",
132810
+                                stmt.jmp_target.c_str()), stmt.tok);
132810
+      if (label_map.count(stmt.fallthrough) == 0)
132810
+        throw SEMANTIC_ERROR(_F("BUG: undefined fallthrough target '%s'",
132810
+                                stmt.fallthrough.c_str()), stmt.tok);
132810
+      block *target = label_map[stmt.jmp_target];
132810
+      block *fallthrough = label_map[stmt.fallthrough];
132810
+      this_prog.mk_jcond(this_ins, c, v_dest, v_src1, target, fallthrough);
132810
+    }
132810
+  else // regular opcode
132810
+    {
132810
+      insn *i = this_ins.new_insn();
132810
+      i->code = stmt.code;
132810
+      i->dest = (r_dest ? v_dest : NULL);
132810
+      i->src0 = (r_src0 ? v_dest : NULL);
132810
+      i->src1 = v_src1;
132810
+      i->off = stmt.off;
132810
+    }
132810
+}
132810
 
132810
-      if (code > 0xff && code != BPF_LD_MAP)
132810
-	throw SEMANTIC_ERROR (_("invalid bpf code"), s->tok);
132810
+void
132810
+bpf_unparser::visit_embeddedcode (embeddedcode *s)
132810
+{
132810
+  std::vector<asm_stmt> statements;
132810
+  asm_stmt stmt;
132810
 
132810
-      bool r_dest = false, r_src0 = false, r_src1 = false, i_src1 = false;
132810
-      switch (BPF_CLASS (code))
132810
-	{
132810
-	case BPF_LDX:
132810
-	  r_dest = r_src1 = true;
132810
-	  break;
132810
-	case BPF_STX:
132810
-	  r_src0 = r_src1 = true;
132810
-	  break;
132810
-	case BPF_ST:
132810
-	  r_src0 = i_src1 = true;
132810
-	  break;
132810
+  size_t pos = 0;
132810
+  while ((pos = parse_asm_stmt(s, pos, stmt)) != std::string::npos)
132810
+    {
132810
+      statements.push_back(stmt);
132810
+    }
132810
 
132810
-	case BPF_ALU:
132810
-	case BPF_ALU64:
132810
-	  r_dest = true;
132810
-	  if (code & BPF_X)
132810
-	    r_src1 = true;
132810
-	  else
132810
-	    i_src1 = true;
132810
-	  switch (BPF_OP (code))
132810
-	    {
132810
-	    case BPF_NEG:
132810
-	    case BPF_MOV:
132810
-	      break;
132810
-	    case BPF_END:
132810
-	      /* X/K bit repurposed as LE/BE.  */
132810
-	      i_src1 = false, r_src1 = true;
132810
-	      break;
132810
-	    default:
132810
-	      r_src0 = true;
132810
-	    }
132810
-	  break;
132810
+  // build basic block table
132810
+  std::map<std::string, block *> label_map;
132810
+  block *entry_block = this_ins.b;
132810
+  label_map[";;entry"] = entry_block;
132810
 
132810
-	case BPF_JMP:
132810
-	  switch (BPF_OP (code))
132810
-	    {
132810
-	    case BPF_EXIT:
132810
-	      break;
132810
-	    case BPF_CALL:
132810
-	      i_src1 = true;
132810
-	      break;
132810
-	    default:
132810
-	      throw SEMANTIC_ERROR (_("invalid branch in bpf code"), s->tok);
132810
-	    }
132810
-	  break;
132810
+  bool after_label = true;
132810
+  asm_stmt *after_jump = NULL;
132810
+  unsigned fallthrough_count = 0;
132810
+  for (std::vector<asm_stmt>::iterator it = statements.begin();
132810
+       it != statements.end(); it++)
132810
+    {
132810
+      stmt = *it;
132810
 
132810
-	default:
132810
-          if (code == BPF_LD_MAP)
132810
-            r_dest = true, i_src1 = true;
132810
-          else
132810
-	    throw SEMANTIC_ERROR (_("unknown opcode in bpf code"), s->tok);
132810
-	}
132810
+      if (after_jump != NULL && stmt.kind == "label")
132810
+        {
132810
+          after_jump->fallthrough = stmt.dest;
132810
+        }
132810
+      else if (after_jump != NULL)
132810
+        {
132810
+          block *b = this_prog.new_block();
132810
 
132810
-      std::string dest(dest_b);
132810
-      value *v_dest = NULL;
132810
-      if (r_dest || r_src0)
132810
-	v_dest = parse_reg(dest, s);
132810
-      else if (dest != "0")
132810
-	throw SEMANTIC_ERROR (_("invalid register field in bpf code"), s->tok);
132810
-
132810
-      std::string src1(src1_b);
132810
-      value *v_src1 = NULL;
132810
-      if (r_src1)
132810
-	v_src1 = parse_reg(src1, s);
132810
-      else
132810
-	{
132810
-	  if (src1 != "0")
132810
-	    throw SEMANTIC_ERROR (_("invalid register field in bpf code"), s->tok);
132810
-	  if (i_src1)
132810
-	    v_src1 = this_prog.new_imm(imm);
132810
-	  else if (imm != 0)
132810
-	    throw SEMANTIC_ERROR (_("invalid immediate field in bpf code"), s->tok);
132810
-	}
132810
+          // generate unique label for fallthrough edge
132810
+          std::ostringstream oss;
132810
+          oss << "fallthrough;;" << fallthrough_count++;
132810
+          std::string fallthrough_label = oss.str();
132810
+          // XXX: semicolons prevent collision with programmer-defined labels
132810
 
132810
-      if (off != (int16_t)off)
132810
-	throw SEMANTIC_ERROR (_("offset field out of range in bpf code"), s->tok);
132810
+          label_map[fallthrough_label] = b;
132810
+          set_block(b);
132810
 
132810
-      insn *i = this_ins.new_insn();
132810
-      i->code = code;
132810
-      i->dest = (r_dest ? v_dest : NULL);
132810
-      i->src0 = (r_src0 ? v_dest : NULL);
132810
-      i->src1 = v_src1;
132810
-      i->off = off;
132810
+          after_jump->fallthrough = fallthrough_label;
132810
+        }
132810
 
132810
-      ii >> s1;
132810
-      if (ii.eof())
132810
-	break;
132810
-      if (s1 != ';')
132810
-	throw SEMANTIC_ERROR (_("invalid bpf embeddedcode syntax"), s->tok);
132810
+      if (stmt.kind == "label" && after_label)
132810
+        {
132810
+          // avoid creating multiple blocks for consecutive labels
132810
+          label_map[stmt.dest] = this_ins.b;
132810
+          after_jump = NULL;
132810
+        }
132810
+      else if (stmt.kind == "label")
132810
+        {
132810
+          block *b = this_prog.new_block();
132810
+          label_map[stmt.dest] = b;
132810
+          set_block(b);
132810
+          after_label = true;
132810
+          after_jump = NULL;
132810
+        }
132810
+      else if (stmt.has_fallthrough)
132810
+        {
132810
+          after_label = false;
132810
+          after_jump = &*it; // be sure to refer to original, not copied stmt
132810
+        }
132810
+      else
132810
+        {
132810
+          after_label = false;
132810
+          after_jump = NULL;
132810
+        }
132810
+    }
132810
+  if (after_jump != NULL) // TODO: should just fall through to exit
132810
+    throw SEMANTIC_ERROR (_("BUG: bpf embeddedcode doesn't support "
132810
+                            "fallthrough on final asm_stmt"), stmt.tok);
132810
+
132810
+  // emit statements
132810
+  bool jumped_already = true;
132810
+  set_block(entry_block);
132810
+  for (std::vector<asm_stmt>::iterator it = statements.begin();
132810
+       it != statements.end(); it++)
132810
+    {
132810
+      stmt = *it;
132810
+      std::cerr << "DEBUG processing " << stmt << std::endl; // TODO
132810
+      if (stmt.kind == "label")
132810
+        {
132810
+          // TODO: be sure there's no gap in the edge
132810
+          if (!jumped_already)
132810
+            emit_jmp (label_map[stmt.dest]);
132810
+          set_block(label_map[stmt.dest]);
132810
+        }
132810
+      else if (stmt.kind == "opcode")
132810
+        {
132810
+          emit_asm_opcode (stmt, label_map);
132810
+        }
132810
+      else
132810
+        throw SEMANTIC_ERROR (_F("BUG: bpf embeddedcode contains unexpected "
132810
+                                 "asm_stmt kind '%s'", stmt.kind.c_str()),
132810
+                              stmt.tok);
132810
+      jumped_already = stmt.has_fallthrough;
132810
+      if (stmt.has_fallthrough)
132810
+        set_block(label_map[stmt.fallthrough]);
132810
     }
132810
 }
132810
 
132810
@@ -1016,8 +1461,13 @@ bpf_unparser::visit_delete_statement (delete_statement *s)
132810
 }
132810
 
132810
 // Translate string escape characters.
132810
+// Accepts strings produced by parse.cxx lexer::scan and
132810
+// by the eBPF embedded-code assembler.
132810
+//
132810
+// PR23559: This is currently an eBPF-only version of the function
132810
+// that does not translate octal escapes.
132810
 std::string
132810
-translate_escapes (interned_string &str)
132810
+translate_escapes (const interned_string &str)
132810
 {
132810
   std::string result;
132810
   bool saw_esc = false;
132810
@@ -1045,16 +1495,21 @@ translate_escapes (interned_string &str)
132810
   return result;
132810
 }
132810
 
132810
+value *
132810
+bpf_unparser::emit_literal_string (const std::string &str, const token *tok)
132810
+{
132810
+  size_t str_bytes = str.size() + 1;
132810
+  if (str_bytes > BPF_MAXSTRINGLEN)
132810
+    throw SEMANTIC_ERROR(_("string literal too long"), tok);
132810
+  return this_prog.new_str(str); // will be lowered to a pointer by bpf-opt.cxx
132810
+}
132810
+
132810
 void
132810
 bpf_unparser::visit_literal_string (literal_string* e)
132810
 {
132810
   interned_string v = e->value;
132810
   std::string str = translate_escapes(v);
132810
-
132810
-  size_t str_bytes = str.size() + 1;
132810
-  if (str_bytes > BPF_MAXSTRINGLEN)
132810
-    throw SEMANTIC_ERROR(_("String literal too long"), e->tok);
132810
-  result = this_prog.new_str(str); // will be lowered to a pointer by bpf-opt.cxx
132810
+  result = emit_literal_string(str, e->tok);
132810
 }
132810
 
132810
 void
132810
@@ -1783,7 +2238,7 @@ bpf_unparser::visit_target_register (target_register* e)
132810
 // ??? Could use 8-byte chunks if we're starved for instruction count.
132810
 // ??? Endianness of the target comes into play here.
132810
 value *
132810
-emit_literal_str(program &this_prog, insn_inserter &this_ins,
132810
+emit_simple_literal_str(program &this_prog, insn_inserter &this_ins,
132810
                  value *dest, int ofs, std::string &src, bool zero_pad)
132810
 {
132810
   size_t str_bytes = src.size() + 1;
132810
@@ -1835,15 +2290,15 @@ emit_literal_str(program &this_prog, insn_inserter &this_ins,
132810
 // ??? Could use 8-byte chunks if we're starved for instruction count.
132810
 // ??? Endianness of the target may come into play here.
132810
 value *
132810
-bpf_unparser::emit_copied_str(value *dest, int ofs, value *src, bool zero_pad)
132810
+bpf_unparser::emit_string_copy(value *dest, int ofs, value *src, bool zero_pad)
132810
 {
132810
   if (src->is_str())
132810
     {
132810
       /* If src is a string literal, its exact length is known and
132810
          we can emit simpler, unconditional string copying code. */
132810
       std::string str = src->str();
132810
-      return emit_literal_str(this_prog, this_ins,
132810
-                              dest, ofs, str, zero_pad);
132810
+      return emit_simple_literal_str(this_prog, this_ins,
132810
+                                     dest, ofs, str, zero_pad);
132810
     }
132810
 
132810
   size_t str_bytes = BPF_MAXSTRINGLEN;
132810
@@ -1931,7 +2386,7 @@ bpf_unparser::emit_copied_str(value *dest, int ofs, value *src, bool zero_pad)
132810
     }
132810
 
132810
   // XXX: Zero-padding is only used under specific circumstances;
132810
-  // see the corresponding comment in emit_literal_str().
132810
+  // see the corresponding comment in emit_simple_literal_str().
132810
   if (zero_pad)
132810
     {
132810
       for (unsigned i = 0; i < str_words; ++i)
132810
@@ -1977,7 +2432,7 @@ void
132810
 bpf_unparser::emit_str_arg(value *arg, int ofs, value *str)
132810
 {
132810
   value *frame = this_prog.lookup_reg(BPF_REG_10);
132810
-  value *out = emit_copied_str(frame, ofs, str, true /* zero pad */);
132810
+  value *out = emit_string_copy(frame, ofs, str, true /* zero pad */);
132810
   emit_mov(arg, out);
132810
 }
132810
 
132810
diff --git a/parse.h b/parse.h
132810
index 42b0bc5fd..96aef0394 100644
132810
--- a/parse.h
132810
+++ b/parse.h
132810
@@ -65,11 +65,25 @@ struct token
132810
   token_junk_type junk_type;
132810
 
132810
   std::string junk_message(systemtap_session& session) const;
132810
+
132810
+  // Allocates (with new) a copy of this token relocated to adjusted_loc.
132810
+  // Enables exact error reporting *within* a token, e.g. in embedded-code.
132810
+  token *adjust_location(const source_loc &adjusted_loc) const
132810
+  { // TODO split from header
132810
+    token *new_tok = new token;
132810
+    new_tok->location = adjusted_loc;
132810
+    new_tok->content = content;
132810
+    new_tok->chain = chain;
132810
+    new_tok->type = type;
132810
+    new_tok->junk_type = junk_type;
132810
+    return new_tok;
132810
+  }
132810
   
132810
   friend class parser;
132810
   friend class lexer;
132810
 private:
132810
   void make_junk (token_junk_type);
132810
+
132810
   token(): chain(0), type(tok_junk), junk_type(tok_junk_unknown) {}
132810
   token(const token& other):
132810
     location(other.location), content(other.content),
132810
-- 
132810
2.14.5
132810