Blame SOURCES/rhbz1643997.0002-stapbpf-assembler-WIP-1-basic-parser-and-control-flo.patch

e4e640
From 17d4495bef5c3878bb38730ff0d849415b52641a Mon Sep 17 00:00:00 2001
e4e640
From: Serhei Makarov <smakarov@redhat.com>
e4e640
Date: Mon, 1 Oct 2018 15:38:16 -0400
e4e640
Subject: [PATCH 02/32] stapbpf assembler WIP #1 :: basic parser and control
e4e640
 flow
e4e640
e4e640
---
e4e640
 bpf-internal.h    |   7 +-
e4e640
 bpf-opt.cxx       |   2 +-
e4e640
 bpf-translate.cxx | 745 +++++++++++++++++++++++++++++++++++++++++++-----------
e4e640
 parse.h           |  14 +
e4e640
 4 files changed, 619 insertions(+), 149 deletions(-)
e4e640
e4e640
diff --git a/bpf-internal.h b/bpf-internal.h
e4e640
index 17a033533..719446db8 100644
e4e640
--- a/bpf-internal.h
e4e640
+++ b/bpf-internal.h
e4e640
@@ -261,9 +261,10 @@ struct program
e4e640
   void print(std::ostream &) const;
e4e640
 };
e4e640
 
e4e640
-// ??? Properly belongs to bpf_unparser but must be accessible from bpf-opt.cxx:
e4e640
-value *emit_literal_str(program &this_prog, insn_inserter &this_ins,
e4e640
-                        value *dest, int ofs, std::string &src, bool zero_pad = false);
e4e640
+// ??? Properly belongs to bpf_unparser but must be visible from bpf-opt.cxx:
e4e640
+value *emit_simple_literal_str(program &this_prog, insn_inserter &this_ins,
e4e640
+                               value *dest, int ofs, std::string &src,
e4e640
+                               bool zero_pad = false);
e4e640
 
e4e640
 inline std::ostream&
e4e640
 operator<< (std::ostream &o, const program &c)
e4e640
diff --git a/bpf-opt.cxx b/bpf-opt.cxx
e4e640
index 0f64d826d..c2e30a690 100644
e4e640
--- a/bpf-opt.cxx
e4e640
+++ b/bpf-opt.cxx
e4e640
@@ -41,7 +41,7 @@ alloc_literal_str(program &p, insn_inserter &ins, std::string &str)
e4e640
   int ofs = -tmp_space;
e4e640
 
e4e640
   value *frame = p.lookup_reg(BPF_REG_10);
e4e640
-  value *out = emit_literal_str(p, ins, frame, ofs, str, false /* don't zero pad */);
e4e640
+  value *out = emit_simple_literal_str(p, ins, frame, ofs, str, false /* don't zero pad */);
e4e640
   return out;
e4e640
 }
e4e640
 
e4e640
diff --git a/bpf-translate.cxx b/bpf-translate.cxx
e4e640
index d848c9f16..023ac6ce7 100644
e4e640
--- a/bpf-translate.cxx
e4e640
+++ b/bpf-translate.cxx
e4e640
@@ -8,6 +8,7 @@
e4e640
 
e4e640
 #include "config.h"
e4e640
 #include "bpf-internal.h"
e4e640
+#include "parse.h"
e4e640
 #include "staptree.h"
e4e640
 #include "elaborate.h"
e4e640
 #include "session.h"
e4e640
@@ -134,6 +135,9 @@ has_side_effects (expression *e)
e4e640
   return t.side_effects;
e4e640
 }
e4e640
 
e4e640
+/* forward declaration */
e4e640
+struct asm_stmt;
e4e640
+
e4e640
 struct bpf_unparser : public throwing_visitor
e4e640
 {
e4e640
   // The visitor class isn't as helpful as it might be.  As a consequence,
e4e640
@@ -233,10 +237,19 @@ struct bpf_unparser : public throwing_visitor
e4e640
   value *emit_expr(expression *e);
e4e640
   value *emit_bool(expression *e);
e4e640
   value *emit_context_var(bpf_context_vardecl *v);
e4e640
-  value *parse_reg(const std::string &str, embeddedcode *s);
e4e640
 
e4e640
-  // Used for copying string data:
e4e640
-  value *emit_copied_str(value *dest, int ofs, value *src, bool zero_pad = false);
e4e640
+  // Used for the embedded-code assembler:
e4e640
+  size_t parse_asm_stmt (embeddedcode *s, size_t start,
e4e640
+                           /*OUT*/asm_stmt &stmt);
e4e640
+  value *emit_asm_arg(const asm_stmt &stmt, const std::string &reg,
e4e640
+                      bool allow_imm = true);
e4e640
+  value *emit_asm_reg(const asm_stmt &stmt, const std::string &reg);
e4e640
+  void emit_asm_opcode(const asm_stmt &stmt,
e4e640
+                       std::map<std::string, block *> label_map);
e4e640
+
e4e640
+  // Used for string data:
e4e640
+  value *emit_literal_string(const std::string &str, const token *tok);
e4e640
+  value *emit_string_copy(value *dest, int ofs, value *src, bool zero_pad = false);
e4e640
 
e4e640
   // Used for passing long and string arguments on the stack where an address is expected:
e4e640
   void emit_long_arg(value *arg, int ofs, value *val);
e4e640
@@ -552,172 +565,604 @@ bpf_unparser::visit_block (::block *s)
e4e640
     emit_stmt (s->statements[i]);
e4e640
 }
e4e640
 
e4e640
+/* WORK IN PROGRESS: A simple eBPF assembler.
e4e640
+
e4e640
+   In order to effectively write eBPF tapset functions, we want to use
e4e640
+   embedded-code assembly rather than compile from SystemTap code. At
e4e640
+   the same time, we want to hook into stapbpf functionality to
e4e640
+   reserve stack memory, allocate virtual registers or signal errors.
e4e640
+
e4e640
+   The assembler syntax will probably take a couple of attempts to get
e4e640
+   just right. This attempt keeps things as close as possible to the
e4e640
+   first embedded-code assembler, with a few more features and a
e4e640
+   disgustingly lenient parser that allows things like
e4e640
+     $ this is        all one "**identifier**" believe-it!-or-not
e4e640
+
e4e640
+   Ahh for the days of 1960s FORTRAN.
e4e640
+
e4e640
+   TODO: It might make more sense to implement an assembler based on
e4e640
+   the syntax used in official eBPF subsystem docs. */
e4e640
+
e4e640
+/* Possible assembly statement types include:
e4e640
+
e4e640
+   <stmt> ::= label, <dest=label>;
e4e640
+   <stmt> ::= <code=integer opcode>, <dest=reg>, <src1=reg>,
e4e640
+              <off/jmp_target=off>, <imm=imm>;
e4e640
+
e4e640
+   Possible argument types include:
e4e640
+
e4e640
+   <reg> ::= <register index> | r<register index> |
e4e640
+             $<identifier> | $<integer constant> | $$ | <string constant>
e4e640
+   <imm> ::= <integer constant> | BPF_MAXSTRINGLEN
e4e640
+   <off> ::= <imm> | <jump label>
e4e640
+
e4e640
+*/
e4e640
+
e4e640
+struct asm_stmt {
e4e640
+  std::string kind;
e4e640
+
e4e640
+  unsigned code;
e4e640
+  std::string dest, src1;
e4e640
+  int64_t off, imm;
e4e640
+
e4e640
+  // metadata for jmp instructions
e4e640
+  bool has_fallthrough = false;
e4e640
+  std::string jmp_target, fallthrough;
e4e640
+
e4e640
+  token *tok;
e4e640
+  bool deallocate_tok = false;
e4e640
+  ~asm_stmt() { if (deallocate_tok) delete tok; }
e4e640
+};
e4e640
+
e4e640
+std::ostream&
e4e640
+operator << (std::ostream& o, const asm_stmt& stmt)
e4e640
+{
e4e640
+  if (stmt.kind == "label")
e4e640
+    o << "label, " << stmt.dest << ";";
e4e640
+  else if (stmt.kind == "opcode")
e4e640
+    {
e4e640
+      o << std::hex << stmt.code << ", "
e4e640
+        << stmt.dest << ", "
e4e640
+        << stmt.src1 << ", ";
e4e640
+      if (stmt.off != 0 || stmt.jmp_target == "")
e4e640
+        o << stmt.off;
e4e640
+      else if (stmt.off != 0) // && stmt.jmp_target != ""
e4e640
+        o << stmt.off << "/";
e4e640
+      if (stmt.jmp_target != "")
e4e640
+        o << "label:" << stmt.jmp_target;
e4e640
+      o << ", "
e4e640
+        << stmt.imm << ";"
e4e640
+        << (stmt.has_fallthrough ? " +FALLTHROUGH " + stmt.fallthrough : "");
e4e640
+    }
e4e640
+  else
e4e640
+    o << "<unknown asm_stmt kind '" << stmt.kind << "'>";
e4e640
+  return o;
e4e640
+}
e4e640
+
e4e640
+bool
e4e640
+is_numeric (const std::string &str)
e4e640
+{
e4e640
+  size_t pos = 0;
e4e640
+  try {
e4e640
+    stol(str, &pos, 0);
e4e640
+  } catch (std::invalid_argument &e) {
e4e640
+    return false;
e4e640
+  }
e4e640
+  return (pos == str.size());
e4e640
+}
e4e640
+
e4e640
+/* Parse an assembly statement starting from position start in code,
e4e640
+   then write the output in stmt. Returns the position just after the
e4e640
+   parsed statement, or std::string::npos once no statements remain. */
e4e640
+size_t
e4e640
+bpf_unparser::parse_asm_stmt (embeddedcode *s, size_t start,
e4e640
+                              /*OUT*/asm_stmt &stmt)
e4e640
+{
e4e640
+  const interned_string &code = s->code;
e4e640
+
e4e640
+ retry:
e4e640
+  std::vector<std::string> args;
e4e640
+  unsigned n = code.size();
e4e640
+  bool in_comment = false;
e4e640
+  bool in_string = false;
e4e640
+
e4e640
+  // compute token with adjusted source location for diagnostics
e4e640
+  source_loc adjusted_loc; // TODO: ought to create a proper copy constructor for source_loc
e4e640
+  adjusted_loc.file = s->tok->location.file;
e4e640
+  adjusted_loc.line = s->tok->location.line;
e4e640
+  adjusted_loc.column = s->tok->location.column;
e4e640
+  for (size_t pos = 0; pos < start && pos < n; pos++)
e4e640
+    {
e4e640
+      // TODO: should save adjusted_loc state between parse_asm_stmt invocations; add field?
e4e640
+      char c = code[pos];
e4e640
+      if (c == '\n')
e4e640
+        {
e4e640
+          adjusted_loc.line++;
e4e640
+          adjusted_loc.column = 1;
e4e640
+        }
e4e640
+      else
e4e640
+        adjusted_loc.column++;
e4e640
+    }
e4e640
+
e4e640
+  // TODO: As before, parser is extremely non-rigorous and could do
e4e640
+  // with some tightening in terms of the inputs it accepts.
e4e640
+  size_t pos;
e4e640
+  std::string arg = "";
e4e640
+  for (pos = start; pos < n; pos++)
e4e640
+  {
e4e640
+    char c = code[pos];
e4e640
+    char c2 = pos + 1 < n ? code [pos + 1] : 0;
e4e640
+    if (isspace(c))
e4e640
+      continue; // skip
e4e640
+    else if (in_comment)
e4e640
+      {
e4e640
+        if (c == '*' && c2 == '/')
e4e640
+          ++pos, in_comment = false;
e4e640
+        // else skip
e4e640
+      }
e4e640
+    else if (in_string)
e4e640
+      {
e4e640
+        // resulting string will be processed by translate_escapes()
e4e640
+        if (c == '"')
e4e640
+          arg.push_back(c), in_string = false; // include quote
e4e640
+        else if (c == '\\' && c2 == '"')
e4e640
+          ++pos, arg.push_back(c), arg.push_back(c2);
e4e640
+        else // accept any char, including whitespace
e4e640
+          arg.push_back(c);
e4e640
+      }
e4e640
+    else if (c == '/' && c2 == '*')
e4e640
+      ++pos, in_comment = true;
e4e640
+    else if (c == '"') // found a literal string
e4e640
+      {
e4e640
+        // XXX: This allows '"' inside an arg and will treat the
e4e640
+        // string as a sequence of weird identifier characters.  A
e4e640
+        // more rigorous parser would error on mixing strings and
e4e640
+        // regular chars.
e4e640
+        arg.push_back(c); // include quote
e4e640
+        in_string = true;
e4e640
+      }
e4e640
+    else if (c == ',') // reached end of argument
e4e640
+      {
e4e640
+        // XXX: This strips out empty args. A more rigorous parser would error.
e4e640
+        if (arg != "")
e4e640
+          args.push_back(arg);
e4e640
+        arg = "";
e4e640
+      }
e4e640
+    else if (c == ';') // reached end of statement
e4e640
+      {
e4e640
+        // XXX: This strips out empty args. A more rigorous parser would error.
e4e640
+        if (arg != "")
e4e640
+          args.push_back(arg);
e4e640
+        arg = "";
e4e640
+        pos++; break;
e4e640
+      }
e4e640
+    else // found (we assume) a regular char
e4e640
+      {
e4e640
+        // XXX: As before, this strips whitespace within args
e4e640
+        // (so '$ab', '$ a b' and '$a b' are equivalent).
e4e640
+        //
e4e640
+        // A more rigorous parser would track in_arg
e4e640
+        // and after_arg states and error on whitespace within args.
e4e640
+        arg.push_back(c);
e4e640
+      }
e4e640
+  }
e4e640
+  // final ';' is optional, so we watch for a trailing arg:
e4e640
+  if (arg != "") args.push_back(arg);
e4e640
+
e4e640
+  // handle the case with no args
e4e640
+  if (args.empty() && pos >= n)
e4e640
+    return std::string::npos; // finished parsing
e4e640
+  else if (args.empty())
e4e640
+    {
e4e640
+      // XXX: This skips an empty statement.
e4e640
+      // A more rigorous parser would error.
e4e640
+      start = pos;
e4e640
+      goto retry;
e4e640
+    }
e4e640
+
e4e640
+  // set token with adjusted source location
e4e640
+  //stmt.tok = (token *)s->tok;
e4e640
+  // TODO this segfaults for some reason, some data not copied?
e4e640
+  stmt.tok = s->tok->adjust_location(adjusted_loc);
e4e640
+  stmt.deallocate_tok = false; // TODO must avoid destroy-on-copy
e4e640
+
e4e640
+  std::cerr << "DEBUG GOT stmt "; // TODO
e4e640
+  for (unsigned k = 0; k < args.size(); k++) std::cerr << args[k] << " / ";
e4e640
+  std::cerr << std::endl; // TODO
e4e640
+  if (args[0] == "label")
e4e640
+    {
e4e640
+      if (args.size() != 2)
e4e640
+        throw SEMANTIC_ERROR (_("invalid bpf embeddedcode syntax"), stmt.tok);
e4e640
+      stmt.kind = args[0];
e4e640
+      stmt.dest = args[1];
e4e640
+    }
e4e640
+  else if (is_numeric(args[0]))
e4e640
+    {
e4e640
+      if (args.size() != 5) // TODO change to 4 to test err+tok
e4e640
+        throw SEMANTIC_ERROR (_("invalid bpf embeddedcode syntax"), stmt.tok);
e4e640
+      stmt.kind = "opcode";
e4e640
+      stmt.code = stoul(args[0], 0, 0); // TODO signal error
e4e640
+      stmt.dest = args[1];
e4e640
+      stmt.src1 = args[2];
e4e640
+
e4e640
+      bool has_jmp_target =
e4e640
+        BPF_CLASS(stmt.code) == BPF_JMP
e4e640
+        && BPF_OP(stmt.code) != BPF_EXIT
e4e640
+        && BPF_OP(stmt.code) != BPF_CALL;
e4e640
+      stmt.has_fallthrough = // only for jcond
e4e640
+        has_jmp_target
e4e640
+        && BPF_OP(stmt.code) != BPF_JA;
e4e640
+      // XXX: stmt.fallthrough is computed by visit_embeddedcode
e4e640
+
e4e640
+      if (has_jmp_target)
e4e640
+        {
e4e640
+          stmt.off = 0;
e4e640
+          stmt.jmp_target = args[3];
e4e640
+        }
e4e640
+      else if (args[3] == "BPF_MAXSTRINGLEN")
e4e640
+        stmt.off = BPF_MAXSTRINGLEN;
e4e640
+      else if (args[3] == "-")
e4e640
+        stmt.off = 0;
e4e640
+      else
e4e640
+        stmt.off = stol(args[3]); // TODO signal error
e4e640
+
e4e640
+      if (args[4] == "BPF_MAXSTRINGLEN")
e4e640
+        stmt.imm = BPF_MAXSTRINGLEN;
e4e640
+      else if (args[4] == "-")
e4e640
+        stmt.imm = 0;
e4e640
+      else
e4e640
+        stmt.imm = stol(args[4]); // TODO signal error
e4e640
+    }
e4e640
+  else
e4e640
+    throw SEMANTIC_ERROR (_F("unknown bpf embeddedcode operator '%s'",
e4e640
+                             args[0].c_str()), stmt.tok);
e4e640
+
e4e640
+  // we returned a statement, so there's more parsing to be done
e4e640
+  return pos;
e4e640
+}
e4e640
+
e4e640
+/* forward declaration */
e4e640
+std::string translate_escapes (const interned_string &str);
e4e640
+
e4e640
+/* Convert a <reg> or <imm> operand to a value.
e4e640
+   May emit code to store a string constant on the stack. */
e4e640
 value *
e4e640
-bpf_unparser::parse_reg(const std::string &str, embeddedcode *s)
e4e640
+bpf_unparser::emit_asm_arg (const asm_stmt &stmt, const std::string &arg,
e4e640
+                            bool allow_imm)
e4e640
 {
e4e640
-  if (str == "$$")
e4e640
+  if (arg == "$$")
e4e640
     {
e4e640
-      if (func_return.empty ())
e4e640
-	throw SEMANTIC_ERROR (_("no return value outside function"), s->tok);
e4e640
+      /* arg is a return value */
e4e640
+      if (func_return.empty())
e4e640
+        throw SEMANTIC_ERROR (_("no return value outside function"), stmt.tok);
e4e640
       return func_return_val.back();
e4e640
     }
e4e640
-  else if (str[0] == '$')
e4e640
+  else if (arg[0] == '$')
e4e640
     {
e4e640
-      std::string var = str.substr(1);
e4e640
+      /* assume arg is a variable */
e4e640
+      std::string var = arg.substr(1);
e4e640
       for (auto i = this_locals->begin(); i != this_locals->end(); ++i)
e4e640
 	{
e4e640
 	  vardecl *v = i->first;
e4e640
 	  if (var == v->unmangled_name)
e4e640
 	    return i->second;
e4e640
 	}
e4e640
-      throw SEMANTIC_ERROR (_("unknown variable"), s->tok);
e4e640
+
e4e640
+      /* if it's an unknown variable, allocate a temporary */
e4e640
+      struct vardecl *vd = new vardecl;
e4e640
+      vd->name = "__bpfasm__local_" + var;
e4e640
+      vd->unmangled_name = var;
e4e640
+      vd->type = pe_long;
e4e640
+      vd->arity = 0;
e4e640
+      value *reg = this_prog.new_reg();
e4e640
+      const locals_map::value_type v (vd, reg);
e4e640
+      auto ok = this_locals->insert (v);
e4e640
+      assert (ok.second);
e4e640
+      return reg;
e4e640
+      // TODO write a testcase
e4e640
     }
e4e640
-  else
e4e640
+  else if (is_numeric(arg) && allow_imm)
e4e640
     {
e4e640
-      unsigned long num = stoul(str, 0, 0);
e4e640
+      /* arg is an immediate constant */
e4e640
+      long imm = stol(arg, 0, 0);
e4e640
+      return this_prog.new_imm(imm);
e4e640
+    }
e4e640
+  else if (is_numeric(arg) || arg[0] == 'r')
e4e640
+    {
e4e640
+      /* arg is a register number */
e4e640
+      std::string reg = arg[0] == 'r' ? arg.substr(1) : arg;
e4e640
+      unsigned long num = stoul(reg, 0, 0);
e4e640
       if (num > 10)
e4e640
-	throw SEMANTIC_ERROR (_("invalid bpf register"), s->tok);
e4e640
+	throw SEMANTIC_ERROR (_F("invalid bpf register '%s'",
e4e640
+                                 arg.c_str()), stmt.tok);
e4e640
       return this_prog.lookup_reg(num);
e4e640
     }
e4e640
+  else if (arg[0] == '"')
e4e640
+    {
e4e640
+      // TODO verify correctness
e4e640
+      /* arg is a string constant */
e4e640
+      if (arg[arg.size() - 1] != '"')
e4e640
+        throw SEMANTIC_ERROR (_F("BUG: improper string %s",
e4e640
+                                 arg.c_str()), stmt.tok);
e4e640
+      std::string escaped_str = arg.substr(1,arg.size()-2); /* strip quotes */
e4e640
+      std::string str = translate_escapes(escaped_str); // TODO interned_str?
e4e640
+      return emit_literal_string(str, stmt.tok);
e4e640
+    }
e4e640
+  else if (arg == "BPF_MAXSTRINGLEN")
e4e640
+    {
e4e640
+      /* arg is BPF_MAXSTRINGLEN */
e4e640
+      if (!allow_imm)
e4e640
+        throw SEMANTIC_ERROR (_F("invalid bpf register '%s'",
e4e640
+                                 arg.c_str()), stmt.tok);
e4e640
+      return this_prog.new_imm(BPF_MAXSTRINGLEN);
e4e640
+    }
e4e640
+  else if (arg == "-")
e4e640
+    {
e4e640
+      /* arg is null a.k.a '0' */
e4e640
+      if (!allow_imm)
e4e640
+        throw SEMANTIC_ERROR (_F("invalid bpf register '%s'",
e4e640
+                                 arg.c_str()), stmt.tok);
e4e640
+      return this_prog.new_imm(0);
e4e640
+    }
e4e640
+  else if (allow_imm)
e4e640
+    throw SEMANTIC_ERROR (_F("invalid bpf argument '%s'",
e4e640
+                             arg.c_str()), stmt.tok);
e4e640
+  else
e4e640
+    throw SEMANTIC_ERROR (_F("invalid bpf register '%s'",
e4e640
+                             arg.c_str()), stmt.tok);
e4e640
+  
e4e640
+}
e4e640
+
e4e640
+value *
e4e640
+bpf_unparser::emit_asm_reg (const asm_stmt &stmt, const std::string &reg)
e4e640
+{
e4e640
+  return emit_asm_arg(stmt, reg, /*allow_imm=*/false);
e4e640
 }
e4e640
 
e4e640
 void
e4e640
-bpf_unparser::visit_embeddedcode (embeddedcode *s)
e4e640
+bpf_unparser::emit_asm_opcode (const asm_stmt &stmt,
e4e640
+                               std::map<std::string, block *> label_map)
e4e640
 {
e4e640
-  std::string strip;
e4e640
-  {
e4e640
-    const interned_string &code = s->code;
e4e640
-    unsigned n = code.size();
e4e640
-    bool in_comment = false;
e4e640
+  if (stmt.code > 0xff && stmt.code != BPF_LD_MAP)
e4e640
+    throw SEMANTIC_ERROR (_("invalid bpf code"), stmt.tok);
e4e640
 
e4e640
-    for (unsigned i = 0; i < n; ++i)
e4e640
-      {
e4e640
-	char c = code[i];
e4e640
-	if (isspace(c))
e4e640
-	  continue;
e4e640
-	if (in_comment)
e4e640
-	  {
e4e640
-	    if (c == '*' && code[i + 1] == '/')
e4e640
-	      ++i, in_comment = false;
e4e640
-	  }
e4e640
-	else if (c == '/' && code[i + 1] == '*')
e4e640
-	  ++i, in_comment = true;
e4e640
-	else
e4e640
-	  strip += c;
e4e640
-      }
e4e640
-  }
e4e640
+  bool r_dest = false, r_src0 = false, r_src1 = false, i_src1 = false;
e4e640
+  bool op_jmp = false, op_jcond = false; condition c;
e4e640
+  switch (BPF_CLASS (stmt.code))
e4e640
+    {
e4e640
+    case BPF_LDX:
e4e640
+      r_dest = r_src1 = true;
e4e640
+      break;
e4e640
+    case BPF_STX:
e4e640
+      r_src0 = r_src1 = true;
e4e640
+      break;
e4e640
+    case BPF_ST:
e4e640
+      r_src0 = i_src1 = true;
e4e640
+      break;
e4e640
+
e4e640
+    case BPF_ALU:
e4e640
+    case BPF_ALU64:
e4e640
+      r_dest = true;
e4e640
+      if (stmt.code & BPF_X)
e4e640
+        r_src1 = true;
e4e640
+      else
e4e640
+        i_src1 = true;
e4e640
+      switch (BPF_OP (stmt.code))
e4e640
+        {
e4e640
+        case BPF_NEG:
e4e640
+        case BPF_MOV:
e4e640
+          break;
e4e640
+        case BPF_END:
e4e640
+          /* X/K bit repurposed as LE/BE.  */
e4e640
+          i_src1 = false, r_src1 = true;
e4e640
+          break;
e4e640
+        default:
e4e640
+          r_src0 = true;
e4e640
+        }
e4e640
+      break;
e4e640
+
e4e640
+    case BPF_JMP:
e4e640
+      switch (BPF_OP (stmt.code))
e4e640
+        {
e4e640
+        case BPF_EXIT:
e4e640
+          // no special treatment needed
e4e640
+          break;
e4e640
+        case BPF_CALL:
e4e640
+          i_src1 = true;
e4e640
+          break;
e4e640
+        case BPF_JA:
e4e640
+          op_jmp = true;
e4e640
+          break;
e4e640
+        default:
e4e640
+          // XXX: assume this is a jcond op
e4e640
+          op_jcond = true;
e4e640
+          r_src0 = true;
e4e640
+          if (stmt.code & BPF_X)
e4e640
+            r_src1 = true;
e4e640
+          else
e4e640
+            i_src1 = true;
e4e640
+        }
e4e640
+
e4e640
+      // compute jump condition c
e4e640
+      switch (BPF_OP (stmt.code))
e4e640
+        {
e4e640
+        case BPF_JEQ: c = EQ; break;
e4e640
+        case BPF_JNE: c = NE; break;
e4e640
+        case BPF_JGT: c = GTU; break;
e4e640
+        case BPF_JGE: c = GEU; break;
e4e640
+        case BPF_JLT: c = LTU; break;
e4e640
+        case BPF_JLE: c = LEU; break;
e4e640
+        case BPF_JSGT: c = GT; break;
e4e640
+        case BPF_JSGE: c = GE; break;
e4e640
+        case BPF_JSLT: c = LT; break;
e4e640
+        case BPF_JSLE: c = LE; break;
e4e640
+        case BPF_JSET: c = TEST; break;
e4e640
+        default:
e4e640
+          if (op_jcond)
e4e640
+            throw SEMANTIC_ERROR (_("invalid branch in bpf code"), stmt.tok);
e4e640
+        }
e4e640
+      break;
e4e640
+
e4e640
+    default:
e4e640
+      if (stmt.code == BPF_LD_MAP)
e4e640
+        r_dest = true, i_src1 = true;
e4e640
+      else
e4e640
+        throw SEMANTIC_ERROR (_F("unknown opcode '%d' in bpf code",
e4e640
+                                stmt.code), stmt.tok);
e4e640
+    }
e4e640
 
e4e640
-  std::istringstream ii (strip);
e4e640
-  ii >> std::setbase(0);
e4e640
+  value *v_dest = NULL;
e4e640
+  if (r_dest || r_src0)
e4e640
+    v_dest = emit_asm_reg(stmt, stmt.dest);
e4e640
+  else if (stmt.dest != "0" && stmt.dest != "-")
e4e640
+    throw SEMANTIC_ERROR (_F("invalid register field '%s' in bpf code",
e4e640
+                             stmt.dest.c_str()), stmt.tok);
e4e640
 
e4e640
-  while (true)
e4e640
+  value *v_src1 = NULL;
e4e640
+  if (r_src1)
e4e640
+    v_src1 = emit_asm_reg(stmt, stmt.src1);
e4e640
+  else
e4e640
     {
e4e640
-      unsigned code;
e4e640
-      char s1, s2, s3, s4;
e4e640
-      char dest_b[256], src1_b[256];
e4e640
-      int64_t off, imm;
e4e640
+      if (stmt.src1 != "0" && stmt.src1 != "-")
e4e640
+        throw SEMANTIC_ERROR (_F("invalid register field '%s' in bpf code",
e4e640
+                                 stmt.src1.c_str()), stmt.tok);
e4e640
+      if (i_src1)
e4e640
+        v_src1 = this_prog.new_imm(stmt.imm);
e4e640
+      else if (stmt.imm != 0)
e4e640
+        throw SEMANTIC_ERROR (_("invalid immediate field in bpf code"), stmt.tok);
e4e640
+    }
e4e640
 
e4e640
-      ii >> code >> s1;
e4e640
-      ii.get(dest_b, sizeof(dest_b), ',') >> s2;
e4e640
-      ii.get(src1_b, sizeof(src1_b), ',') >> s3;
e4e640
-      ii >> off >> s4 >> imm;
e4e640
+  if (stmt.off != (int16_t)stmt.off)
e4e640
+    throw SEMANTIC_ERROR (_F("offset field '%ld' out of range in bpf code", stmt.off), stmt.tok);
e4e640
 
e4e640
-      if (ii.fail() || s1 != ',' || s2 != ',' || s3 != ',' || s4 != ',')
e4e640
-	throw SEMANTIC_ERROR (_("invalid bpf embeddedcode syntax"), s->tok);
e4e640
+  if (op_jmp)
e4e640
+    {
e4e640
+      block *target = label_map[stmt.jmp_target];
e4e640
+      this_prog.mk_jmp(this_ins, target);
e4e640
+    }
e4e640
+  else if (op_jcond)
e4e640
+    {
e4e640
+      if (label_map.count(stmt.jmp_target) == 0)
e4e640
+        throw SEMANTIC_ERROR(_F("undefined jump target '%s' in bpf code",
e4e640
+                                stmt.jmp_target.c_str()), stmt.tok);
e4e640
+      if (label_map.count(stmt.fallthrough) == 0)
e4e640
+        throw SEMANTIC_ERROR(_F("BUG: undefined fallthrough target '%s'",
e4e640
+                                stmt.fallthrough.c_str()), stmt.tok);
e4e640
+      block *target = label_map[stmt.jmp_target];
e4e640
+      block *fallthrough = label_map[stmt.fallthrough];
e4e640
+      this_prog.mk_jcond(this_ins, c, v_dest, v_src1, target, fallthrough);
e4e640
+    }
e4e640
+  else // regular opcode
e4e640
+    {
e4e640
+      insn *i = this_ins.new_insn();
e4e640
+      i->code = stmt.code;
e4e640
+      i->dest = (r_dest ? v_dest : NULL);
e4e640
+      i->src0 = (r_src0 ? v_dest : NULL);
e4e640
+      i->src1 = v_src1;
e4e640
+      i->off = stmt.off;
e4e640
+    }
e4e640
+}
e4e640
 
e4e640
-      if (code > 0xff && code != BPF_LD_MAP)
e4e640
-	throw SEMANTIC_ERROR (_("invalid bpf code"), s->tok);
e4e640
+void
e4e640
+bpf_unparser::visit_embeddedcode (embeddedcode *s)
e4e640
+{
e4e640
+  std::vector<asm_stmt> statements;
e4e640
+  asm_stmt stmt;
e4e640
 
e4e640
-      bool r_dest = false, r_src0 = false, r_src1 = false, i_src1 = false;
e4e640
-      switch (BPF_CLASS (code))
e4e640
-	{
e4e640
-	case BPF_LDX:
e4e640
-	  r_dest = r_src1 = true;
e4e640
-	  break;
e4e640
-	case BPF_STX:
e4e640
-	  r_src0 = r_src1 = true;
e4e640
-	  break;
e4e640
-	case BPF_ST:
e4e640
-	  r_src0 = i_src1 = true;
e4e640
-	  break;
e4e640
+  size_t pos = 0;
e4e640
+  while ((pos = parse_asm_stmt(s, pos, stmt)) != std::string::npos)
e4e640
+    {
e4e640
+      statements.push_back(stmt);
e4e640
+    }
e4e640
 
e4e640
-	case BPF_ALU:
e4e640
-	case BPF_ALU64:
e4e640
-	  r_dest = true;
e4e640
-	  if (code & BPF_X)
e4e640
-	    r_src1 = true;
e4e640
-	  else
e4e640
-	    i_src1 = true;
e4e640
-	  switch (BPF_OP (code))
e4e640
-	    {
e4e640
-	    case BPF_NEG:
e4e640
-	    case BPF_MOV:
e4e640
-	      break;
e4e640
-	    case BPF_END:
e4e640
-	      /* X/K bit repurposed as LE/BE.  */
e4e640
-	      i_src1 = false, r_src1 = true;
e4e640
-	      break;
e4e640
-	    default:
e4e640
-	      r_src0 = true;
e4e640
-	    }
e4e640
-	  break;
e4e640
+  // build basic block table
e4e640
+  std::map<std::string, block *> label_map;
e4e640
+  block *entry_block = this_ins.b;
e4e640
+  label_map[";;entry"] = entry_block;
e4e640
 
e4e640
-	case BPF_JMP:
e4e640
-	  switch (BPF_OP (code))
e4e640
-	    {
e4e640
-	    case BPF_EXIT:
e4e640
-	      break;
e4e640
-	    case BPF_CALL:
e4e640
-	      i_src1 = true;
e4e640
-	      break;
e4e640
-	    default:
e4e640
-	      throw SEMANTIC_ERROR (_("invalid branch in bpf code"), s->tok);
e4e640
-	    }
e4e640
-	  break;
e4e640
+  bool after_label = true;
e4e640
+  asm_stmt *after_jump = NULL;
e4e640
+  unsigned fallthrough_count = 0;
e4e640
+  for (std::vector<asm_stmt>::iterator it = statements.begin();
e4e640
+       it != statements.end(); it++)
e4e640
+    {
e4e640
+      stmt = *it;
e4e640
 
e4e640
-	default:
e4e640
-          if (code == BPF_LD_MAP)
e4e640
-            r_dest = true, i_src1 = true;
e4e640
-          else
e4e640
-	    throw SEMANTIC_ERROR (_("unknown opcode in bpf code"), s->tok);
e4e640
-	}
e4e640
+      if (after_jump != NULL && stmt.kind == "label")
e4e640
+        {
e4e640
+          after_jump->fallthrough = stmt.dest;
e4e640
+        }
e4e640
+      else if (after_jump != NULL)
e4e640
+        {
e4e640
+          block *b = this_prog.new_block();
e4e640
 
e4e640
-      std::string dest(dest_b);
e4e640
-      value *v_dest = NULL;
e4e640
-      if (r_dest || r_src0)
e4e640
-	v_dest = parse_reg(dest, s);
e4e640
-      else if (dest != "0")
e4e640
-	throw SEMANTIC_ERROR (_("invalid register field in bpf code"), s->tok);
e4e640
-
e4e640
-      std::string src1(src1_b);
e4e640
-      value *v_src1 = NULL;
e4e640
-      if (r_src1)
e4e640
-	v_src1 = parse_reg(src1, s);
e4e640
-      else
e4e640
-	{
e4e640
-	  if (src1 != "0")
e4e640
-	    throw SEMANTIC_ERROR (_("invalid register field in bpf code"), s->tok);
e4e640
-	  if (i_src1)
e4e640
-	    v_src1 = this_prog.new_imm(imm);
e4e640
-	  else if (imm != 0)
e4e640
-	    throw SEMANTIC_ERROR (_("invalid immediate field in bpf code"), s->tok);
e4e640
-	}
e4e640
+          // generate unique label for fallthrough edge
e4e640
+          std::ostringstream oss;
e4e640
+          oss << "fallthrough;;" << fallthrough_count++;
e4e640
+          std::string fallthrough_label = oss.str();
e4e640
+          // XXX: semicolons prevent collision with programmer-defined labels
e4e640
 
e4e640
-      if (off != (int16_t)off)
e4e640
-	throw SEMANTIC_ERROR (_("offset field out of range in bpf code"), s->tok);
e4e640
+          label_map[fallthrough_label] = b;
e4e640
+          set_block(b);
e4e640
 
e4e640
-      insn *i = this_ins.new_insn();
e4e640
-      i->code = code;
e4e640
-      i->dest = (r_dest ? v_dest : NULL);
e4e640
-      i->src0 = (r_src0 ? v_dest : NULL);
e4e640
-      i->src1 = v_src1;
e4e640
-      i->off = off;
e4e640
+          after_jump->fallthrough = fallthrough_label;
e4e640
+        }
e4e640
 
e4e640
-      ii >> s1;
e4e640
-      if (ii.eof())
e4e640
-	break;
e4e640
-      if (s1 != ';')
e4e640
-	throw SEMANTIC_ERROR (_("invalid bpf embeddedcode syntax"), s->tok);
e4e640
+      if (stmt.kind == "label" && after_label)
e4e640
+        {
e4e640
+          // avoid creating multiple blocks for consecutive labels
e4e640
+          label_map[stmt.dest] = this_ins.b;
e4e640
+          after_jump = NULL;
e4e640
+        }
e4e640
+      else if (stmt.kind == "label")
e4e640
+        {
e4e640
+          block *b = this_prog.new_block();
e4e640
+          label_map[stmt.dest] = b;
e4e640
+          set_block(b);
e4e640
+          after_label = true;
e4e640
+          after_jump = NULL;
e4e640
+        }
e4e640
+      else if (stmt.has_fallthrough)
e4e640
+        {
e4e640
+          after_label = false;
e4e640
+          after_jump = &*it; // be sure to refer to original, not copied stmt
e4e640
+        }
e4e640
+      else
e4e640
+        {
e4e640
+          after_label = false;
e4e640
+          after_jump = NULL;
e4e640
+        }
e4e640
+    }
e4e640
+  if (after_jump != NULL) // TODO: should just fall through to exit
e4e640
+    throw SEMANTIC_ERROR (_("BUG: bpf embeddedcode doesn't support "
e4e640
+                            "fallthrough on final asm_stmt"), stmt.tok);
e4e640
+
e4e640
+  // emit statements
e4e640
+  bool jumped_already = true;
e4e640
+  set_block(entry_block);
e4e640
+  for (std::vector<asm_stmt>::iterator it = statements.begin();
e4e640
+       it != statements.end(); it++)
e4e640
+    {
e4e640
+      stmt = *it;
e4e640
+      std::cerr << "DEBUG processing " << stmt << std::endl; // TODO
e4e640
+      if (stmt.kind == "label")
e4e640
+        {
e4e640
+          // TODO: be sure there's no gap in the edge
e4e640
+          if (!jumped_already)
e4e640
+            emit_jmp (label_map[stmt.dest]);
e4e640
+          set_block(label_map[stmt.dest]);
e4e640
+        }
e4e640
+      else if (stmt.kind == "opcode")
e4e640
+        {
e4e640
+          emit_asm_opcode (stmt, label_map);
e4e640
+        }
e4e640
+      else
e4e640
+        throw SEMANTIC_ERROR (_F("BUG: bpf embeddedcode contains unexpected "
e4e640
+                                 "asm_stmt kind '%s'", stmt.kind.c_str()),
e4e640
+                              stmt.tok);
e4e640
+      jumped_already = stmt.has_fallthrough;
e4e640
+      if (stmt.has_fallthrough)
e4e640
+        set_block(label_map[stmt.fallthrough]);
e4e640
     }
e4e640
 }
e4e640
 
e4e640
@@ -1016,8 +1461,13 @@ bpf_unparser::visit_delete_statement (delete_statement *s)
e4e640
 }
e4e640
 
e4e640
 // Translate string escape characters.
e4e640
+// Accepts strings produced by parse.cxx lexer::scan and
e4e640
+// by the eBPF embedded-code assembler.
e4e640
+//
e4e640
+// PR23559: This is currently an eBPF-only version of the function
e4e640
+// that does not translate octal escapes.
e4e640
 std::string
e4e640
-translate_escapes (interned_string &str)
e4e640
+translate_escapes (const interned_string &str)
e4e640
 {
e4e640
   std::string result;
e4e640
   bool saw_esc = false;
e4e640
@@ -1045,16 +1495,21 @@ translate_escapes (interned_string &str)
e4e640
   return result;
e4e640
 }
e4e640
 
e4e640
+// Create a string-literal value for STR after checking its length.
e4e640
+// Throws SEMANTIC_ERROR located at TOK when STR needs more than
e4e640
+// BPF_MAXSTRINGLEN bytes; otherwise returns this_prog.new_str(str).
e4e640
+value *
e4e640
+bpf_unparser::emit_literal_string (const std::string &str, const token *tok)
e4e640
+{
e4e640
+  // One byte more than the character count -- presumably the terminating NUL.
e4e640
+  size_t str_bytes = str.size() + 1;
e4e640
+  if (str_bytes > BPF_MAXSTRINGLEN)
e4e640
+    throw SEMANTIC_ERROR(_("string literal too long"), tok);
e4e640
+  return this_prog.new_str(str); // will be lowered to a pointer by bpf-opt.cxx
e4e640
+}
e4e640
+
e4e640
 void
e4e640
 bpf_unparser::visit_literal_string (literal_string* e)
e4e640
 {
e4e640
+  // Translate the escape sequences in the literal's text, then store the
e4e640
+  // value built from the translated string in 'result'.
e4e640
   interned_string v = e->value;
e4e640
   std::string str = translate_escapes(v);
e4e640
-
e4e640
-  size_t str_bytes = str.size() + 1;
e4e640
-  if (str_bytes > BPF_MAXSTRINGLEN)
e4e640
-    throw SEMANTIC_ERROR(_("String literal too long"), e->tok);
e4e640
-  result = this_prog.new_str(str); // will be lowered to a pointer by bpf-opt.cxx
e4e640
+  // The length check and value creation are done by emit_literal_string(),
e4e640
+  // which reports an over-long literal at e->tok.
e4e640
+  result = emit_literal_string(str, e->tok);
e4e640
 }
e4e640
 
e4e640
 void
e4e640
@@ -1783,7 +2238,7 @@ bpf_unparser::visit_target_register (target_register* e)
e4e640
 // ??? Could use 8-byte chunks if we're starved for instruction count.
e4e640
 // ??? Endianness of the target comes into play here.
e4e640
 value *
e4e640
-emit_literal_str(program &this_prog, insn_inserter &this_ins,
e4e640
+emit_simple_literal_str(program &this_prog, insn_inserter &this_ins,
e4e640
                  value *dest, int ofs, std::string &src, bool zero_pad)
e4e640
 {
e4e640
   size_t str_bytes = src.size() + 1;
e4e640
@@ -1835,15 +2290,15 @@ emit_literal_str(program &this_prog, insn_inserter &this_ins,
e4e640
 // ??? Could use 8-byte chunks if we're starved for instruction count.
e4e640
 // ??? Endianness of the target may come into play here.
e4e640
 value *
e4e640
-bpf_unparser::emit_copied_str(value *dest, int ofs, value *src, bool zero_pad)
e4e640
+bpf_unparser::emit_string_copy(value *dest, int ofs, value *src, bool zero_pad)
e4e640
 {
e4e640
   if (src->is_str())
e4e640
     {
e4e640
       /* If src is a string literal, its exact length is known and
e4e640
          we can emit simpler, unconditional string copying code. */
e4e640
       std::string str = src->str();
e4e640
-      return emit_literal_str(this_prog, this_ins,
e4e640
-                              dest, ofs, str, zero_pad);
e4e640
+      return emit_simple_literal_str(this_prog, this_ins,
e4e640
+                                     dest, ofs, str, zero_pad);
e4e640
     }
e4e640
 
e4e640
   size_t str_bytes = BPF_MAXSTRINGLEN;
e4e640
@@ -1931,7 +2386,7 @@ bpf_unparser::emit_copied_str(value *dest, int ofs, value *src, bool zero_pad)
e4e640
     }
e4e640
 
e4e640
   // XXX: Zero-padding is only used under specific circumstances;
e4e640
-  // see the corresponding comment in emit_literal_str().
e4e640
+  // see the corresponding comment in emit_simple_literal_str().
e4e640
   if (zero_pad)
e4e640
     {
e4e640
       for (unsigned i = 0; i < str_words; ++i)
e4e640
@@ -1977,7 +2432,7 @@ void
e4e640
 bpf_unparser::emit_str_arg(value *arg, int ofs, value *str)
e4e640
 {
e4e640
+  // Copy STR, zero padded, to offset OFS from the BPF_REG_10 register
e4e640
+  // (held in 'frame'), then move the value returned by the copy into ARG.
e4e640
   value *frame = this_prog.lookup_reg(BPF_REG_10);
e4e640
-  value *out = emit_copied_str(frame, ofs, str, true /* zero pad */);
e4e640
+  value *out = emit_string_copy(frame, ofs, str, true /* zero pad */);
e4e640
   emit_mov(arg, out);
e4e640
 }
e4e640
 
e4e640
diff --git a/parse.h b/parse.h
e4e640
index 42b0bc5fd..96aef0394 100644
e4e640
--- a/parse.h
e4e640
+++ b/parse.h
e4e640
@@ -65,11 +65,25 @@ struct token
e4e640
   token_junk_type junk_type;
e4e640
 
e4e640
   std::string junk_message(systemtap_session& session) const;
e4e640
+
e4e640
+  // Returns a newly allocated token with the same content, chain, type and
e4e640
+  // junk_type as this one, but located at ADJUSTED_LOC instead.
e4e640
+  // Can be used for exact error reporting *within* a token, e.g. embedded-code.
e4e640
+  // The copy is created with 'new' and is not freed here.
e4e640
+  // NOTE(review): presumably the caller owns it -- confirm intended lifetime.
e4e640
+  token *adjust_location(const source_loc &adjusted_loc) const
e4e640
+  { // TODO split from header
e4e640
+    token *new_tok = new token;
e4e640
+    new_tok->location = adjusted_loc;
e4e640
+    new_tok->content = content;
e4e640
+    new_tok->chain = chain;
e4e640
+    new_tok->type = type;
e4e640
+    new_tok->junk_type = junk_type;
e4e640
+    return new_tok;
e4e640
+  }
e4e640
   
e4e640
   friend class parser;
e4e640
   friend class lexer;
e4e640
 private:
e4e640
   void make_junk (token_junk_type);
e4e640
+
e4e640
   token(): chain(0), type(tok_junk), junk_type(tok_junk_unknown) {}
e4e640
   token(const token& other):
e4e640
     location(other.location), content(other.content),
e4e640
-- 
e4e640
2.14.5
e4e640