Blob Blame History Raw
From 17d4495bef5c3878bb38730ff0d849415b52641a Mon Sep 17 00:00:00 2001
From: Serhei Makarov <smakarov@redhat.com>
Date: Mon, 1 Oct 2018 15:38:16 -0400
Subject: [PATCH 02/32] stapbpf assembler WIP #1 :: basic parser and control
 flow

---
 bpf-internal.h    |   7 +-
 bpf-opt.cxx       |   2 +-
 bpf-translate.cxx | 745 +++++++++++++++++++++++++++++++++++++++++++-----------
 parse.h           |  14 +
 4 files changed, 619 insertions(+), 149 deletions(-)

diff --git a/bpf-internal.h b/bpf-internal.h
index 17a033533..719446db8 100644
--- a/bpf-internal.h
+++ b/bpf-internal.h
@@ -261,9 +261,10 @@ struct program
   void print(std::ostream &) const;
 };
 
-// ??? Properly belongs to bpf_unparser but must be accessible from bpf-opt.cxx:
-value *emit_literal_str(program &this_prog, insn_inserter &this_ins,
-                        value *dest, int ofs, std::string &src, bool zero_pad = false);
+// ??? Properly belongs to bpf_unparser but must be visible from bpf-opt.cxx:
+value *emit_simple_literal_str(program &this_prog, insn_inserter &this_ins,
+                               value *dest, int ofs, std::string &src,
+                               bool zero_pad = false);
 
 inline std::ostream&
 operator<< (std::ostream &o, const program &c)
diff --git a/bpf-opt.cxx b/bpf-opt.cxx
index 0f64d826d..c2e30a690 100644
--- a/bpf-opt.cxx
+++ b/bpf-opt.cxx
@@ -41,7 +41,7 @@ alloc_literal_str(program &p, insn_inserter &ins, std::string &str)
   int ofs = -tmp_space;
 
   value *frame = p.lookup_reg(BPF_REG_10);
-  value *out = emit_literal_str(p, ins, frame, ofs, str, false /* don't zero pad */);
+  value *out = emit_simple_literal_str(p, ins, frame, ofs, str, false /* don't zero pad */);
   return out;
 }
 
diff --git a/bpf-translate.cxx b/bpf-translate.cxx
index d848c9f16..023ac6ce7 100644
--- a/bpf-translate.cxx
+++ b/bpf-translate.cxx
@@ -8,6 +8,7 @@
 
 #include "config.h"
 #include "bpf-internal.h"
+#include "parse.h"
 #include "staptree.h"
 #include "elaborate.h"
 #include "session.h"
@@ -134,6 +135,9 @@ has_side_effects (expression *e)
   return t.side_effects;
 }
 
+/* forward declaration */
+struct asm_stmt;
+
 struct bpf_unparser : public throwing_visitor
 {
   // The visitor class isn't as helpful as it might be.  As a consequence,
@@ -233,10 +237,19 @@ struct bpf_unparser : public throwing_visitor
   value *emit_expr(expression *e);
   value *emit_bool(expression *e);
   value *emit_context_var(bpf_context_vardecl *v);
-  value *parse_reg(const std::string &str, embeddedcode *s);
 
-  // Used for copying string data:
-  value *emit_copied_str(value *dest, int ofs, value *src, bool zero_pad = false);
+  // Used for the embedded-code assembler:
+  size_t parse_asm_stmt (embeddedcode *s, size_t start,
+                           /*OUT*/asm_stmt &stmt);
+  value *emit_asm_arg(const asm_stmt &stmt, const std::string &reg,
+                      bool allow_imm = true);
+  value *emit_asm_reg(const asm_stmt &stmt, const std::string &reg);
+  void emit_asm_opcode(const asm_stmt &stmt,
+                       std::map<std::string, block *> label_map);
+
+  // Used for string data:
+  value *emit_literal_string(const std::string &str, const token *tok);
+  value *emit_string_copy(value *dest, int ofs, value *src, bool zero_pad = false);
 
   // Used for passing long and string arguments on the stack where an address is expected:
   void emit_long_arg(value *arg, int ofs, value *val);
@@ -552,172 +565,604 @@ bpf_unparser::visit_block (::block *s)
     emit_stmt (s->statements[i]);
 }
 
+/* WORK IN PROGRESS: A simple eBPF assembler.
+
+   In order to effectively write eBPF tapset functions, we want to use
+   embedded-code assembly rather than compile from SystemTap code. At
+   the same time, we want to hook into stapbpf functionality to
+   reserve stack memory, allocate virtual registers or signal errors.
+
+   The assembler syntax will probably take a couple of attempts to get
+   just right. This attempt keeps things as close as possible to the
+   first embedded-code assembler, with a few more features and a
+   disgustingly lenient parser that allows things like
+     $ this is        all one "**identifier**" believe-it!-or-not
+
+   Ahh for the days of 1960s FORTRAN.
+
+   TODO: It might make more sense to implement an assembler based on
+   the syntax used in official eBPF subsystem docs. */
+
+/* Possible assembly statement types include:
+
+   <stmt> ::= label, <dest=label>;
+   <stmt> ::= <code=integer opcode>, <dest=reg>, <src1=reg>,
+              <off/jmp_target=off>, <imm=imm>;
+
+   Possible argument types include:
+
+   <reg> ::= <register index> | r<register index> |
+             $<identifier> | $<integer constant> | $$ | <string constant>
+   <imm> ::= <integer constant> | BPF_MAXSTRINGLEN | - (placeholder for 0)
+   <off> ::= <imm> | <jump label>
+
+*/
+
+struct asm_stmt {
+  std::string kind; // "label" or "opcode"
+
+  // Scalars get default initializers: a label statement never assigns
+  // them, yet the whole struct is copied into std::vector<asm_stmt>.
+  unsigned code = 0;
+  std::string dest, src1;
+  int64_t off = 0, imm = 0;
+  // metadata for jmp instructions
+  bool has_fallthrough = false;
+  std::string jmp_target, fallthrough;
+  token *tok = NULL; // adjusted source location, used in diagnostics
+  bool deallocate_tok = false; // when true, the destructor deletes tok
+  ~asm_stmt() { if (deallocate_tok) delete tok; }
+};
+
+/* Print STMT in a form resembling the assembler input (for diagnostics). */
+std::ostream&
+operator << (std::ostream& o, const asm_stmt& stmt)
+{
+  if (stmt.kind == "label")
+    o << "label, " << stmt.dest << ";";
+  else if (stmt.kind == "opcode")
+    {
+      const std::ios_base::fmtflags saved_flags = o.flags(); // std::hex is sticky
+      o << std::hex << stmt.code << ", " << stmt.dest << ", " << stmt.src1 << ", ";
+      if (stmt.off != 0 || stmt.jmp_target == "")
+        o << stmt.off;
+      if (stmt.off != 0 && stmt.jmp_target != "")
+        o << "/"; // separates a nonzero offset from the label
+      if (stmt.jmp_target != "")
+        o << "label:" << stmt.jmp_target;
+      o << ", " << stmt.imm << ";"
+        << (stmt.has_fallthrough ? " +FALLTHROUGH " + stmt.fallthrough : "");
+      o.flags(saved_flags); // do not leave the caller's stream in hex mode
+    }
+  else
+    o << "<unknown asm_stmt kind '" << stmt.kind << "'>";
+  return o;
+}
+
+/* Return true iff STR is, in its entirety, an integer literal that fits
+   in a long (base auto-detected: decimal, 0x hex, or leading-0 octal). */
+bool
+is_numeric (const std::string &str)
+{
+  size_t pos = 0;
+  // stol throws invalid_argument on junk and out_of_range on overflow;
+  // both derive from logic_error, and neither input is usable as a number.
+  try { stol(str, &pos, 0); } catch (const std::logic_error &) { return false; }
+  return (pos == str.size());
+}
+
+/* Parse one assembly statement from s->code, starting at position start,
+   and store it in stmt. Returns the position just past the statement, or
+   std::string::npos (leaving stmt untouched) when no statement remains. */
+size_t
+bpf_unparser::parse_asm_stmt (embeddedcode *s, size_t start,
+                              /*OUT*/asm_stmt &stmt)
+{
+  const interned_string &code = s->code;
+
+ retry:
+  std::vector<std::string> args;
+  unsigned n = code.size();
+  bool in_comment = false;
+  bool in_string = false;
+
+  // compute token with adjusted source location for diagnostics
+  source_loc adjusted_loc; // TODO: ought to create a proper copy constructor for source_loc
+  adjusted_loc.file = s->tok->location.file;
+  adjusted_loc.line = s->tok->location.line;
+  adjusted_loc.column = s->tok->location.column;
+  for (size_t pos = 0; pos < start && pos < n; pos++)
+    {
+      // TODO: should save adjusted_loc state between parse_asm_stmt invocations; add field?
+      char c = code[pos];
+      if (c == '\n')
+        {
+          adjusted_loc.line++;
+          adjusted_loc.column = 1;
+        }
+      else
+        adjusted_loc.column++;
+    }
+
+  // TODO: As before, parser is extremely non-rigorous and could do
+  // with some tightening in terms of the inputs it accepts.
+  size_t pos;
+  std::string arg = "";
+  for (pos = start; pos < n; pos++)
+  {
+    char c = code[pos];
+    char c2 = pos + 1 < n ? code [pos + 1] : 0;
+    if (!in_string && isspace((unsigned char) c))
+      continue; // skip whitespace, except inside string literals
+    else if (in_comment)
+      {
+        if (c == '*' && c2 == '/')
+          ++pos, in_comment = false;
+        // else skip
+      }
+    else if (in_string)
+      {
+        // resulting string will be processed by translate_escapes()
+        if (c == '"')
+          arg.push_back(c), in_string = false; // include quote
+        else if (c == '\\' && c2 != 0) // keep escape pairs intact, so \\" ends the string
+          ++pos, arg.push_back(c), arg.push_back(c2);
+        else // accept any char, including whitespace
+          arg.push_back(c);
+      }
+    else if (c == '/' && c2 == '*')
+      ++pos, in_comment = true;
+    else if (c == '"') // found a literal string
+      {
+        // XXX: This allows '"' inside an arg and will treat the
+        // string as a sequence of weird identifier characters.  A
+        // more rigorous parser would error on mixing strings and
+        // regular chars.
+        arg.push_back(c); // include quote
+        in_string = true;
+      }
+    else if (c == ',') // reached end of argument
+      {
+        // XXX: This strips out empty args. A more rigorous parser would error.
+        if (arg != "")
+          args.push_back(arg);
+        arg = "";
+      }
+    else if (c == ';') // reached end of statement
+      {
+        // XXX: This strips out empty args. A more rigorous parser would error.
+        if (arg != "")
+          args.push_back(arg);
+        arg = "";
+        pos++; break;
+      }
+    else // found (we assume) a regular char
+      {
+        // XXX: As before, this strips whitespace within args
+        // (so '$ab', '$ a b' and '$a b' are equivalent).
+        //
+        // A more rigorous parser would track in_arg
+        // and after_arg states and error on whitespace within args.
+        arg.push_back(c);
+      }
+  }
+  // final ';' is optional, so we watch for a trailing arg:
+  if (arg != "") args.push_back(arg);
+
+  // handle the case with no args
+  if (args.empty() && pos >= n)
+    return std::string::npos; // finished parsing
+  else if (args.empty())
+    {
+      // XXX: This skips an empty statement.
+      // A more rigorous parser would error.
+      start = pos;
+      goto retry;
+    }
+
+  stmt = asm_stmt(); // caller reuses stmt: drop fields left by the previous statement
+  // set token with adjusted source location
+  //stmt.tok = (token *)s->tok;
+  // TODO this segfaults for some reason, some data not copied?
+  stmt.tok = s->tok->adjust_location(adjusted_loc);
+  stmt.deallocate_tok = false; // TODO must avoid destroy-on-copy
+
+  std::cerr << "DEBUG GOT stmt "; // TODO
+  for (unsigned k = 0; k < args.size(); k++) std::cerr << args[k] << " / ";
+  std::cerr << std::endl; // TODO
+  if (args[0] == "label")
+    {
+      if (args.size() != 2)
+        throw SEMANTIC_ERROR (_("invalid bpf embeddedcode syntax"), stmt.tok);
+      stmt.kind = args[0];
+      stmt.dest = args[1];
+    }
+  else if (is_numeric(args[0]))
+    {
+      if (args.size() != 5) // TODO change to 4 to test err+tok
+        throw SEMANTIC_ERROR (_("invalid bpf embeddedcode syntax"), stmt.tok);
+      stmt.kind = "opcode";
+      stmt.code = stoul(args[0], 0, 0); // TODO signal error
+      stmt.dest = args[1];
+      stmt.src1 = args[2];
+
+      bool has_jmp_target =
+        BPF_CLASS(stmt.code) == BPF_JMP
+        && BPF_OP(stmt.code) != BPF_EXIT
+        && BPF_OP(stmt.code) != BPF_CALL;
+      stmt.has_fallthrough = // only for jcond
+        has_jmp_target && BPF_OP(stmt.code) != BPF_JA;
+      // XXX: stmt.fallthrough is computed by visit_embeddedcode
+
+      if (has_jmp_target)
+        {
+          stmt.off = 0;
+          stmt.jmp_target = args[3];
+        }
+      else if (args[3] == "BPF_MAXSTRINGLEN")
+        stmt.off = BPF_MAXSTRINGLEN;
+      else if (args[3] == "-")
+        stmt.off = 0;
+      else
+        stmt.off = stol(args[3], 0, 0); // base 0 so hex works; TODO signal error
+
+      if (args[4] == "BPF_MAXSTRINGLEN")
+        stmt.imm = BPF_MAXSTRINGLEN;
+      else if (args[4] == "-")
+        stmt.imm = 0;
+      else
+        stmt.imm = stol(args[4], 0, 0); // base 0 so hex works; TODO signal error
+    }
+  else
+    throw SEMANTIC_ERROR (_F("unknown bpf embeddedcode operator '%s'",
+                             args[0].c_str()), stmt.tok);
+
+  // we returned a statement, so there's more parsing to be done
+  return pos;
+}
+
+/* forward declaration */
+std::string translate_escapes (const interned_string &str);
+
+/* Convert a <reg> or <imm> operand to a value (<imm> only if allow_imm).
+   May emit code to store a string constant on the stack. */
 value *
-bpf_unparser::parse_reg(const std::string &str, embeddedcode *s)
+bpf_unparser::emit_asm_arg (const asm_stmt &stmt, const std::string &arg,
+                            bool allow_imm)
 {
-  if (str == "$$")
+  if (arg == "$$")
     {
-      if (func_return.empty ())
-	throw SEMANTIC_ERROR (_("no return value outside function"), s->tok);
+      /* arg is a return value */
+      if (func_return.empty())
+        throw SEMANTIC_ERROR (_("no return value outside function"), stmt.tok);
       return func_return_val.back();
     }
-  else if (str[0] == '$')
+  else if (arg[0] == '$')
     {
-      std::string var = str.substr(1);
+      /* assume arg is a variable */
+      std::string var = arg.substr(1);
       for (auto i = this_locals->begin(); i != this_locals->end(); ++i)
 	{
 	  vardecl *v = i->first;
 	  if (var == v->unmangled_name)
 	    return i->second;
 	}
-      throw SEMANTIC_ERROR (_("unknown variable"), s->tok);
+
+      /* if it's an unknown variable, allocate a temporary */
+      struct vardecl *vd = new vardecl;
+      vd->name = "__bpfasm__local_" + var;
+      vd->unmangled_name = var;
+      vd->type = pe_long;
+      vd->arity = 0;
+      value *reg = this_prog.new_reg();
+      const locals_map::value_type v (vd, reg);
+      auto ok = this_locals->insert (v);
+      assert (ok.second);
+      return reg;
+      // TODO write a testcase
     }
-  else
+  else if (is_numeric(arg) && allow_imm)
     {
-      unsigned long num = stoul(str, 0, 0);
+      /* arg is an immediate constant */
+      long imm = stol(arg, 0, 0);
+      return this_prog.new_imm(imm);
+    }
+  else if (is_numeric(arg) || arg[0] == 'r')
+    {
+      /* arg is a register number */
+      std::string reg = arg[0] == 'r' ? arg.substr(1) : arg;
+      unsigned long num = is_numeric(reg) ? stoul(reg, 0, 0) : ~0UL; // junk such as "rax" fails the check below
       if (num > 10)
-	throw SEMANTIC_ERROR (_("invalid bpf register"), s->tok);
+	throw SEMANTIC_ERROR (_F("invalid bpf register '%s'",
+                                 arg.c_str()), stmt.tok);
       return this_prog.lookup_reg(num);
     }
+  else if (arg[0] == '"')
+    {
+      // TODO verify correctness
+      /* arg is a string constant */
+      if (arg.size() < 2 || arg[arg.size() - 1] != '"') // a lone '"' is not a string
+        throw SEMANTIC_ERROR (_F("BUG: improper string %s",
+                                 arg.c_str()), stmt.tok);
+      std::string escaped_str = arg.substr(1,arg.size()-2); /* strip quotes */
+      std::string str = translate_escapes(escaped_str); // TODO interned_str?
+      return emit_literal_string(str, stmt.tok);
+    }
+  else if (arg == "BPF_MAXSTRINGLEN")
+    {
+      /* arg is BPF_MAXSTRINGLEN */
+      if (!allow_imm)
+        throw SEMANTIC_ERROR (_F("invalid bpf register '%s'",
+                                 arg.c_str()), stmt.tok);
+      return this_prog.new_imm(BPF_MAXSTRINGLEN);
+    }
+  else if (arg == "-")
+    {
+      /* arg is null a.k.a '0' */
+      if (!allow_imm)
+        throw SEMANTIC_ERROR (_F("invalid bpf register '%s'",
+                                 arg.c_str()), stmt.tok);
+      return this_prog.new_imm(0);
+    }
+  else if (allow_imm)
+    throw SEMANTIC_ERROR (_F("invalid bpf argument '%s'",
+                             arg.c_str()), stmt.tok);
+  else
+    throw SEMANTIC_ERROR (_F("invalid bpf register '%s'",
+                             arg.c_str()), stmt.tok);
+
+}
+
+value *
+bpf_unparser::emit_asm_reg (const asm_stmt &stmt, const std::string &reg)
+{ // Like emit_asm_arg, but a numeric operand names a register, never an immediate.
+  return emit_asm_arg(stmt, reg, /*allow_imm=*/false);
 }
 
 void
-bpf_unparser::visit_embeddedcode (embeddedcode *s)
+bpf_unparser::emit_asm_opcode (const asm_stmt &stmt,
+                               std::map<std::string, block *> label_map)
 {
-  std::string strip;
-  {
-    const interned_string &code = s->code;
-    unsigned n = code.size();
-    bool in_comment = false;
+  if (stmt.code > 0xff && stmt.code != BPF_LD_MAP)
+    throw SEMANTIC_ERROR (_("invalid bpf code"), stmt.tok);
 
-    for (unsigned i = 0; i < n; ++i)
-      {
-	char c = code[i];
-	if (isspace(c))
-	  continue;
-	if (in_comment)
-	  {
-	    if (c == '*' && code[i + 1] == '/')
-	      ++i, in_comment = false;
-	  }
-	else if (c == '/' && code[i + 1] == '*')
-	  ++i, in_comment = true;
-	else
-	  strip += c;
-      }
-  }
+  bool r_dest = false, r_src0 = false, r_src1 = false, i_src1 = false;
+  bool op_jmp = false, op_jcond = false; condition c;
+  switch (BPF_CLASS (stmt.code))
+    {
+    case BPF_LDX:
+      r_dest = r_src1 = true;
+      break;
+    case BPF_STX:
+      r_src0 = r_src1 = true;
+      break;
+    case BPF_ST:
+      r_src0 = i_src1 = true;
+      break;
+
+    case BPF_ALU:
+    case BPF_ALU64:
+      r_dest = true;
+      if (stmt.code & BPF_X)
+        r_src1 = true;
+      else
+        i_src1 = true;
+      switch (BPF_OP (stmt.code))
+        {
+        case BPF_NEG:
+        case BPF_MOV:
+          break;
+        case BPF_END:
+          /* X/K bit repurposed as LE/BE.  */
+          i_src1 = false, r_src1 = true;
+          break;
+        default:
+          r_src0 = true;
+        }
+      break;
+
+    case BPF_JMP:
+      switch (BPF_OP (stmt.code))
+        {
+        case BPF_EXIT:
+          // no special treatment needed
+          break;
+        case BPF_CALL:
+          i_src1 = true;
+          break;
+        case BPF_JA:
+          op_jmp = true;
+          break;
+        default:
+          // XXX: assume this is a jcond op
+          op_jcond = true;
+          r_src0 = true;
+          if (stmt.code & BPF_X)
+            r_src1 = true;
+          else
+            i_src1 = true;
+        }
+
+      // compute jump condition c
+      switch (BPF_OP (stmt.code))
+        {
+        case BPF_JEQ: c = EQ; break;
+        case BPF_JNE: c = NE; break;
+        case BPF_JGT: c = GTU; break;
+        case BPF_JGE: c = GEU; break;
+        case BPF_JLT: c = LTU; break;
+        case BPF_JLE: c = LEU; break;
+        case BPF_JSGT: c = GT; break;
+        case BPF_JSGE: c = GE; break;
+        case BPF_JSLT: c = LT; break;
+        case BPF_JSLE: c = LE; break;
+        case BPF_JSET: c = TEST; break;
+        default:
+          if (op_jcond)
+            throw SEMANTIC_ERROR (_("invalid branch in bpf code"), stmt.tok);
+        }
+      break;
+
+    default:
+      if (stmt.code == BPF_LD_MAP)
+        r_dest = true, i_src1 = true;
+      else
+        throw SEMANTIC_ERROR (_F("unknown opcode '%d' in bpf code",
+                                stmt.code), stmt.tok);
+    }
 
-  std::istringstream ii (strip);
-  ii >> std::setbase(0);
+  value *v_dest = NULL;
+  if (r_dest || r_src0)
+    v_dest = emit_asm_reg(stmt, stmt.dest);
+  else if (stmt.dest != "0" && stmt.dest != "-")
+    throw SEMANTIC_ERROR (_F("invalid register field '%s' in bpf code",
+                             stmt.dest.c_str()), stmt.tok);
 
-  while (true)
+  value *v_src1 = NULL;
+  if (r_src1)
+    v_src1 = emit_asm_reg(stmt, stmt.src1);
+  else
     {
-      unsigned code;
-      char s1, s2, s3, s4;
-      char dest_b[256], src1_b[256];
-      int64_t off, imm;
+      if (stmt.src1 != "0" && stmt.src1 != "-")
+        throw SEMANTIC_ERROR (_F("invalid register field '%s' in bpf code",
+                                 stmt.src1.c_str()), stmt.tok);
+      if (i_src1)
+        v_src1 = this_prog.new_imm(stmt.imm);
+      else if (stmt.imm != 0)
+        throw SEMANTIC_ERROR (_("invalid immediate field in bpf code"), stmt.tok);
+    }
 
-      ii >> code >> s1;
-      ii.get(dest_b, sizeof(dest_b), ',') >> s2;
-      ii.get(src1_b, sizeof(src1_b), ',') >> s3;
-      ii >> off >> s4 >> imm;
+  if (stmt.off != (int16_t)stmt.off)
+    throw SEMANTIC_ERROR (_F("offset field '%ld' out of range in bpf code", stmt.off), stmt.tok);
 
-      if (ii.fail() || s1 != ',' || s2 != ',' || s3 != ',' || s4 != ',')
-	throw SEMANTIC_ERROR (_("invalid bpf embeddedcode syntax"), s->tok);
+  // Both jump kinds need a defined label; map::operator[] would otherwise
+  // silently insert a NULL block for a misspelled or missing target.
+  if ((op_jmp || op_jcond) && label_map.count(stmt.jmp_target) == 0)
+    throw SEMANTIC_ERROR(_F("undefined jump target '%s' in bpf code",
+                            stmt.jmp_target.c_str()), stmt.tok);
+
+  if (op_jmp)
+    this_prog.mk_jmp(this_ins, label_map[stmt.jmp_target]);
+  else if (op_jcond)
+    {
+      if (label_map.count(stmt.fallthrough) == 0)
+        throw SEMANTIC_ERROR(_F("BUG: undefined fallthrough target '%s'",
+                                stmt.fallthrough.c_str()), stmt.tok);
+      block *target = label_map[stmt.jmp_target];
+      block *fallthrough = label_map[stmt.fallthrough];
+      this_prog.mk_jcond(this_ins, c, v_dest, v_src1, target, fallthrough);
+    }
+  else // regular opcode
+    {
+      insn *i = this_ins.new_insn();
+      i->code = stmt.code;
+      i->dest = (r_dest ? v_dest : NULL);
+      i->src0 = (r_src0 ? v_dest : NULL);
+      i->src1 = v_src1;
+      i->off = stmt.off;
+    }
+}
 
-      if (code > 0xff && code != BPF_LD_MAP)
-	throw SEMANTIC_ERROR (_("invalid bpf code"), s->tok);
+void
+bpf_unparser::visit_embeddedcode (embeddedcode *s)
+{
+  std::vector<asm_stmt> statements; // all parsed statements, in source order
+  asm_stmt stmt;
 
-      bool r_dest = false, r_src0 = false, r_src1 = false, i_src1 = false;
-      switch (BPF_CLASS (code))
-	{
-	case BPF_LDX:
-	  r_dest = r_src1 = true;
-	  break;
-	case BPF_STX:
-	  r_src0 = r_src1 = true;
-	  break;
-	case BPF_ST:
-	  r_src0 = i_src1 = true;
-	  break;
+  size_t pos = 0;
+  while ((pos = parse_asm_stmt(s, pos, stmt)) != std::string::npos)
+    {
+      statements.push_back(stmt);
+    }
 
-	case BPF_ALU:
-	case BPF_ALU64:
-	  r_dest = true;
-	  if (code & BPF_X)
-	    r_src1 = true;
-	  else
-	    i_src1 = true;
-	  switch (BPF_OP (code))
-	    {
-	    case BPF_NEG:
-	    case BPF_MOV:
-	      break;
-	    case BPF_END:
-	      /* X/K bit repurposed as LE/BE.  */
-	      i_src1 = false, r_src1 = true;
-	      break;
-	    default:
-	      r_src0 = true;
-	    }
-	  break;
+  // first pass: map each label (and each implicit fallthrough) to a block
+  std::map<std::string, block *> label_map;
+  block *entry_block = this_ins.b;
+  label_map[";;entry"] = entry_block;
 
-	case BPF_JMP:
-	  switch (BPF_OP (code))
-	    {
-	    case BPF_EXIT:
-	      break;
-	    case BPF_CALL:
-	      i_src1 = true;
-	      break;
-	    default:
-	      throw SEMANTIC_ERROR (_("invalid branch in bpf code"), s->tok);
-	    }
-	  break;
+  bool after_label = true; // no opcode seen yet in the current block
+  asm_stmt *after_jump = NULL; // previous stmt if it has_fallthrough, else NULL
+  unsigned fallthrough_count = 0;
+  for (std::vector<asm_stmt>::iterator it = statements.begin();
+       it != statements.end(); it++)
+    {
+      stmt = *it;
 
-	default:
-          if (code == BPF_LD_MAP)
-            r_dest = true, i_src1 = true;
-          else
-	    throw SEMANTIC_ERROR (_("unknown opcode in bpf code"), s->tok);
-	}
+      if (after_jump != NULL && stmt.kind == "label")
+        {
+          after_jump->fallthrough = stmt.dest;
+        }
+      else if (after_jump != NULL)
+        {
+          block *b = this_prog.new_block();
 
-      std::string dest(dest_b);
-      value *v_dest = NULL;
-      if (r_dest || r_src0)
-	v_dest = parse_reg(dest, s);
-      else if (dest != "0")
-	throw SEMANTIC_ERROR (_("invalid register field in bpf code"), s->tok);
-
-      std::string src1(src1_b);
-      value *v_src1 = NULL;
-      if (r_src1)
-	v_src1 = parse_reg(src1, s);
-      else
-	{
-	  if (src1 != "0")
-	    throw SEMANTIC_ERROR (_("invalid register field in bpf code"), s->tok);
-	  if (i_src1)
-	    v_src1 = this_prog.new_imm(imm);
-	  else if (imm != 0)
-	    throw SEMANTIC_ERROR (_("invalid immediate field in bpf code"), s->tok);
-	}
+          // generate unique label for fallthrough edge
+          std::ostringstream oss;
+          oss << "fallthrough;;" << fallthrough_count++;
+          std::string fallthrough_label = oss.str();
+          // XXX: semicolons prevent collision with programmer-defined labels
 
-      if (off != (int16_t)off)
-	throw SEMANTIC_ERROR (_("offset field out of range in bpf code"), s->tok);
+          label_map[fallthrough_label] = b;
+          set_block(b);
 
-      insn *i = this_ins.new_insn();
-      i->code = code;
-      i->dest = (r_dest ? v_dest : NULL);
-      i->src0 = (r_src0 ? v_dest : NULL);
-      i->src1 = v_src1;
-      i->off = off;
+          after_jump->fallthrough = fallthrough_label;
+        }
 
-      ii >> s1;
-      if (ii.eof())
-	break;
-      if (s1 != ';')
-	throw SEMANTIC_ERROR (_("invalid bpf embeddedcode syntax"), s->tok);
+      if (stmt.kind == "label" && after_label)
+        {
+          // avoid creating multiple blocks for consecutive labels
+          label_map[stmt.dest] = this_ins.b;
+          after_jump = NULL;
+        }
+      else if (stmt.kind == "label")
+        {
+          block *b = this_prog.new_block();
+          label_map[stmt.dest] = b;
+          set_block(b);
+          after_label = true;
+          after_jump = NULL;
+        }
+      else if (stmt.has_fallthrough)
+        {
+          after_label = false;
+          after_jump = &*it; // be sure to refer to original, not copied stmt
+        }
+      else
+        {
+          after_label = false;
+          after_jump = NULL;
+        }
+    }
+  if (after_jump != NULL) // TODO: should just fall through to exit
+    throw SEMANTIC_ERROR (_("BUG: bpf embeddedcode doesn't support "
+                            "fallthrough on final asm_stmt"), stmt.tok);
+
+  // second pass: emit each statement into the block assigned above
+  bool jumped_already = true;
+  set_block(entry_block);
+  for (std::vector<asm_stmt>::iterator it = statements.begin();
+       it != statements.end(); it++)
+    {
+      stmt = *it;
+      std::cerr << "DEBUG processing " << stmt << std::endl; // TODO
+      if (stmt.kind == "label")
+        {
+          // TODO: be sure there's no gap in the edge
+          if (!jumped_already)
+            emit_jmp (label_map[stmt.dest]);
+          set_block(label_map[stmt.dest]);
+        }
+      else if (stmt.kind == "opcode")
+        {
+          emit_asm_opcode (stmt, label_map);
+        }
+      else
+        throw SEMANTIC_ERROR (_F("BUG: bpf embeddedcode contains unexpected "
+                                 "asm_stmt kind '%s'", stmt.kind.c_str()),
+                              stmt.tok);
+      jumped_already = stmt.has_fallthrough;
+      if (stmt.has_fallthrough)
+        set_block(label_map[stmt.fallthrough]);
     }
 }
 
@@ -1016,8 +1461,13 @@ bpf_unparser::visit_delete_statement (delete_statement *s)
 }
 
 // Translate string escape characters.
+// Accepts strings produced by parse.cxx lexer::scan and
+// by the eBPF embedded-code assembler.
+//
+// PR23559: This is currently an eBPF-only version of the function
+// that does not translate octal escapes.
 std::string
-translate_escapes (interned_string &str)
+translate_escapes (const interned_string &str)
 {
   std::string result;
   bool saw_esc = false;
@@ -1045,16 +1495,21 @@ translate_escapes (interned_string &str)
   return result;
 }
 
+value *
+bpf_unparser::emit_literal_string (const std::string &str, const token *tok)
+{
+  // Reject strings whose size plus one extra byte exceeds BPF_MAXSTRINGLEN.
+  if (str.size() + 1 > BPF_MAXSTRINGLEN)
+    throw SEMANTIC_ERROR(_("string literal too long"), tok);
+  return this_prog.new_str(str); // will be lowered to a pointer by bpf-opt.cxx
+}
+
 void
 bpf_unparser::visit_literal_string (literal_string* e)
 {
   interned_string v = e->value;
   std::string str = translate_escapes(v);
-
-  size_t str_bytes = str.size() + 1;
-  if (str_bytes > BPF_MAXSTRINGLEN)
-    throw SEMANTIC_ERROR(_("String literal too long"), e->tok);
-  result = this_prog.new_str(str); // will be lowered to a pointer by bpf-opt.cxx
+  result = emit_literal_string(str, e->tok); // length check and new_str live in the helper
 }
 
 void
@@ -1783,7 +2238,7 @@ bpf_unparser::visit_target_register (target_register* e)
 // ??? Could use 8-byte chunks if we're starved for instruction count.
 // ??? Endianness of the target comes into play here.
 value *
-emit_literal_str(program &this_prog, insn_inserter &this_ins,
+emit_simple_literal_str(program &this_prog, insn_inserter &this_ins,
                  value *dest, int ofs, std::string &src, bool zero_pad)
 {
   size_t str_bytes = src.size() + 1;
@@ -1835,15 +2290,15 @@ emit_literal_str(program &this_prog, insn_inserter &this_ins,
 // ??? Could use 8-byte chunks if we're starved for instruction count.
 // ??? Endianness of the target may come into play here.
 value *
-bpf_unparser::emit_copied_str(value *dest, int ofs, value *src, bool zero_pad)
+bpf_unparser::emit_string_copy(value *dest, int ofs, value *src, bool zero_pad)
 {
   if (src->is_str())
     {
       /* If src is a string literal, its exact length is known and
          we can emit simpler, unconditional string copying code. */
       std::string str = src->str();
-      return emit_literal_str(this_prog, this_ins,
-                              dest, ofs, str, zero_pad);
+      return emit_simple_literal_str(this_prog, this_ins,
+                                     dest, ofs, str, zero_pad);
     }
 
   size_t str_bytes = BPF_MAXSTRINGLEN;
@@ -1931,7 +2386,7 @@ bpf_unparser::emit_copied_str(value *dest, int ofs, value *src, bool zero_pad)
     }
 
   // XXX: Zero-padding is only used under specific circumstances;
-  // see the corresponding comment in emit_literal_str().
+  // see the corresponding comment in emit_simple_literal_str().
   if (zero_pad)
     {
       for (unsigned i = 0; i < str_words; ++i)
@@ -1977,7 +2432,7 @@ void
 bpf_unparser::emit_str_arg(value *arg, int ofs, value *str)
 {
   value *frame = this_prog.lookup_reg(BPF_REG_10);
-  value *out = emit_copied_str(frame, ofs, str, true /* zero pad */);
+  value *out = emit_string_copy(frame, ofs, str, true /* zero pad */);
   emit_mov(arg, out);
 }
 
diff --git a/parse.h b/parse.h
index 42b0bc5fd..96aef0394 100644
--- a/parse.h
+++ b/parse.h
@@ -65,11 +65,25 @@ struct token
   token_junk_type junk_type;
 
   std::string junk_message(systemtap_session& session) const;
+
+  // Allocates a new token with this token's content, chain, type and
+  // junk_type, but located at adjusted_loc (e.g. inside embedded code).
+  token *adjust_location(const source_loc &adjusted_loc) const
+  { // TODO split from header
+    token *relocated = new token;
+    relocated->content = this->content;
+    relocated->chain = this->chain;
+    relocated->type = this->type;
+    relocated->junk_type = this->junk_type;
+    relocated->location = adjusted_loc;
+    return relocated;
+  }
   
   friend class parser;
   friend class lexer;
 private:
   void make_junk (token_junk_type);
+
   token(): chain(0), type(tok_junk), junk_type(tok_junk_unknown) {}
   token(const token& other):
     location(other.location), content(other.content),
-- 
2.14.5