/*
Copyright (c) 2013. The YARA Authors. All Rights Reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Lexical analyzer for regular expressions */

%{

/* Disable warnings for unused functions in this file.

As we redefine YY_FATAL_ERROR macro to use our own function re_yyfatal, the
yy_fatal_error function generated by Flex is not actually used, causing a
compiler warning. Flex doesn't offer any options to remove the yy_fatal_error
function. When they include something like %option noyy_fatal_error as they do
with noyywrap then we can remove this pragma.
*/

#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wunused-function"
#endif

#include <assert.h>
#include <setjmp.h>
#include <stdlib.h>

#include <yara/globals.h>
#include <yara/utils.h>
#include <yara/error.h>
#include <yara/limits.h>
#include <yara/mem.h>
#include <yara/re.h>
#include <yara/re_lexer.h>
#include <yara/threading.h>
#include <yara/strutils.h>


#ifdef _WIN32
#define snprintf _snprintf
#endif

// Bitmap with 1 bit for each of the 256 characters in the ASCII table. The bit
// is set to 1 if the corresponding character is alphanumeric or 0 if otherwise.
static uint8_t word_chars[] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
    0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

// Bitmap with 1 bit for each of the 256 characters in the ASCII table. The bit
// is set to 1 if the corresponding character is considered a space. Space
// characters include horizontal and vertical tabs, carriage return, new line
// and form feed (\t, \v, \r, \n, \f).
static uint8_t space_chars[] = {
    0x00, 0x3E, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

int escaped_char_value(
    char* text,
    uint8_t* value);

int read_escaped_char(
    yyscan_t yyscanner,
    uint8_t* escaped_char);

%}

%option reentrant bison-bridge
%option noyywrap
%option nounistd
%option nounput
%option never-interactive
%option yylineno
%option prefix="re_yy"

%option outfile="lex.yy.c"

%option verbose
%option warn

%x char_class

digit         [0-9]
hex_digit     [0-9a-fA-F]

%%

\{{digit}*,{digit}*\} {

  // Examples: {3,8} {0,5} {,5} {7,}

  int hi_bound;
  int lo_bound = atoi(yytext + 1);

  char* comma = strchr(yytext, ',');

  if (comma - yytext == strlen(yytext) - 2)
    // if comma is followed by the closing curly bracket
    // (example: {2,}) set high bound value to maximum.
    hi_bound = RE_MAX_RANGE;
  else
    hi_bound = atoi(comma + 1);

  if (hi_bound > RE_MAX_RANGE)
  {
    yyerror(yyscanner, lex_env, "repeat interval too large");
    yyterminate();
  }

  if (hi_bound < lo_bound || hi_bound < 0 || lo_bound < 0)
  {
    yyerror(yyscanner, lex_env, "bad repeat interval");
    yyterminate();
  }

  if (hi_bound == 0 && lo_bound == 0)
  {
    yyerror(yyscanner, lex_env, "bad repeat interval");
    yyterminate();
  }

  yylval->range = (hi_bound << 16) | lo_bound;

  return _RANGE_;
}


\{{digit}+\} {

  // Example: {10}

  int value = atoi(yytext + 1);

  // atoi can return a negative value if the input string represents a number
  // too large to fit in an integer.

  if (value > RE_MAX_RANGE || value < 0)
  {
    yyerror(yyscanner, lex_env, "repeat interval too large");
    yyterminate();
  }

  if (value == 0)
  {
    yyerror(yyscanner, lex_env, "bad repeat interval");
    yyterminate();
  }

  yylval->range = (value << 16) | value;

  return _RANGE_;
}


\[\^ {

  // Start of a negated character class. Example: [^abcd]

  BEGIN(char_class);
  memset(LEX_ENV->re_class.bitmap, 0, 32);
  LEX_ENV->re_class.negated = true;
}

\[\^\] {

  // Start of character negated class containing a ].
  // Example: [^]abc] this must be interpreted as a class
  // not matching ], a, b, nor c

  BEGIN(char_class);
  memset(LEX_ENV->re_class.bitmap, 0, 32);
  LEX_ENV->re_class.negated = true;
  LEX_ENV->re_class.bitmap[']' / 8] |= 1 << ']' % 8;
}


\[\] {

  // Start of character class containing a ].
  // Example: []abc] this must be interpreted as a class
  // matching ], a, b, or c.

  BEGIN(char_class);
  memset(LEX_ENV->re_class.bitmap, 0, 32);
  LEX_ENV->re_class.negated = false;
  LEX_ENV->re_class.bitmap[']' / 8] |= 1 << ']' % 8;
}


\[ {

  // Start of character class. Example: [abcd]

  BEGIN(char_class);
  memset(LEX_ENV->re_class.bitmap, 0, 32);
  LEX_ENV->re_class.negated = false;
}


[^\\\[\(\)\|\$\.\^\+\*\?] {

  // Any non-special character is passed as a CHAR token to the scanner.

  yylval->integer = yytext[0];
  return _CHAR_;
}


\\w {
  return _WORD_CHAR_;
}


\\W {
  return _NON_WORD_CHAR_;
}


\\s {
  return _SPACE_;
}


\\S {
  return _NON_SPACE_;
}


\\d {
  return _DIGIT_;
}


\\D {
  return _NON_DIGIT_;
}


\\b {
  return _WORD_BOUNDARY_;
}

\\B {
  return _NON_WORD_BOUNDARY_;
}


\\{digit}+ {

  yyerror(yyscanner, lex_env, "backreferences are not allowed");
  yyterminate();
}


\\ {

  uint8_t c;

  if (read_escaped_char(yyscanner, &c))
  {
    yylval->integer = c;
    return _CHAR_;
  }
  else
  {
    yyerror(yyscanner, lex_env, "illegal escape sequence");
    yyterminate();
  }
}


<char_class>\] {

  // End of character class.
  yylval->re_class = (RE_CLASS*) yr_malloc(sizeof(RE_CLASS));
  memcpy(yylval->re_class->bitmap, LEX_ENV->re_class.bitmap, 32);

  yylval->re_class->negated = LEX_ENV->re_class.negated;

  BEGIN(INITIAL);
  return _CLASS_;
}



<char_class>(\\x{hex_digit}{2}|\\.|[^]\\])-[^]] {

  // A range inside a character class. The regexp is...
  //
  //   ( \x{hex_digit}{2}    Hex digit (i.e: \x01) ...
  //   | \.                  ...or any escaped character (i.e. \\, \-) ...
  //   | [^]\]               ...or any character except ] and \ ...
  //   )
  //   -                     ... followed by -
  //   [^]]                  ... followed by any character except ]
  //
  // Some examples:
  //
  //  [abc0-9]
  //      ^-^ matching range 0-9
  //
  //  [a-za-]
  //   ^-^- matching range a-z
  //
  //  [\.-a]
  //   ^--^- matching range \.-a
  //

  uint16_t c;
  uint8_t start = yytext[0];
  uint8_t end = yytext[2];

  if (start == '\\')
  {
    if (!escaped_char_value(yytext, &start))
    {
      yyerror(yyscanner, lex_env, "illegal escape sequence");
      yyterminate();
    }

    if (yytext[1] == 'x')
      end = yytext[5];
    else
      end = yytext[3];
  }

  if (end == '\\')
  {
    if (!read_escaped_char(yyscanner, &end))
    {
      yyerror(yyscanner, lex_env, "illegal escape sequence");
      yyterminate();
    }
  }

  if (end < start)
  {
    yyerror(yyscanner, lex_env, "bad character range");
    yyterminate();
  }

  for (c = start; c <= end; c++)
  {
    LEX_ENV->re_class.bitmap[c / 8] |= 1 << c % 8;
  }
}


<char_class>\\w {

  for (int i = 0; i < 32; i++)
    LEX_ENV->re_class.bitmap[i] |= word_chars[i];
}


<char_class>\\W {

  for (int i = 0; i < 32; i++)
    LEX_ENV->re_class.bitmap[i] |= ~word_chars[i];
}


<char_class>\\s {

  for (int i = 0; i < 32; i++)
    LEX_ENV->re_class.bitmap[i] |= space_chars[i];
}


<char_class>\\S {

  for (int i = 0; i < 32; i++)
    LEX_ENV->re_class.bitmap[i] |= ~space_chars[i];
}


<char_class>\\d {

  for (char c = '0'; c <= '9'; c++)
    LEX_ENV->re_class.bitmap[c / 8] |= 1 << c % 8;
}


<char_class>\\D {

  for (int i = 0; i < 32; i++)
  {
    // digits 0-7 are in the sixth byte of the vector, let that byte alone
    if (i == 6)
      continue;

    // digits 8 and 9 are the lowest two bits in the seventh byte of the
    // vector, let those bits alone.
    if (i == 7)
      LEX_ENV->re_class.bitmap[i] |= 0xFC;
    else
      LEX_ENV->re_class.bitmap[i] = 0xFF;
  }
}


<char_class>\\ {

  uint8_t c;

  if (read_escaped_char(yyscanner, &c))
  {
    LEX_ENV->re_class.bitmap[c / 8] |= 1 << c % 8;
  }
  else
  {
    yyerror(yyscanner, lex_env, "illegal escape sequence");
    yyterminate();
  }
}


<char_class>. {

  if (yytext[0] >= 32 && yytext[0] < 127)
  {
    // A character class (i.e: [0-9a-f]) is represented by a 256-bits vector,
    // here we set to 1 the vector's bit corresponding to the input character.

    LEX_ENV->re_class.bitmap[yytext[0] / 8] |= 1 << yytext[0] % 8;
  }
  else
  {
    yyerror(yyscanner, lex_env, "non-ascii character");
    yyterminate();
  }
}


<char_class><<EOF>> {

  // End of regexp reached while scanning a character class.

  yyerror(yyscanner, lex_env, "missing terminating ] for character class");
  yyterminate();
}


. {

  if (yytext[0] >= 32 && yytext[0] < 127)
  {
    return yytext[0];
  }
  else
  {
    yyerror(yyscanner, lex_env, "non-ascii character");
    yyterminate();
  }
}


<<EOF>> {

  yyterminate();
}

%%

int escaped_char_value(
    char* text,
    uint8_t* value)
{
  unsigned int hex_value;
  char hex[3];

  assert(text[0] == '\\');

  switch(text[1])
  {
  case 'x':
    if (!isxdigit(text[2]) || !isxdigit(text[3]))
      return 0;
    hex[0] = text[2];
    hex[1] = text[3];
    hex[2] = '\0';
    sscanf(hex, "%x", &hex_value);
    *value = (uint8_t) hex_value;
    break;

  case 'n':
    *value = '\n';
    break;

  case 't':
    *value = '\t';
    break;

  case 'r':
    *value = '\r';
    break;

  case 'f':
    *value = '\f';
    break;

  case 'a':
    *value = '\a';
    break;

  default:
    *value = text[1];
  }

  return 1;
}


#ifdef __cplusplus
#define RE_YY_INPUT yyinput
#else
#define RE_YY_INPUT input
#endif


int read_escaped_char(
    yyscan_t yyscanner,
    uint8_t* escaped_char)
{
  char text[4] = {0, 0, 0, 0};

  text[0] = '\\';
  text[1] = RE_YY_INPUT(yyscanner);

  if (text[1] == EOF || text[1] == 0)
    return 0;

  if (text[1] == 'x')
  {
    text[2] = RE_YY_INPUT(yyscanner);

    if (text[2] == EOF || text[2] == 0)
      return 0;

    text[3] = RE_YY_INPUT(yyscanner);

    if (text[3] == EOF || text[3] == 0)
      return 0;
  }

  return escaped_char_value(text, escaped_char);
}


//
// yyfatal (actually named re_yyfatal because of the '%option prefix="re_yy"'
// directive) is called when a fatal error occurs in the parser. When this
// happens we are deep inside the parsing logic generated by flex/bison and
// the only way to exit gracefully from there is using setjmp/longjmp.
//
void yyfatal(
    yyscan_t yyscanner,
    const char *error_message)
{
  jmp_buf* recovery_trampoline = (jmp_buf*) yr_thread_storage_get_value(
      &yr_yyfatal_trampoline_tls);

  longjmp(*recovery_trampoline, 1);
}


void yyerror(
    yyscan_t yyscanner,
    RE_LEX_ENVIRONMENT* lex_env,
    const char *error_message)
{
  // if lex_env->last_error was set to some error code before
  // don't overwrite it, we are interested in the first error, not in
  // subsequent errors like "syntax error, unexpected $end" caused by
  // early parser termination.

  if (lex_env->last_error == ERROR_SUCCESS)
  {
    lex_env->last_error = ERROR_INVALID_REGULAR_EXPRESSION;

    strlcpy(
        lex_env->last_error_message,
        error_message,
        sizeof(lex_env->last_error_message));
  }
}


int yr_parse_re_string(
  const char* re_string,
  RE_AST** re_ast,
  RE_ERROR* error)
{
  yyscan_t yyscanner;
  jmp_buf recovery_trampoline;
  RE_LEX_ENVIRONMENT lex_env;

  lex_env.last_error = ERROR_SUCCESS;
  lex_env.last_error_message[0] = '\0';

  yr_thread_storage_set_value(
      &yr_yyfatal_trampoline_tls,
      &recovery_trampoline);

  // setjmp returns a non-zero value only when we are returning to this
  // point via a call to longjmp to the recovery trampoline.
  if (setjmp(recovery_trampoline) != 0)
    return ERROR_INTERNAL_FATAL_ERROR;

  FAIL_ON_ERROR(yr_re_ast_create(re_ast));

  if (yylex_init(&yyscanner) != 0)
  {
    yr_re_ast_destroy(*re_ast);
    *re_ast = NULL;
    return ERROR_INSUFFICIENT_MEMORY;
  }

  yyset_extra(*re_ast, yyscanner);
  yy_scan_string(re_string, yyscanner);
  yyparse(yyscanner, &lex_env);
  yylex_destroy(yyscanner);

  if (lex_env.last_error != ERROR_SUCCESS)
  {
    yr_re_ast_destroy(*re_ast);
    *re_ast = NULL;

    strlcpy(
        error->message,
        lex_env.last_error_message,
        sizeof(error->message));

    return lex_env.last_error;
  }

  return ERROR_SUCCESS;
}
