/* lexer.l -- Lexical analyser of argile base syntax */
/*
 *   Argile programming language compiler
 *
 *   Copyright (C) 2009 the Argile authors
 *
 *   This file is part of ARC Argile compiler.
 *
 *   Argile is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   Argile is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Argile.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 ******************************************************************************
 *                                                                            *
 *                             S C A N N E R                                  *
 *                                                                            *
 ******************************************************************************
 */

%{
#include <string.h>

#include "argile.h"
#include "parser.h"

/* Ask flex not to emit unput() and input(); no rule in this file calls them. */
#define YY_NO_UNPUT 1
#define YY_NO_INPUT 1

/* Replace the default reader: buffer refills go through argile_yyinput(). */
#undef  YY_INPUT
#define YY_INPUT(buf, res, len)  (res = argile_yyinput (buf, len))

/* The location and semantic-value variables carry the "argile_" prefix. */
#define yylloc argile_yylloc
#define yylval argile_yylval
  
/* Hook run before the action of every matched rule: _input() updates the
   source position, the line buffer and the column-0 indentation state. */
#define YY_USER_ACTION   _input(yytext, yyleng, &yylloc);


  /* Helpers defined in the user-code section at the end of this file. */
  static void*		_memdup   (void *ptr, int len);
  static char*		_yy_dup   ();
  static void		_input    (char *str, int len, YYLTYPE *_yylloc);
  static void		_unescape ();
  static int		_push_pop (int token);
  /* _cons_str collects the bytes of the string constant being scanned;
     _cmt_buf collects the text of the comment being captured. */
  static argile_buff_t	_cons_str, _cmt_buf;
  /* Nesting depth of the comment being captured; 0 when no capture is active. */
  static int            _cmt_nest = 0;
%}

/* yytext is a char pointer (not an array); the start-condition stack is
   enabled so that rules can call yy_push_state() / yy_pop_state(). */
%pointer
%option stack
%option never-interactive
%option noyywrap nounput

/*
 * Start conditions.
 *   Exclusive (%x): only rules explicitly tagged with the state are active.
 *     SC_STR  inside a "..." string constant
 *     SC_CMT  inside a (: ... :) comment
 *   Inclusive (%s): untagged rules stay active as well.
 *     SC_SYN  inside a syntax constant (also entered for "(" options)
 *     SC_PAR  inside a < ... > syntax parameter
 *     SC_BND  after "..." inside a list, up to the closing "]"
 *     SC_ENM  inside a { ... } enumeration
 *     SC_LST  inside a [ ... ] list
 */
%x SC_STR SC_CMT
%s SC_SYN SC_PAR SC_BND SC_ENM SC_LST

/* Named patterns used by the rules section. */
/* Line structure and blanks.  MULTILINE is a backslash, a newline and the
   blanks that start the following line. */
INDENT		^[ \t]+
NEWLINE		\r\n|\n
EMPTYLINE	^[ \t]*{NEWLINE}
MULTILINE	\\{NEWLINE}[ \t]*
BLANK		[ \t]+|[\r\n]
/* String constant delimiters: the same double quote opens and closes. */
STR_BEG		\"
STR_END		\"
/* Numeric constants; an optional leading '-' is part of the literal. */
CONS_HEX	-?0[xX][0-9a-fA-F]+
CONS_DEC	-?[0-9]+
CONS_OCT	-?0[0-7]+
CONS_BIN	-?0[bB][01]+
CONS_REAL	-?[0-9]+\.[0-9]+([eE][-+]?[0-9]+)?
/* Words: an ASCII letter, '_' or any byte >= 0x80, followed by the same set
   plus digits.  WORD refers to {UTF8}, which is defined on the next line. */
WORD		([a-zA-Z_]|{UTF8})([0-9a-zA-Z_]|{UTF8})*
UTF8		[\x80-\xff]
 // Disabled alternative: stricter multi-byte UTF-8 sequences.
 //UTF8		{UTF8_1}|{UTF8_2}|{UTF8_3}|{UTF8_4}|{UTF8_5}
 //UTF8_1		[\xc0-\xdf]{UTF8_VALUE}{1}
 //UTF8_2		[\xe0-\xef]{UTF8_VALUE}{2}
 //UTF8_3		[\xf0-\xf7]{UTF8_VALUE}{3}
 //UTF8_4		[\xf8-\xfb]{UTF8_VALUE}{4}
 //UTF8_5		[\xfc-\xfd]{UTF8_VALUE}{5}
 //UTF8_VALUE	[\x80-\xbf]
 //UTF8_INVALID	[\xfe-\xff]
/* Single-character operators.  OPS is the set used in ordinary code.
   SYN_OPS, ENM_OPS and LST_OPS accept either a backslash followed by one of
   | \ < > [ ] .  or a plain operator character; ENM_OPS is SYN_OPS without
   '|' (ENM_SEP) and LST_OPS is SYN_OPS without ']' (LST_END). */
OPS		[!#$%&\'*+,\-./<=>?@[\\\]^`|~]
SYN_OPS		(\\[|\\<>[\].]|[!#$%&\'*+,\-./=>?@\]^`|~])
ENM_OPS		(\\[|\\<>[\].]|[!#$%&\'*+,\-./=>?@\]^`~])
LST_OPS		(\\[|\\<>[\].]|[!#$%&\'*+,\-./=>?@^`|~])
/* Delimiters of syntax constants, code blocks, sub-calls and calls.
   CODE_END also swallows the blanks that precede the closing brace. */
SYN_BEG		":"
SYN_END		":"
CODE_BEG	"{"
CODE_END	[ \t]*"}"
SUB_BEG		"("
SUB_END		")"
CALL_END	";"
/* Syntax parameter < ... >.  PAR_OPS accepts a backslash followed by one of
   \ < > =  or a plain character from a set that leaves out , < = > . */
PAR_BEG		"<"
PAR_OPS		(\\[\\<>=]|[!#$%&\'*+\-./?@[\\\]^`|~])
PAR_INIT	"="
PAR_SEP		","
PAR_END		">"
/* Option ( ... ), enumeration { a | b } and list [ ... ] delimiters inside a
   syntax constant.  OPT_* and ENM_BEG/ENM_END reuse the characters of
   SUB_* and CODE_*; the active start condition selects the rule. */
OPT_BEG		"("
OPT_END		")"
ENM_BEG		"{"
ENM_SEP		"|"
ENM_END		"}"
LST_BEG		"["
LST_SEP		"..."
LST_BND		","
LST_END		"]"
LST_CNT		[0-9]+
/* Comment delimiters; CMT_BEG also swallows the blanks before "(:". */
CMT_BEG		[ \t]*"(:"
CMT_END		":)"

%%

 /*
  * Blanks and indentation handling
  */
<INITIAL>{MULTILINE} {
  /* Line continuation: the backslash, the newline and the blanks that start
     the next line are consumed without setting argile.new_line. */
  /* ignore */
}

<INITIAL>{EMPTYLINE}+ {
  /* One or more lines holding only blanks: flag that the next match starts a
     line, queue T_CALL_END and return the next queued token. */
  argile.new_line = 1;
  return _push_pop (T_CALL_END);
}

<INITIAL>{INDENT} {
  /*
   * Blanks at the start of a line.  YY_USER_ACTION has already run _input()
   * on the match, so argile.loc.end.col is the column reached after the
   * blanks; it is used as the indentation level of the line and compared
   * with the top of the indentation stack.
   *
   * Stack markers seen here: -3 is pushed by the SUB_BEG rule, -1 by the
   * CODE_BEG rule.  In the first two branches nothing is returned and
   * scanning simply continues with the next match.
   *
   * NOTE(review): when a dedent stops at a column strictly between two
   * stacked levels (n > top after the unwinding loop), no error is raised;
   * "indentation mismatch" only fires when a negative marker is reached.
   * Confirm that this leniency is intended.
   */
  int n = argile.loc.end.col;
  int top = argile_top_indent ();
  if (top == -3) // multi-line subcall, ignore
    ;
  else if (top < 0) // first indentation in explicit block
    argile_push_indent (n);
  else // relative indentation
    {
      if (n == top) // same indentation: the previous call ends here
	return _push_pop (T_CALL_END);
      if (n > top) // deeper indentation: opens an implicit block
	{
	  argile_push_indent (n);
	  return _push_pop (T_CODE_BEG);
	}
      while (n < argile_top_indent ()) // shallower: close one block per level
	{
	  argile_pop_indent ();
	  argile_push_token (T_CODE_END);
	}
      top = argile_top_indent ();
      if (top < 0) // unwound down to a block or sub-call marker
	{
	  argile_die ("indentation mismatch (%d)", n);
	  yyterminate ();
	}
      if (n == top) // back on an existing level: also end the previous call
	argile_push_token (T_CALL_END);
      return argile_pop_token ();
    }
}

<INITIAL>{NEWLINE} {
  /* End of a non-empty line: remember that the next match starts a line
     (read by _input() to detect column-0 indentation). */
  argile.new_line = 1;
}

{BLANK} {
  /* Untagged rule: blanks are skipped in INITIAL and in every inclusive
     start condition. */
  /* ignore */
}

<INITIAL,SC_CMT,SC_SYN,SC_PAR,SC_ENM,SC_LST>{CMT_BEG} {
  /*
   * Comment opener "(:" with optional leading blanks.  Comments nest: every
   * opener pushes SC_CMT and every closer pops it.
   */
  if (_cmt_nest == 0 && yylloc.first_column == 0 && !strcmp(yytext, "(:")) {
    // opener at column 0 without leading blanks while nothing is being
    // captured: start capturing into an emptied buffer
    _cmt_nest = 1;
    argile_buff_set (&_cmt_buf, NULL, 0, 0);
  } else if (_cmt_nest) {
    // opener nested inside a captured comment: count it and keep its text
    _cmt_nest++;
    if (argile.get_comments)
      argile_buff_add (&_cmt_buf, yytext, yyleng);
  }
  // in the remaining case the comment is skipped without being captured
  yy_push_state (SC_CMT);
}

<SC_CMT>.|\n {
  /* Any character inside a comment: kept only while a capture is active and
     argile.get_comments is set. */
  if (_cmt_nest && argile.get_comments)
    argile_buff_add (&_cmt_buf, yytext, yyleng);
}

<SC_CMT>{CMT_END} {
  /* Comment closer ":)": leave one nesting level of SC_CMT. */
  yy_pop_state ();
  if (_cmt_nest == 1) {
    // closer of the outermost captured comment: NUL-terminate the text and
    // register it with the position where the comment ends
    _cmt_nest = 0;
    if (_cmt_buf.ptr) {
      argile_buff_putc (&_cmt_buf, 0);
      argile_add_comment (_cmt_buf.ptr, yylloc.last_line, yylloc.last_column);
    }
    // the buffer is detached without being freed
    // NOTE(review): presumably argile_add_comment() takes ownership - confirm
    argile_buff_set (&_cmt_buf, NULL, 0, 0);
  } else if (_cmt_nest) {
    // closer of a nested comment inside a capture: keep its text
    _cmt_nest--;
    if (argile.get_comments)
      argile_buff_add (&_cmt_buf, yytext, yyleng);
  }
}

 /*
  * Text constant
  */
<INITIAL,SC_PAR>{STR_BEG} {
  /* Opening quote: empty the accumulation buffer and switch to SC_STR, where
     only the string rules are active. */
  argile_buff_set (&_cons_str, NULL, 0, 0);
  yy_push_state (SC_STR);
}

<SC_STR>\\[0-7]{1,3} {
  /*
   * Octal escape: a backslash followed by one to three octal digits.  The
   * value is parsed from the digits after the backslash; three digits can
   * encode up to 0777 (511), so values above 255 are reported.
   *
   * The "%o" conversion stores through an unsigned int pointer, hence the
   * type of ch.  The pattern guarantees at least one digit, so the
   * conversion always succeeds; ch is still initialised defensively.
   */
  unsigned int ch = 0;

  sscanf (yytext + 1, "%o", &ch);
  if (ch > 255)
    argile_die ("numeric character value too high");
  argile_buff_putc (&_cons_str, ch);
}

<SC_STR>\\x[0-9a-fA-F]{2} {
  /*
   * Hexadecimal escape: "\x" followed by exactly two hex digits.  The value
   * is parsed from the digits after "\x".
   *
   * The "%x" conversion stores through an unsigned int pointer, hence the
   * type of ch.  Two hex digits cannot exceed 0xff, so no range check is
   * needed before storing the byte.
   */
  unsigned int ch = 0;

  sscanf (yytext + 2, "%x", &ch);
  argile_buff_putc (&_cons_str, ch);
}

 /*
  * Single-letter escapes: each one appends the matching control character
  * (or the escaped quote / backslash) to the string being built.
  */
<SC_STR>\\a {
  argile_buff_putc (&_cons_str, '\a');
}

<SC_STR>\\b {
  argile_buff_putc (&_cons_str, '\b');
}

<SC_STR>\\t {
  argile_buff_putc (&_cons_str, '\t');
}

<SC_STR>\\n {
  argile_buff_putc (&_cons_str, '\n');
}

<SC_STR>\\v {
  argile_buff_putc (&_cons_str, '\v');
}

<SC_STR>\\f {
  argile_buff_putc (&_cons_str, '\f');
}

<SC_STR>\\r {
  argile_buff_putc (&_cons_str, '\r');
}

<SC_STR>\\\" {
  argile_buff_putc (&_cons_str, '"');
}

<SC_STR>\\\\ {
  argile_buff_putc (&_cons_str, '\\');
}

<SC_STR>\\ {
  /* A backslash that starts none of the escapes above is kept literally;
     the character after it is then scanned as ordinary text. */
  argile_buff_putc (&_cons_str, '\\');
}

<SC_STR>[^\\\"]+ {
  /* Run of ordinary characters (anything but a backslash or a quote,
     newlines included): appended unchanged. */
  argile_buff_add (&_cons_str, yytext, yyleng);
}

<SC_STR>{STR_END} {
  /*
   * Closing quote: leave SC_STR and turn the accumulated bytes into a text
   * object built from (pointer, length), so the string is not handed over as
   * a NUL-terminated buffer.  The text object and its str member are both
   * registered with argile_add_leak(); the accumulation buffer is then freed
   * and reset before T_TEXT is queued.
   * NOTE(review): for an empty string constant _cons_str.ptr may still be
   * NULL here - confirm that argile_text_new() accepts that.
   */
  yy_pop_state ();
  yylval.text = argile_text_new (_cons_str.ptr, _cons_str.len);
  argile_add_leak (yylval.text);
  argile_add_leak (yylval.text->str);
  if (_cons_str.ptr)
    argile_dbg_free (_cons_str.ptr);
  argile_buff_set (&_cons_str, NULL, 0, 0);
  return _push_pop (T_TEXT);
}

 /*
  * Integer constant
  */
 /*
  * Each rule hands the parser a copy of the matched text (sign and prefix
  * included); no numeric conversion happens in the scanner.  The longest
  * match wins and ties go to the earlier rule, so the octal rule must stay
  * before the decimal one ("0755" matches both with the same length).
  */
<INITIAL,SC_PAR>{CONS_BIN} {
  yylval.string = _yy_dup ();
  return _push_pop (T_INT_BIN);
}

<INITIAL,SC_PAR>{CONS_OCT} {
  yylval.string = _yy_dup ();
  return _push_pop (T_INT_OCT);
}

<INITIAL,SC_PAR>{CONS_DEC} {
  yylval.string = _yy_dup ();
  return _push_pop (T_INT_DEC);
}

<INITIAL,SC_PAR>{CONS_HEX} {
  yylval.string = _yy_dup ();
  return _push_pop (T_INT_HEX);
}

 /*
  * Real constant
  */
<INITIAL,SC_PAR>{CONS_REAL} {
  yylval.string = _yy_dup ();
  return _push_pop (T_REAL);
}

 /*
  * Word
  */
<INITIAL,SC_SYN,SC_PAR,SC_ENM,SC_LST>{WORD} {
  /*
   * A word.  Without argile.op_ranges the matched text is copied and queued
   * as T_WORD.  With argile.op_ranges set, the match is handed to
   * argile_split_ops() together with the operator token kind that belongs to
   * the current start condition, and its result is returned directly.
   */
  int op_token;

  if (!argile.op_ranges)
    {
      yylval.string = _yy_dup ();
      return _push_pop (T_WORD);
    }

  if (YY_START == SC_PAR)
    op_token = T_PAR_OP;
  else if (YY_START == SC_SYN || YY_START == SC_ENM || YY_START == SC_LST)
    op_token = T_SYN_OP;
  else
    op_token = T_OP;

  return argile_split_ops (op_token, yytext, yyleng, &yylval.string);
}

 /*
  * Operator
  */
<INITIAL>{OPS} {
  /* Single operator character in ordinary code: copied as is and queued. */
  yylval.string = _yy_dup ();
  return _push_pop (T_OP);
}

<SC_PAR>{PAR_OPS} {
  /*
   * Operator inside a syntax parameter.  _unescape() strips the escaping
   * backslash in place before the text is copied; the token is returned
   * directly, without going through the token queue.
   * NOTE(review): PAR_OPS also matches a lone backslash through its
   * character class; _unescape() then removes it and the copied string is
   * empty - confirm whether that is intended.
   */
  _unescape ();
  yylval.string = _yy_dup ();
  return T_PAR_OP;
}

 /*
  * Subcall delimiters
  */
<INITIAL,SC_PAR>{SUB_BEG} {
  /* "(" opens a sub-call: push the -3 marker on the indentation stack (the
     INDENT rule and _input() ignore indentation while it is on top) and push
     INITIAL so that ordinary code rules apply inside the parentheses. */
  argile_push_indent (-3);
  yy_push_state (INITIAL);
  return _push_pop (T_SUB_BEG);
}

<INITIAL>{SUB_END} {
  /*
   * ")" closes a sub-call.  Indentation levels are popped until a negative
   * marker comes off the stack; that marker must be the -3 pushed by the
   * SUB_BEG rule.  When levels were popped, one T_CODE_END fewer than their
   * number is queued, then the start condition pushed by SUB_BEG is left and
   * T_SUB_END is queued.
   */
  int marker;
  int levels = 0;

  while ((marker = argile_pop_indent ()) >= 0)
    levels++;

  if (marker != -3)
    {
      argile_die ("unexpected end of subcall that was not open");
      yyterminate ();
    }

  for (; levels > 1; levels--)
    argile_push_token (T_CODE_END);

  yy_pop_state ();
  return _push_pop (T_SUB_END);
}

<INITIAL>{CALL_END} {
  /* Explicit ";" call terminator. */
  return _push_pop (T_CALL_END);
}

 /*
  * Syntax constant delimiters
  */
<INITIAL>{SYN_BEG} {
  /* ":" in ordinary code opens a syntax constant: enter SC_SYN. */
  yy_push_state (SC_SYN);
  return _push_pop (T_SYN_BEG);
}

<SC_SYN>{SYN_END} {
  /* ":" inside a syntax constant closes it; the token is returned directly,
     without going through the token queue. */
  yy_pop_state ();
  return T_SYN_END;
}

 /*
  * Code constant delimiters
  */
<INITIAL>{CODE_BEG} {
  /* "{" opens an explicit code block: push the -1 marker; the first
     indentation seen afterwards becomes the base level of the block. */
  argile_push_indent (-1);
  return _push_pop (T_CODE_BEG);
}

<INITIAL>{CODE_END} {
  /*
   * "}" (with optional leading blanks) closes an explicit code block.
   * Indentation levels are popped until a negative marker comes off the
   * stack; a marker of -2 is reported as a block that was never opened.
   * When levels were popped, one T_CODE_END fewer than their number is
   * queued, followed by the T_CODE_END of the block itself.
   */
  int marker;
  int levels = 0;

  while ((marker = argile_pop_indent ()) >= 0)
    levels++;

  if (marker == -2)
    {
      argile_die ("unexpected end of code block that was not open");
      yyterminate ();
    }

  for (; levels > 1; levels--)
    argile_push_token (T_CODE_END);

  return _push_pop (T_CODE_END);
}

 /*
  * Syntax parameter delimiters
  */
<SC_SYN,SC_ENM,SC_LST>{PAR_BEG} {
  /* "<" opens a syntax parameter: enter SC_PAR until the matching ">". */
  yy_push_state (SC_PAR);
  return T_PAR_BEG;
}

<SC_PAR>{PAR_INIT} {
  /* "=" inside a parameter. */
  return T_PAR_INIT;
}

<SC_PAR>{PAR_SEP} {
  /* "," inside a parameter. */
  return T_PAR_SEP;
}

<SC_PAR>{PAR_END} {
  /* ">" closes the parameter and restores the enclosing start condition. */
  yy_pop_state ();
  return T_PAR_END;
}

 /*
  * Syntax option delimiters
  */
<SC_SYN,SC_ENM,SC_LST>{OPT_BEG} {
  /* "(" inside a syntax constant opens an optional part; its content is
     scanned in a nested SC_SYN. */
  yy_push_state (SC_SYN);
  return T_OPT_BEG;
}

<SC_SYN>{OPT_END} {
  /* ")" is only recognised in SC_SYN; it leaves the current SC_SYN level. */
  yy_pop_state ();
  return T_OPT_END;
}

 /*
  * Syntax enumeration delimiters
  */
<SC_SYN,SC_ENM,SC_LST>{ENM_BEG} {
  /* "{" inside a syntax constant opens an enumeration: enter SC_ENM. */
  yy_push_state (SC_ENM);
  return T_ENM_BEG;
}

<SC_ENM>{ENM_SEP} {
  /* "|" separates the alternatives of the enumeration. */
  return T_ENM_SEP;
}

<SC_ENM>{ENM_END} {
  /* "}" closes the enumeration and restores the enclosing start condition. */
  yy_pop_state ();
  return T_ENM_END;
}

 /*
  * Syntax list delimiters
  */
<SC_SYN,SC_ENM,SC_LST>{LST_BEG} {
  /* "[" inside a syntax constant opens a list: enter SC_LST. */
  yy_push_state (SC_LST);
  return T_LST_BEG;
}

<SC_LST>{LST_SEP} {
  /* "..." switches to SC_BND, stacked on top of SC_LST, where the counts
     and the "," between them are scanned. */
  yy_push_state (SC_BND);
  return T_LST_SEP;
}

<SC_BND>{LST_CNT} {
  /* Unsigned count after "...": returned as a decimal integer constant. */
  yylval.string = _yy_dup ();
  return T_INT_DEC;
}

<SC_BND>{LST_BND} {
  /* "," between counts. */
  return T_LST_BND;
}

<SC_BND>{LST_END} {
  /* "]" after the counts: leave both SC_BND and the SC_LST beneath it. */
  yy_pop_state ();
  yy_pop_state ();
  return T_LST_END;
}

<SC_LST>{LST_END} {
  /* "]" of a list without "...": leave SC_LST. */
  yy_pop_state ();
  return T_LST_END;
}

 /*
  * Syntax operators
  */
 /*
  * The three rules differ only in the operator set allowed by the current
  * start condition.  The escaping backslash, if any, is stripped in place
  * before the text is copied, and T_SYN_OP is returned directly.
  */
<SC_SYN>{SYN_OPS} {
  _unescape ();
  yylval.string = _yy_dup ();
  return T_SYN_OP;
}

<SC_ENM>{ENM_OPS} {
  _unescape ();
  yylval.string = _yy_dup ();
  return T_SYN_OP;
}

<SC_LST>{LST_OPS} {
  _unescape ();
  yylval.string = _yy_dup ();
  return T_SYN_OP;
}

 /*
  * End of input
  */
<SC_STR><<EOF>> {
  /*
   * Input ended inside a string constant.  The partial string is released
   * and the buffer descriptor is reset right away, as the STR_END rule does,
   * so that _cons_str never keeps a pointer to freed memory.
   */
  if (_cons_str.ptr)
    argile_dbg_free (_cons_str.ptr);
  argile_buff_set (&_cons_str, NULL, 0, 0);
  argile_die ("unexpected end of input, string constant not terminated");
  yyterminate ();
}

<SC_CMT><<EOF>> {
  /*
   * Input ended inside a comment.  Any captured text is released and the
   * buffer descriptor is reset right away, as the CMT_END rule does, so that
   * _cmt_buf never keeps a pointer to freed memory; the capture depth is
   * cleared as well.
   */
  if (_cmt_buf.ptr)
    argile_dbg_free (_cmt_buf.ptr);
  argile_buff_set (&_cmt_buf, NULL, 0, 0);
  _cmt_nest = 0;
  argile_die ("unexpected end of input, comment not terminated");
  yyterminate ();
}

<SC_SYN,SC_PAR,SC_BND,SC_ENM,SC_LST><<EOF>> {
  /* Input ended inside a syntax constant or one of its nested parts. */
  argile_die ("unexpected end of input, syntax constant not terminated");
  yyterminate ();
}

<<EOF>> {
  /*
   * End of input in ordinary code.  The end is recorded in
   * argile.input_ended, then every indentation level on top of the stack is
   * popped; when levels were popped, one T_CODE_END fewer than their number
   * is queued.  If argile.indents is still set afterwards, a sub-call (-3 on
   * top) or a code block was left open and an error is reported; otherwise
   * T_END is queued.
   */
  int popped = 0;

  argile.input_ended = 1;

  while (argile_top_indent () >= 0)
    {
      argile_pop_indent ();
      popped++;
    }

  for (; popped > 1; popped--)
    argile_push_token (T_CODE_END);

  if (argile.indents)
    {
      if (argile_top_indent () != -3)
        argile_die ("unexpected end of input: a code block is still open");
      else
        argile_die ("unexpected end of input: a sub-call is still open");
      yyterminate ();
    }

  return _push_pop (T_END);
}

 /*
  * Default rule
  */
. {
  /*
   * Any character no other active rule accepts.  Printable ASCII (space to
   * '~') is reported as a character, everything else as a two-digit hex
   * byte value; scanning then stops.
   */
  unsigned char byte = (unsigned char) yytext[0];

  if (byte < ' ' || byte > '~')
    argile_die ("unexpected character 0x%.2x", byte);
  else
    argile_die ("unexpected character '%c'", yytext[0]);
  yyterminate ();
}

%%

/*
 * Bookkeeping run through YY_USER_ACTION before the action of every matched
 * rule.
 *
 *   str, len   the matched text and its length in bytes
 *   _yylloc    receives the first/last line and column of the match
 *
 * Step 1: if the previous match flagged the start of a line
 *         (argile.new_line) and this match begins with something other than
 *         a blank, a newline or '}', the line starts at column 0; the
 *         indentation stack and the token queue are updated accordingly
 *         (lines that do start with blanks are handled by the INDENT rule).
 * Step 2: the match start becomes the previous end position, then the text
 *         is walked character by character to advance the end position and
 *         to feed argile.linebuff.
 * Step 3: the resulting span is stored in *_yylloc.
 */
static void
_input (char *str, int len, YYLTYPE *_yylloc)
{
  int i, size, ch, warn, top;

  // if line begins by a non blank character (that is not '}')
  //   then it is a column 0 indentation
  if (argile.new_line && *str != '}'
      && *str != ' ' && *str != '\t'
      && *str != '\r' && *str != '\n')
    {
      top = argile_top_indent ();
      if (top == -3) // multiline subcall
	; // ignore
      else if (top < 0) // first indentation in explicit block
	argile_push_indent (0);
      else if (top > 0) // back indentation
	{
	  // close every level deeper than column 0, end the previous call,
	  // then push level 0
	  do
	    {
	      argile_pop_indent ();
	      argile_push_token (T_CODE_END);
	    } while (argile_top_indent () > 0);
	  argile_push_token (T_CALL_END);
	  argile_push_indent (0);
	}
      else // same indent
	argile_push_token (T_CALL_END);
    }
  // the flag only applies to the first match after a line break
  argile.new_line = 0;
  // this match starts where the previous one ended
  argile.loc.start.col = argile.loc.end.col;
  argile.loc.start.row = argile.loc.end.row;
  for (i = 0; i < len; i += size)
    {
      warn = 0;
      // byte length of the character at str + i: asked from the configured
      // encoding when there is one, a single byte otherwise
      if (argile.encoding)
        {
          size = argile.encoding->charsize (str + i, len - i);
	  if (size < 1) // not enough input bytes
	    {
	      // fall back to a single raw byte for this position
	      size = 1;
	      warn = 1;
	    }
	}
      else
        size = 1;
      argile_buff_add (argile.linebuff, str + i, size);
      // decode through the encoding unless the fallback above was taken
      if (argile.encoding && !warn)
        ch = argile.encoding->charat (str + i, len - i);
      else
        ch = str[i];
      switch (ch)
	{
	case '\t':
	  // a tab moves to the next multiple of argile.colpertab
	  argile.loc.end.col = (argile.loc.end.col / argile.colpertab + 1) *
	    argile.colpertab;
	  break;
	case '\n':
	  // new row: column restarts at 0 and the line buffer is flushed
	  argile.loc.end.row++;
	  argile.loc.end.col = 0;
	  argile_buff_flush (argile.linebuff);
	  break;
	default:
	  // any other character occupies one column
	  argile.loc.end.col++;
	  break;
	}
    }
  _yylloc->first_column = argile.loc.start.col;
  _yylloc->first_line = argile.loc.start.row;
  _yylloc->last_column = argile.loc.end.col;
  _yylloc->last_line = argile.loc.end.row;
}

/*
 * Return a newly allocated copy of the len bytes found at ptr.  The copy is
 * obtained from argile_dbg_malloc() and registered with argile_add_leak()
 * before being returned.
 */
static void *
_memdup (void *ptr, int len)
{
  void *copy = argile_dbg_malloc (len);

  memcpy (copy, ptr, len);
  argile_add_leak (copy);

  return copy;
}

/*
 * Return a copy of the current match: yyleng bytes of yytext plus the byte
 * that follows them (the terminating NUL).
 */
static char *
_yy_dup ()
{
  int size = yyleng + 1;

  return _memdup (yytext, size);
}

/*
 * Strip escaping backslashes from yytext in place.  Each backslash found is
 * removed by moving the rest of the match, terminating NUL included, one
 * byte to the left; the character that takes its place is kept literally
 * and not examined again.  yyleng is left unchanged.
 */
static void
_unescape ()
{
  char *cur = yytext;
  char *end = yytext + yyleng;

  while (cur < end)
    {
      if (*cur == '\\')
	memmove (cur, cur + 1, end - cur);
      cur++;
    }
}

/*
 * Queue token with argile_push_token(), then return whatever
 * argile_pop_token() hands back.
 */
static int
_push_pop (int token)
{
  int next;

  argile_push_token (token);
  next = argile_pop_token ();

  return next;
}

/*
 * Never meant to be called: it only references yy_top_state() so that the
 * compiler does not warn about it.
 */
void
_dummy ()
{
  (void) yy_top_state ();
}
