/*
* Tokenizer.cs
*
* This work is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* This work is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*
* As a special exception, the copyright holders of this library give
* you permission to link this library with independent modules to
* produce an executable, regardless of the license terms of these
* independent modules, and to copy and distribute the resulting
* executable under terms of your choice, provided that you also meet,
* for each linked independent module, the terms and conditions of the
* license of that module. An independent module is a module which is
* not derived from or based on this library. If you modify this
* library, you may extend this exception to your version of the
* library, but you are not obligated to do so. If you do not wish to
* do so, delete this exception statement from your version.
*
* Copyright (c) 2003 Per Cederberg. All rights reserved.
*/
using System.Collections;
using System.IO;
using System.Text;
using PerCederberg.Grammatica.Parser.RE;
namespace PerCederberg.Grammatica.Parser {
/**
 * A character stream tokenizer. This class groups the characters
 * read from a text reader into tokens ("words"). The grouping is
 * controlled by token patterns that contain either a fixed string
 * to search for, or a regular expression. If the characters read
 * do not match any of the token patterns, a parse exception is
 * thrown.
 *
 * @author Per Cederberg, <per at percederberg dot net>
 * @version 1.4
 */
public class Tokenizer {

    /**
     * The maximum number of characters requested from the input
     * reader in a single read operation.
     */
    private const int READ_BLOCK_SIZE = 4096;

    /**
     * The number of consumed characters that must have accumulated
     * at the start of the buffer before they are discarded.
     */
    private const int COMPACT_THRESHOLD = 1024;

    /**
     * The token list feature flag.
     */
    private bool useTokenList = false;

    /**
     * The string token matcher. This single matcher handles all
     * the string token patterns added to the tokenizer.
     */
    private readonly StringTokenMatcher stringMatcher =
        new StringTokenMatcher();

    /**
     * The list of all regular expression token matchers, in the
     * order they were added. Each element is a RegExpTokenMatcher
     * that tests a single regular expression.
     */
    private readonly ArrayList regexpMatchers = new ArrayList();

    /**
     * The input reader. When this is set to null, no further input
     * is available.
     */
    private TextReader input = null;

    /**
     * The buffer with the characters read so far. Characters are
     * appended in blocks, and the characters before the current
     * position are discarded from time to time when more input is
     * read.
     */
    private readonly StringBuilder buffer = new StringBuilder();

    /**
     * The current position in the buffer, i.e. the index of the
     * first character of the next token.
     */
    private int position = 0;

    /**
     * The line number of the character at the current position.
     */
    private int line = 1;

    /**
     * The column number of the character at the current position.
     */
    private int column = 1;

    /**
     * The end of buffer read flag. This flag is set if any token
     * matcher reached the end of the buffer while matching, which
     * means that more input must be read before the match result
     * can be trusted.
     */
    private bool endOfBuffer = false;

    /**
     * The previous token in the token list. Only maintained when
     * the token list feature is used.
     */
    private Token previousToken = null;

    /**
     * Creates a new tokenizer for the specified input reader.
     *
     * @param input the input reader to read from
     */
    public Tokenizer(TextReader input) {
        this.input = input;
    }

    /**
     * Checks if the token list feature is used. The token list
     * feature makes all tokens (including ignored tokens) link to
     * each other in a linked list. By default the token list
     * feature is not used.
     *
     * @return true if the token list feature is used, or
     *         false otherwise
     *
     * @see #SetUseTokenList
     * @see Token#getPreviousToken
     * @see Token#getNextToken
     *
     * @since 1.4
     */
    public bool GetUseTokenList() {
        return useTokenList;
    }

    /**
     * Sets the token list feature flag. The token list feature
     * makes all tokens (including ignored tokens) link to each
     * other in a linked list when active. By default the token
     * list feature is not used.
     *
     * @param useTokenList the token list feature flag
     *
     * @see #GetUseTokenList
     * @see Token#getPreviousToken
     * @see Token#getNextToken
     *
     * @since 1.4
     */
    public void SetUseTokenList(bool useTokenList) {
        this.useTokenList = useTokenList;
    }

    /**
     * Returns a description of the token pattern with the
     * specified id. The string patterns are searched first and
     * the regular expression patterns thereafter.
     *
     * @param id the token pattern id
     *
     * @return the short string form of the token pattern, or
     *         null if no pattern with that id has been added
     */
    public string GetPatternDescription(int id) {
        TokenPattern pattern = stringMatcher.GetPattern(id);

        if (pattern != null) {
            return pattern.ToShortString();
        }
        foreach (RegExpTokenMatcher re in regexpMatchers) {
            if (re.GetPattern().GetId() == id) {
                return re.GetPattern().ToShortString();
            }
        }
        return null;
    }

    /**
     * Returns the current line number. This number will be the
     * line number of the next token returned.
     *
     * @return the current line number
     */
    public int GetCurrentLine() {
        return line;
    }

    /**
     * Returns the current column number. This number will be the
     * column number of the next token returned.
     *
     * @return the current column number
     */
    public int GetCurrentColumn() {
        return column;
    }

    /**
     * Adds a new token pattern to the tokenizer. String patterns
     * are handed to the shared string matcher, while each regular
     * expression pattern gets a matcher of its own that is added
     * last in the list. When several patterns match equally long
     * strings, a string pattern is chosen before a regular
     * expression pattern, and an earlier regular expression
     * pattern before a later one.
     *
     * @param pattern the pattern to add
     *
     * @throws ParserCreationException if the pattern couldn't be
     *             added to the tokenizer
     */
    public void AddPattern(TokenPattern pattern) {
        switch (pattern.GetPatternType()) {
        case TokenPattern.PatternType.STRING:
            stringMatcher.AddPattern(pattern);
            break;
        case TokenPattern.PatternType.REGEXP:
            try {
                regexpMatchers.Add(new RegExpTokenMatcher(pattern));
            } catch (RegExpException e) {
                throw new ParserCreationException(
                    ParserCreationException.ErrorType.INVALID_TOKEN,
                    pattern.GetName(),
                    "regular expression contains error(s): " +
                    e.Message);
            }
            break;
        default:
            throw new ParserCreationException(
                ParserCreationException.ErrorType.INVALID_TOKEN,
                pattern.GetName(),
                "pattern type " + pattern.GetPatternType() +
                " is undefined");
        }
    }

    /**
     * Finds the next token on the stream. This method will return
     * null when end of file has been reached. It will throw a
     * parse exception if no token matched the input stream, or if
     * a token pattern with the error flag set matched. Any tokens
     * matching a token pattern with the ignore flag set will be
     * silently skipped and the next token will be returned. When
     * the token list feature is used, every token found (also the
     * skipped ones) is linked to the token found before it.
     *
     * @return the next token found, or
     *         null if end of file was encountered
     *
     * @throws ParseException if the input stream couldn't be read
     *             or parsed correctly
     */
    public Token Next() {
        Token token;

        while (true) {
            token = NextToken();
            if (token == null) {
                return null;
            }
            if (useTokenList) {
                token.SetPreviousToken(previousToken);
                previousToken = token;
            }
            if (token.GetPattern().IsError()) {
                throw new ParseException(
                    ParseException.ErrorType.INVALID_TOKEN,
                    token.GetPattern().GetErrorMessage(),
                    token.GetStartLine(),
                    token.GetStartColumn());
            }
            if (!token.GetPattern().IsIgnore()) {
                return token;
            }
        }
    }

    /**
     * Finds the next token on the stream, without any filtering
     * of error or ignored tokens. This method will return null
     * when end of file has been reached. It will throw a parse
     * exception if no token matched the input stream. In that case
     * the offending character is skipped before the exception is
     * thrown, so a later call continues with the character after
     * it.
     *
     * @return the next token found, or
     *         null if end of file was encountered
     *
     * @throws ParseException if the input stream couldn't be read
     *             or parsed correctly
     */
    private Token NextToken() {
        TokenMatcher m;
        Token token;
        ParseException e;
        int length;

        // Find the longest match, reading more input for as long
        // as some matcher ran into the end of the buffer
        do {
            if (endOfBuffer) {
                ReadInput();
                endOfBuffer = false;
            }
            m = FindMatch();
        } while (endOfBuffer && input != null);

        // Return token results
        if (m != null) {
            length = m.GetMatchedLength();

            // Extract only the matched range instead of copying
            // the whole buffer to a string first
            token = new Token(m.GetMatchedPattern(),
                              buffer.ToString(position, length),
                              line,
                              column);
            position += length;
            line = token.GetEndLine();
            column = token.GetEndColumn() + 1;
            return token;
        } else if (position >= buffer.Length) {
            return null;
        } else {
            e = new ParseException(
                ParseException.ErrorType.UNEXPECTED_CHAR,
                buffer[position].ToString(),
                line,
                column);

            // Step past the unexpected character
            if (buffer[position] == '\n') {
                line++;
                column = 1;
            } else {
                column++;
            }
            position++;
            throw e;
        }
    }

    /**
     * Reads characters from the input reader and appends them to
     * the buffer. This method is safe to call even though the end
     * of file has been reached. As a side effect, this method may
     * also remove the characters before the current position from
     * the buffer, resetting the position to zero.
     *
     * The input reader is closed and released once a read returns
     * no characters. A read returning fewer characters than
     * requested is not treated as end of file, since a reader may
     * return less than requested while more input is still to
     * come.
     *
     * @throws ParseException if an error was encountered while
     *             reading the input stream
     */
    private void ReadInput() {
        char[] chars;
        int length;

        // Check for end of file
        if (input == null) {
            return;
        }

        // Remove old characters from buffer
        if (position > COMPACT_THRESHOLD) {
            buffer.Remove(0, position);
            position = 0;
        }

        // Read characters
        chars = new char[READ_BLOCK_SIZE];
        try {
            length = input.Read(chars, 0, chars.Length);
        } catch (IOException e) {
            input = null;
            throw new ParseException(ParseException.ErrorType.IO,
                                     e.Message,
                                     -1,
                                     -1);
        }

        // Append characters to buffer, or detect end of file
        if (length > 0) {
            buffer.Append(chars, 0, length);
        } else {
            input.Close();
            input = null;
        }
    }

    /**
     * Finds the longest token match from the current buffer
     * position. The string matcher is tried first and then each
     * regular expression matcher in the order added. A later
     * matcher only replaces the best match so far if its match is
     * strictly longer. As a side effect, this method will set the
     * end of buffer flag if any matcher read to the end of the
     * buffer.
     *
     * @return the token matcher with the longest match, or
     *         null if no match was found
     */
    private TokenMatcher FindMatch() {
        string str = buffer.ToString();
        TokenMatcher bestMatch = null;
        int bestLength = 0;

        // Check string matches
        if (stringMatcher.MatchFrom(str, position)) {
            bestMatch = stringMatcher;
            bestLength = stringMatcher.GetMatchedLength();
        }
        if (stringMatcher.HasReadEndOfString()) {
            endOfBuffer = true;
        }

        // Check regular expression matches
        foreach (RegExpTokenMatcher re in regexpMatchers) {
            if (re.MatchFrom(str, position)
             && re.GetMatchedLength() > bestLength) {

                bestMatch = re;
                bestLength = re.GetMatchedLength();
            }
            if (re.HasReadEndOfString()) {
                endOfBuffer = true;
            }
        }
        return bestMatch;
    }

    /**
     * Returns a string representation of this object. The
     * returned string is the string form of the string matcher
     * followed by the string forms of all the regular expression
     * matchers.
     *
     * @return a detailed string representation
     */
    public override string ToString() {
        StringBuilder result = new StringBuilder();

        result.Append(stringMatcher);
        foreach (object matcher in regexpMatchers) {
            result.Append(matcher);
        }
        return result.ToString();
    }
}
/**
 * The common base class for the token matchers used by the
 * tokenizer. A token matcher is run against the tokenizer buffer
 * and afterwards exposes the outcome of that latest match through
 * the methods declared here.
 */
internal abstract class TokenMatcher {

    /**
     * Returns the number of characters in the latest match.
     *
     * @return the length of the latest match, or
     *         zero (0) if no match was found
     */
    public abstract int GetMatchedLength();

    /**
     * Returns the token pattern of the latest match.
     *
     * @return the token pattern that matched latest, or
     *         null if no match was found
     */
    public abstract TokenPattern GetMatchedPattern();

    /**
     * Checks if the latest match attempt reached the end of the
     * string being matched.
     *
     * @return true if the end of the string was reached, or
     *         false otherwise
     */
    public abstract bool HasReadEndOfString();
}
/**
 * A token matcher for a single regular expression token pattern.
 * The regular expression is matched against the tokenizer buffer,
 * and the matcher object from the latest attempt is kept in order
 * to answer questions about that match.
 */
internal class RegExpTokenMatcher : TokenMatcher {

    /**
     * The token pattern that this matcher represents.
     */
    private TokenPattern pattern;

    /**
     * The regular expression created from the token pattern.
     */
    private RegExp regExp;

    /**
     * The matcher from the latest match attempt, or null if no
     * match has been attempted yet.
     */
    private Matcher matcher = null;

    /**
     * Creates a new regular expression token matcher. The regular
     * expression is created from the string returned by the
     * GetPattern() method of the token pattern.
     *
     * @param pattern the token pattern to match
     *
     * @throws RegExpException if the regular expression couldn't
     *             be created properly
     */
    public RegExpTokenMatcher(TokenPattern pattern) {
        this.pattern = pattern;
        this.regExp = new RegExp(pattern.GetPattern());
    }

    /**
     * Returns the token pattern of this matcher, regardless of
     * the outcome of any match.
     *
     * @return the token pattern
     */
    public TokenPattern GetPattern() {
        return pattern;
    }

    /**
     * Returns the start position of the latest match.
     *
     * @return the start position of the latest match, or
     *         zero (0) if no match attempt has been made or the
     *         latest match was empty
     */
    public int Start() {
        if (matcher != null && matcher.Length() > 0) {
            return matcher.Start();
        }
        return 0;
    }

    /**
     * Returns the token pattern of the latest match.
     *
     * @return the token pattern, or
     *         null if no match attempt has been made or the
     *         latest match was empty
     */
    public override TokenPattern GetMatchedPattern() {
        if (matcher != null && matcher.Length() > 0) {
            return pattern;
        }
        return null;
    }

    /**
     * Returns the length of the latest match, as reported by the
     * underlying matcher.
     *
     * @return the length of the latest match, or
     *         zero (0) if no match attempt has been made
     */
    public override int GetMatchedLength() {
        if (matcher == null) {
            return 0;
        }
        return matcher.Length();
    }

    /**
     * Checks if the latest match attempt reached the end of the
     * string, as reported by the underlying matcher.
     *
     * @return true if the end of the string was reached, or
     *         false otherwise (also when no match attempt has
     *         been made)
     */
    public override bool HasReadEndOfString() {
        return matcher != null && matcher.HasReadEndOfString();
    }

    /**
     * Checks if the regular expression matches the specified
     * string from the specified position. A new underlying
     * matcher is obtained for each call, replacing the state of
     * any previous match.
     *
     * @param str the string to match
     * @param pos the starting position
     *
     * @return true if a match was found, or
     *         false otherwise
     */
    public bool MatchFrom(string str, int pos) {
        Matcher fresh = regExp.Matcher(str);

        matcher = fresh;
        return fresh.MatchFrom(pos);
    }

    /**
     * Returns a string representation of this token matcher. It
     * consists of the string forms of the token pattern and the
     * regular expression, each followed by a newline.
     *
     * @return a detailed string representation of this matcher
     */
    public override string ToString() {
        return string.Concat(pattern.ToString(),
                             "\n",
                             regExp.ToString(),
                             "\n");
    }
}
/**
 * A token matcher for the whole set of string token patterns. The
 * strings are stored in an automaton that is walked character by
 * character when matching against the tokenizer buffer. The
 * matcher also keeps the outcome of the latest match.
 */
internal class StringTokenMatcher : TokenMatcher {

    /**
     * The string token patterns added, in insertion order.
     */
    private ArrayList patterns = new ArrayList();

    /**
     * The start state of the automaton used for matching.
     */
    private Automaton start = new Automaton();

    /**
     * The token pattern found by the latest match, or null.
     */
    private TokenPattern match = null;

    /**
     * The flag set when the latest match attempt reached the end
     * of the string.
     */
    private bool endOfString = false;

    /**
     * Creates a new string token matcher without any patterns.
     */
    public StringTokenMatcher() {
    }

    /**
     * Clears the outcome of the latest match, i.e. both the
     * matched pattern and the end of string flag.
     */
    public void Reset() {
        endOfString = false;
        match = null;
    }

    /**
     * Returns the token pattern of the latest match.
     *
     * @return the token pattern that matched latest, or
     *         null if no match was found
     */
    public override TokenPattern GetMatchedPattern() {
        return match;
    }

    /**
     * Returns the length of the latest match, which is the length
     * of the matched pattern string.
     *
     * @return the length of the latest match, or
     *         zero (0) if no match was found
     */
    public override int GetMatchedLength() {
        return (match != null) ? match.GetPattern().Length : 0;
    }

    /**
     * Checks if the latest match attempt reached the end of the
     * string.
     *
     * @return true if the end of the string was reached, or
     *         false otherwise
     */
    public override bool HasReadEndOfString() {
        return endOfString;
    }

    /**
     * Sets the end of string flag. The automaton calls this while
     * matching when it runs out of characters.
     */
    public void SetReadEndOfString() {
        endOfString = true;
    }

    /**
     * Searches the patterns added to this matcher for one with
     * the specified id. The first such pattern in insertion order
     * is returned.
     *
     * @param id the token pattern id
     *
     * @return the token pattern found, or
     *         null if not found
     */
    public TokenPattern GetPattern(int id) {
        foreach (TokenPattern candidate in patterns) {
            if (candidate.GetId() == id) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Adds a string token pattern to this matcher. The pattern is
     * stored in the pattern list and its string is added to the
     * automaton with the pattern itself as the match value.
     *
     * @param pattern the pattern to add
     */
    public void AddPattern(TokenPattern pattern) {
        patterns.Add(pattern);
        start.AddMatch(pattern.GetPattern(), pattern);
    }

    /**
     * Checks if any of the string patterns matches the specified
     * string from the specified position. The previous match
     * outcome is cleared before matching.
     *
     * @param str the string to match
     * @param pos the starting position
     *
     * @return true if a match was found, or
     *         false otherwise
     */
    public bool MatchFrom(string str, int pos) {
        object found;

        Reset();
        found = start.MatchFrom(this, str, pos);
        match = (TokenPattern) found;
        return match != null;
    }

    /**
     * Returns a string representation of this matcher. It consists
     * of the string form of each pattern added, each followed by
     * two newlines.
     *
     * @return a detailed string representation of this matcher
     */
    public override string ToString() {
        StringBuilder text = new StringBuilder();

        foreach (object item in patterns) {
            text.Append(item).Append("\n\n");
        }
        return text.ToString();
    }
}
/**
 * A deterministic finite automaton state for character sequences.
 * Each state holds an optional match value and a set of single
 * character transitions to other states. Character set
 * transitions are not supported.
 */
internal class Automaton {

    /**
     * The match value of this state, or null if no added string
     * ends in this state.
     */
    private object value = null;

    /**
     * The transitions from this state, each one stored with its
     * character in a search tree.
     */
    private AutomatonTree tree = new AutomatonTree();

    /**
     * Creates a new automaton state without value or transitions.
     */
    public Automaton() {
    }

    /**
     * Adds a string match to this automaton. The states for the
     * string are walked one character at a time, creating states
     * and transitions where missing, and the match value is then
     * stored in the state reached by the last character. An empty
     * string stores the value in this state.
     *
     * @param str the string to match
     * @param value the match value
     */
    public void AddMatch(string str, object value) {
        Automaton current = this;
        Automaton next;

        for (int i = 0; i < str.Length; i++) {
            next = current.tree.Find(str[i]);
            if (next == null) {
                next = new Automaton();
                current.tree.Add(str[i], next);
            }
            current = next;
        }
        current.value = value;
    }

    /**
     * Matches the specified string from the specified position,
     * following transitions for as long as possible. The value of
     * the last state passed that has a value is returned, i.e.
     * the value for the longest added string found. If the walk
     * reaches the end of the string, the end of string flag is
     * set in the specified token matcher.
     *
     * @param m the string token matcher
     * @param str the string to match
     * @param pos the starting position
     *
     * @return the match value, or
     *         null if no match is found
     */
    public object MatchFrom(StringTokenMatcher m, string str, int pos) {
        Automaton current = this;
        object longest = null;

        while (current != null) {
            if (current.value != null) {
                longest = current.value;
            }
            if (pos >= str.Length) {
                m.SetReadEndOfString();
                break;
            }
            current = current.tree.Find(str[pos]);
            pos++;
        }
        return longest;
    }
}
/**
 * An automaton state transition tree. This class contains a
 * binary search tree for the automaton transitions from one
 * state to another. All transitions are linked to a single
 * character.
 *
 * A tree node is empty until a transition is added to it, at
 * which point its two (empty) subtrees are created. The presence
 * of the subtrees is therefore what tells a used node from an
 * empty one. The transition character is not used for this, so
 * that a transition for the zero ('\0') character can be stored
 * like any other.
 */
internal class AutomatonTree {

    /**
     * The transition character. Only meaningful when this node is
     * not empty.
     */
    private char value = '\0';

    /**
     * The transition state. Only meaningful when this node is not
     * empty.
     */
    private Automaton state = null;

    /**
     * The left subtree, holding the characters smaller than the
     * transition character. This is null while this node is
     * empty.
     */
    private AutomatonTree left = null;

    /**
     * The right subtree, holding the remaining characters. This
     * is null while this node is empty.
     */
    private AutomatonTree right = null;

    /**
     * Creates a new empty automaton transition tree.
     */
    public AutomatonTree() {
    }

    /**
     * Finds an automaton state from the specified transition
     * character. This method searches this transition tree for
     * a matching transition.
     *
     * @param c the character to search for
     *
     * @return the automaton state found, or
     *         null if no transition exists
     */
    public Automaton Find(char c) {
        AutomatonTree node = this;

        // An empty node (no subtrees) ends the search
        while (node.left != null) {
            if (node.value == c) {
                return node.state;
            }
            node = (node.value > c) ? node.left : node.right;
        }
        return null;
    }

    /**
     * Adds a transition to this tree. The transition is stored in
     * the first empty node found when searching for the
     * character. No check is made for an existing transition with
     * the same character.
     *
     * @param c the character to transition for
     * @param state the state to transition to
     */
    public void Add(char c, Automaton state) {
        AutomatonTree node = this;

        while (node.left != null) {
            node = (node.value > c) ? node.left : node.right;
        }
        node.value = c;
        node.state = state;
        node.left = new AutomatonTree();
        node.right = new AutomatonTree();
    }
}
}