/*
 * Tokenizer.java
 *
 * This work is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * This work is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software 
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 * As a special exception, the copyright holders of this library give
 * you permission to link this library with independent modules to
 * produce an executable, regardless of the license terms of these
 * independent modules, and to copy and distribute the resulting
 * executable under terms of your choice, provided that you also meet,
 * for each linked independent module, the terms and conditions of the
 * license of that module. An independent module is a module which is
 * not derived from or based on this library. If you modify this
 * library, you may extend this exception to your version of the
 * library, but you are not obligated to do so. If you do not wish to
 * do so, delete this exception statement from your version.
 *
 * Copyright (c) 2003 Per Cederberg. All rights reserved.
 */

package net.percederberg.grammatica.parser;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;

import net.percederberg.grammatica.parser.re.CharBuffer;
import net.percederberg.grammatica.parser.re.RegExp;
import net.percederberg.grammatica.parser.re.Matcher;
import net.percederberg.grammatica.parser.re.RegExpException;

/**
 * A character stream tokenizer. This class groups the characters read 
 * from the stream together into tokens ("words"). The grouping is
 * controlled by token patterns that contain either a fixed string to
 * search for, or a regular expression. If the stream of characters
 * doesn't match any of the token patterns, a parse exception is thrown.
 *
 * @author   Per Cederberg, <per at percederberg dot net>
 * @version  1.0
 */
public class Tokenizer {

    /**
     * The list of all token pattern matchers, in the order their
     * patterns were added.
     */
    private final ArrayList matchers = new ArrayList();

    /**
     * The input stream to read from. When this is set to null, no
     * further input is available.
     */
    private Reader input = null;

    /**
     * The buffer with previously read characters. Characters are
     * appended in blocks to this buffer, and characters before the
     * current position are removed from time to time when more
     * input is read.
     */
    private final CharBuffer buffer = new CharBuffer();

    /**
     * The current position in the character buffer. This is the
     * index where the next token match will start.
     */
    private int position = 0;

    /**
     * The line number of the character at the current position.
     * This value is updated from each token found.
     */
    private int line = 1;

    /**
     * The column number of the character at the current position.
     * This value is updated from each token found.
     */
    private int column = 1;

    /**
     * The end of buffer read flag. This flag is set if the end of
     * the buffer was encountered while matching token patterns.
     */
    private boolean endOfBuffer = false;

    /**
     * Creates a new tokenizer for the specified input stream.
     *
     * @param input          the input stream to read
     */
    public Tokenizer(Reader input) {
        this.input = input;
    }

    /**
     * Returns a description of the token pattern with the specified
     * id.
     *
     * @param id             the token pattern id
     *
     * @return the token pattern description, or
     *         null if not present
     */
    public String getPatternDescription(int id) {
        TokenMatcher  m;

        for (int i = 0; i < matchers.size(); i++) {
            m = (TokenMatcher) matchers.get(i);
            if (m.getPattern().getId() == id) {
                return m.getPattern().toShortString();
            }
        }
        return null;
    }

    /**
     * Returns the current line number. This number will be the line
     * number of the next token returned.
     *
     * @return the current line number
     */
    public int getCurrentLine() {
        return line;
    }

    /**
     * Returns the current column number. This number will be the
     * column number of the next token returned.
     *
     * @return the current column number
     */
    public int getCurrentColumn() {
        return column;
    }

    /**
     * Adds a new token pattern to the tokenizer. The pattern will be
     * added last in the list. If two patterns match strings of the
     * same length, the pattern that was added first is chosen.
     *
     * @param pattern        the pattern to add
     *
     * @throws ParserCreationException if the pattern couldn't be
     *             added to the tokenizer
     */
    public void addPattern(TokenPattern pattern)
        throws ParserCreationException {

        switch (pattern.getType()) {
        case TokenPattern.STRING_TYPE:
            matchers.add(new StringTokenMatcher(pattern));
            break;
        case TokenPattern.REGEXP_TYPE:
            try {
                matchers.add(new RegExpTokenMatcher(pattern));
            } catch (RegExpException e) {
                throw new ParserCreationException(
                    ParserCreationException.INVALID_TOKEN_ERROR,
                    pattern.getName(),
                    "regular expression contains error(s): " +
                    e.getMessage());
            }
            break;
        default:
            throw new ParserCreationException(
                ParserCreationException.INVALID_TOKEN_ERROR,
                pattern.getName(),
                "pattern type " + pattern.getType() + " is undefined");
        }
    }

    /**
     * Finds the next token on the stream. This method will return
     * null when end of file has been reached. It will throw a parse
     * exception if no token matched the input stream, or if a token
     * pattern with the error flag set matched. Any tokens matching a
     * token pattern with the ignore flag set will be silently ignored
     * and the next token will be returned.
     *
     * @return the next token found, or
     *         null if end of file was encountered
     *
     * @throws ParseException if the input stream couldn't be read or
     *             parsed correctly
     */
    public Token next() throws ParseException {
        Token  token;

        while ((token = nextToken()) != null) {
            if (token.getPattern().isError()) {
                throw new ParseException(
                    ParseException.INVALID_TOKEN_ERROR,
                    token.getPattern().getErrorMessage(),
                    token.getStartLine(),
                    token.getStartColumn());
            }
            if (!token.getPattern().isIgnore()) {
                return token;
            }
            // Ignored token, continue with the next one
        }
        return null;
    }

    /**
     * Finds the next token on the stream, without checking the error
     * or ignore flags of the matching pattern. This method will
     * return null when end of file has been reached. It will throw a
     * parse exception if no token matched the input stream.
     *
     * @return the next token found, or
     *         null if end of file was encountered
     *
     * @throws ParseException if the input stream couldn't be read or
     *             parsed correctly
     */
    private Token nextToken() throws ParseException {
        TokenMatcher  m;
        Token         token;
        String        str;

        // Find longest matching string. If some matcher reached the
        // end of the buffer, more input is read and the matching is
        // redone, as a longer match may then be possible.
        do {
            if (endOfBuffer) {
                readInput();
                endOfBuffer = false;
            }
            m = findMatch();
        } while (endOfBuffer && input != null);

        // Return token results
        if (m != null) {
            str = buffer.substring(position, position + m.length());
            token = new Token(m.getPattern(), str, line, column);
            position += m.length();
            line = token.getEndLine();
            column = token.getEndColumn() + 1;
            return token;
        } else if (position >= buffer.length()) {
            // No match and nothing left in the buffer: end of file
            return null;
        } else {
            throw new ParseException(
                ParseException.UNEXPECTED_CHAR_ERROR,
                String.valueOf(buffer.charAt(position)),
                line,
                column);
        }
    }

    /**
     * Reads characters from the input stream and appends them to the
     * input buffer. This method is safe to call even though the end
     * of file has been reached. As a side effect, this method may
     * also remove the characters before the current position from
     * the buffer, resetting the position to zero. When the end of
     * the stream is reached, or a read error occurs, the input
     * stream is closed and no further input will be available.
     *
     * @throws ParseException if an error was encountered while
     *             reading the input stream
     */
    private void readInput() throws ParseException {
        char  chars[] = new char[4096];
        int   length;

        // Check for end of file
        if (input == null) {
            return;
        }

        // Remove old characters from buffer
        if (position > 1024) {
            buffer.delete(0, position);
            position = 0;
        }

        // Read characters
        try {
            length = input.read(chars);
        } catch (IOException e) {
            closeInput();
            throw new ParseException(ParseException.IO_ERROR,
                                     e.getMessage(),
                                     -1,
                                     -1);
        }

        // Append characters to buffer
        if (length > 0) {
            buffer.append(chars, 0, length);
        }

        // Only a negative result signals the end of the stream. A
        // reader may return fewer characters than requested while
        // more input is still to come, so a short read must not be
        // treated as end of file.
        if (length < 0) {
            closeInput();
        }
    }

    /**
     * Closes the input stream and marks the input as exhausted.
     * Errors while closing are ignored, as there is nothing more to
     * read from the stream anyway. This method is safe to call even
     * if the input has already been closed.
     */
    private void closeInput() {
        Reader  reader = input;

        input = null;
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ignored) {
                // Do nothing, closing is best effort
            }
        }
    }

    /**
     * Finds the longest token match from the current buffer position.
     * This method will return the token matcher for the best match,
     * or null if no match was found. If several matchers have matches
     * of the same length, the first one in the list is chosen. As a
     * side effect, this method will also set the end of buffer flag
     * if any matcher reached the end of the buffer.
     *
     * @return the token matcher with the longest match, or
     *         null if no match was found
     */
    private TokenMatcher findMatch() {
        TokenMatcher  bestMatch = null;
        TokenMatcher  m;

        // All matchers are always run, as each one may report that
        // it reached the end of the buffer
        for (int i = 0; i < matchers.size(); i++) {
            m = (TokenMatcher) matchers.get(i);
            if (m.matchFrom(position)
             && (bestMatch == null || m.length() > bestMatch.length())) {

                bestMatch = m;
            }
            if (m.hasReadEndOfString()) {
                endOfBuffer = true;
            }
        }
        return bestMatch;
    }


    /**
     * A token pattern matcher. This class is the base class for the
     * two types of token matchers that exist. The token matcher
     * checks for matches with the tokenizer buffer, and maintains the
     * state of the last match.
     */
    private abstract class TokenMatcher {

        /**
         * The token pattern to match with.
         */
        private TokenPattern pattern;

        /**
         * Creates a new token pattern matcher.
         *
         * @param pattern        the pattern to match with
         */
        public TokenMatcher(TokenPattern pattern) {
            this.pattern = pattern;
        }

        /**
         * Returns the token pattern.
         *
         * @return the token pattern
         */
        public TokenPattern getPattern() {
            return pattern;
        }

        /**
         * Returns the start position of the latest match.
         *
         * @return the start position of the last match, or
         *         zero (0) if none found
         */
        public abstract int start();

        /**
         * Returns the length of the latest match.
         *
         * @return the length of the last match, or
         *         zero (0) if none found
         */
        public abstract int length();

        /**
         * Checks if the end of string was encountered during the last
         * match.
         *
         * @return true if the end of string was reached, or
         *         false otherwise
         */
        public abstract boolean hasReadEndOfString();

        /**
         * Checks if the token pattern matches the tokenizer buffer
         * from the specified position. This method will also reset
         * all flags in this matcher.
         *
         * @param pos            the starting position
         *
         * @return true if a match was found, or
         *         false otherwise
         */
        public abstract boolean matchFrom(int pos);
    }


    /**
     * A string token pattern matcher. This class is used to check
     * for matches with the tokenizer buffer, and maintain the state
     * of the last match.
     */
    private class StringTokenMatcher extends TokenMatcher {

        /**
         * The string image to search for.
         */
        private String image;

        /**
         * The start position of the last match. This will be a
         * negative value if no match was found.
         */
        private int start;

        /**
         * The end of string read flag.
         */
        private boolean endOfString;

        /**
         * Creates a new string token matcher.
         *
         * @param pattern        the token pattern to use
         */
        public StringTokenMatcher(TokenPattern pattern) {
            super(pattern);
            image = pattern.getPattern();
            reset();
        }

        /**
         * Resets the matcher state. This will clear the results of
         * the last match.
         */
        public void reset() {
            start = -1;
            endOfString = false;
        }

        /**
         * Returns the start position of the latest match.
         *
         * @return the start position of the last match, or
         *         zero (0) if none found
         */
        public int start() {
            return (start < 0) ? 0 : start;
        }

        /**
         * Returns the length of the latest match.
         *
         * @return the length of the last match, or
         *         zero (0) if none found
         */
        public int length() {
            return (start < 0) ? 0 : image.length();
        }

        /**
         * Checks if the end of string was encountered during the last
         * match.
         *
         * @return true if the end of string was reached, or
         *         false otherwise
         */
        public boolean hasReadEndOfString() {
            return endOfString;
        }

        /**
         * Checks if the token pattern matches the tokenizer buffer
         * from the specified position. This method will also reset
         * all flags in this matcher. If the buffer holds fewer
         * characters than the string image from the position, no
         * comparison is made and the end of string flag is set.
         *
         * @param pos            the starting position
         *
         * @return true if a match was found, or
         *         false otherwise
         */
        public boolean matchFrom(int pos) {
            reset();
            if (pos + image.length() > buffer.length()) {
                endOfString = true;
                return false;
            }
            for (int i = 0; i < image.length(); i++) {
                if (image.charAt(i) != buffer.charAt(pos + i)) {
                    return false;
                }
            }
            start = pos;
            return true;
        }
    }


    /**
     * A regular expression token pattern matcher. This class is used
     * to check for matches with the tokenizer buffer, and maintain
     * the state of the last match.
     */
    private class RegExpTokenMatcher extends TokenMatcher {

        /**
         * The regular expression matcher to use. It is created for
         * the tokenizer buffer.
         */
        private Matcher matcher;

        /**
         * Creates a new regular expression token matcher.
         *
         * @param pattern        the pattern to match
         *
         * @throws RegExpException if the regular expression couldn't
         *             be created properly
         */
        public RegExpTokenMatcher(TokenPattern pattern)
            throws RegExpException {

            super(pattern);
            RegExp regExp = new RegExp(pattern.getPattern());
            matcher = regExp.matcher(buffer);
        }

        /**
         * Returns the start position of the latest match.
         *
         * @return the start position of the last match, or
         *         zero (0) if none found
         */
        public int start() {
            if (matcher.length() <= 0) {
                return 0;
            } else {
                return matcher.start();
            }
        }

        /**
         * Returns the length of the latest match.
         *
         * @return the length of the last match, or
         *         zero (0) if none found
         */
        public int length() {
            return matcher.length();
        }

        /**
         * Checks if the end of string was encountered during the last
         * match.
         *
         * @return true if the end of string was reached, or
         *         false otherwise
         */
        public boolean hasReadEndOfString() {
            return matcher.hasReadEndOfString();
        }

        /**
         * Checks if the token pattern matches the tokenizer buffer
         * from the specified position. This method will also reset
         * all flags in this matcher.
         *
         * @param pos            the starting position
         *
         * @return true if a match was found, or
         *         false otherwise
         */
        public boolean matchFrom(int pos) {
            return matcher.matchFrom(pos);
        }
    }
}
