/*
SDX: Documentary System in XML.
Copyright (C) 2000, 2001, 2002  Ministere de la culture et de la communication (France), AJLSM

Ministere de la culture et de la communication,
Mission de la recherche et de la technologie
3 rue de Valois, 75042 Paris Cedex 01 (France)
mrt@culture.fr, michel.bottin@culture.fr

AJLSM, 17, rue Vital Carles, 33000 Bordeaux (France)
sevigny@ajlsm.com

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the
Free Software Foundation, Inc.
59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
or connect to:
http://www.fsf.org/copyleft/gpl.html
*/
/*
 * Created by IntelliJ IDEA.
 * User: rpandey
 * Date: 5 nov. 2002
 * Time: 11:39:14
 * To change template for new class use
 * Code Style | Class Templates options (Tools | IDE Options).
 */
package fr.gouv.culture.sdx.search.lucene.analysis.filter;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

import java.util.Hashtable;

/**
 * Title: ChineseFilter
 * Description: Filter with a stop word table
 *              Rule: No digital is allowed.
 *                    English word/token should larger than 1 character.
 *                    One Chinese character as one Chinese word.
 * TO DO:
 *   1. Add Chinese stop words, such as \ue400
 *   2. Dictionary based Chinese word extraction
 *   3. Intelligent Chinese word extraction
 *
 * Copyright:    Copyright (c) 2001
 * Company:
 * @author Yiyi Sun
 * @version 1.0
 *
 */

public final class ChineseFilter extends TokenFilter {


    // Only English now, Chinese to be added later.
    public static final String[] STOP_WORDS = {
        "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };


    private Hashtable stopTable;

    public ChineseFilter(TokenStream in) {
        input = in;

        stopTable = new Hashtable(STOP_WORDS.length);
        for (int i = 0; i < STOP_WORDS.length; i++)
            stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
    }

    public ChineseFilter(TokenStream in, Hashtable stopWords) {
        input = in;
        stopTable = stopWords;
        if (stopTable == null)
            stopTable = new Hashtable();
    }

    public final Token next() throws java.io.IOException {

        for (Token token = input.next(); token != null; token = input.next()) {
            String text = token.termText();

            if (stopTable.get(text) == null) {
                switch (Character.getType(text.charAt(0))) {

                    case Character.LOWERCASE_LETTER:
                    case Character.UPPERCASE_LETTER:

                        // English word/token should larger than 1 character.
                        if (text.length() > 1) {
                            return token;
                        }
                        break;
                    case Character.OTHER_LETTER:

                        // One Chinese character as one Chinese word.
                        // Chinese word extraction to be added later here.

                        return token;
                }

            }

        }
        return null;
    }

}
