/*
* CharacterSetElement.cs
*
* This work is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* This work is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*
* As a special exception, the copyright holders of this library give
* you permission to link this library with independent modules to
* produce an executable, regardless of the license terms of these
* independent modules, and to copy and distribute the resulting
* executable under terms of your choice, provided that you also meet,
* for each linked independent module, the terms and conditions of the
* license of that module. An independent module is a module which is
* not derived from or based on this library. If you modify this
* library, you may extend this exception to your version of the
* library, but you are not obligated to do so. If you do not wish to
* do so, delete this exception statement from your version.
*
* Copyright (c) 2003 Per Cederberg. All rights reserved.
*/
using System.Collections;
using System.IO;
using System.Text;
namespace PerCederberg.Grammatica.Parser.RE {
/**
* A regular expression character set element. This element
* matches a single character inside (or outside) a character set.
* The character set is user defined and may contain ranges of
* characters. The set may also be inverted, meaning that only
* characters not inside the set will be considered to match.
*
* @author Per Cederberg, <per at percederberg dot net>
* @version 1.0
*/
internal class CharacterSetElement : Element {

    /**
     * The dot ('.') character set. This element matches a single
     * character that is not one of the line terminator characters
     * listed in InDotSet().
     */
    public static readonly CharacterSetElement DOT =
        new CharacterSetElement(false);

    /**
     * The digit character set. This element matches a single
     * ASCII digit ('0' to '9').
     */
    public static readonly CharacterSetElement DIGIT =
        new CharacterSetElement(false);

    /**
     * The non-digit character set. This element matches a single
     * character that is not an ASCII digit.
     */
    public static readonly CharacterSetElement NON_DIGIT =
        new CharacterSetElement(true);

    /**
     * The whitespace character set. This element matches a single
     * whitespace character, as defined by InWhitespaceSet().
     */
    public static readonly CharacterSetElement WHITESPACE =
        new CharacterSetElement(false);

    /**
     * The non-whitespace character set. This element matches a
     * single character that is not a whitespace character.
     */
    public static readonly CharacterSetElement NON_WHITESPACE =
        new CharacterSetElement(true);

    /**
     * The word character set. This element matches a single ASCII
     * letter, ASCII digit or underscore character.
     */
    public static readonly CharacterSetElement WORD =
        new CharacterSetElement(false);

    /**
     * The non-word character set. This element matches a single
     * character that is not a word character.
     */
    public static readonly CharacterSetElement NON_WORD =
        new CharacterSetElement(true);

    /**
     * The inverted character set flag.
     */
    private readonly bool inverted;

    /**
     * The character set content. This list may contain boxed char
     * values, Range objects and nested CharacterSetElement objects.
     */
    private readonly ArrayList contents = new ArrayList();

    /**
     * Creates a new character set element. If the inverted character
     * set flag is set, only characters NOT in the set will match.
     *
     * @param inverted       the inverted character set flag
     */
    public CharacterSetElement(bool inverted) {
        this.inverted = inverted;
    }

    /**
     * Adds a single character to this character set.
     *
     * @param c              the character to add
     */
    public void AddCharacter(char c) {
        contents.Add(c);
    }

    /**
     * Adds every character of a string to this character set.
     *
     * @param str            the string with characters to add
     */
    public void AddCharacters(string str) {
        foreach (char c in str) {
            AddCharacter(c);
        }
    }

    /**
     * Adds every character of a string element to this character
     * set.
     *
     * @param elem           the string element with characters to add
     */
    public void AddCharacters(StringElement elem) {
        AddCharacters(elem.GetString());
    }

    /**
     * Adds a character range to this character set. Both end points
     * are included in the range.
     *
     * @param min            the minimum character value
     * @param max            the maximum character value
     */
    public void AddRange(char min, char max) {
        contents.Add(new Range(min, max));
    }

    /**
     * Adds a character subset to this character set. The subset is
     * stored by reference and consulted (with its own inverted flag)
     * whenever this set is checked. Adding a set to itself would
     * make the membership check recurse without bound.
     *
     * @param elem           the character set to add
     */
    public void AddCharacterSet(CharacterSetElement elem) {
        contents.Add(elem);
    }

    /**
     * Returns this element as the character set shouldn't be
     * modified after creation. This partially breaks the contract
     * of clone(), but as new characters are not added to the
     * character set after creation, this will work correctly.
     *
     * @return this character set element
     */
    public override object Clone() {
        return this;
    }

    /**
     * Returns the length of a matching string starting at the
     * specified position. A character set matches exactly one
     * character, so the result is either one (1) or a failed match.
     * Any skip count other than zero (0) causes a failed match.
     *
     * @param m              the matcher being used
     * @param str            the string to match
     * @param start          the starting position
     * @param skip           the number of matches to skip
     *
     * @return the length of the matching string, or
     *         -1 if no match was found
     */
    public override int Match(Matcher m,
                              string str,
                              int start,
                              int skip) {

        if (skip != 0) {
            return -1;
        }
        if (start >= str.Length) {
            // Tell the matcher that the input ended before a
            // character could be read
            m.SetReadEndOfString();
            return -1;
        }
        return InSet(str[start]) ? 1 : -1;
    }

    /**
     * Checks if the specified character matches this character
     * set. This method takes the inverted flag into account, except
     * for the dot set, which is never inverted.
     *
     * @param c              the character to check
     *
     * @return true if the character matches, or
     *         false otherwise
     */
    private bool InSet(char c) {
        // The predefined sets are singletons that are recognized by
        // reference identity
        if (this == DOT) {
            return InDotSet(c);
        } else if (this == DIGIT || this == NON_DIGIT) {
            return InDigitSet(c) != inverted;
        } else if (this == WHITESPACE || this == NON_WHITESPACE) {
            return InWhitespaceSet(c) != inverted;
        } else if (this == WORD || this == NON_WORD) {
            return InWordSet(c) != inverted;
        } else {
            return InUserSet(c) != inverted;
        }
    }

    /**
     * Checks if the specified character is present in the 'dot'
     * set, i.e. is not one of the characters '\n', '\r', U+0085,
     * U+2028 or U+2029.
     *
     * @param c              the character to check
     *
     * @return true if the character is present, or
     *         false otherwise
     */
    private static bool InDotSet(char c) {
        switch (c) {
        case '\n':
        case '\r':
        case '\u0085':
        case '\u2028':
        case '\u2029':
            return false;
        default:
            return true;
        }
    }

    /**
     * Checks if the specified character is an ASCII digit. This
     * method does not consider the inverted flag.
     *
     * @param c              the character to check
     *
     * @return true if the character is a digit, or
     *         false otherwise
     */
    private static bool InDigitSet(char c) {
        return '0' <= c && c <= '9';
    }

    /**
     * Checks if the specified character is a whitespace character
     * (space, tab, newline, form feed, carriage return or vertical
     * tab). This method does not consider the inverted flag.
     *
     * @param c              the character to check
     *
     * @return true if the character is a whitespace character, or
     *         false otherwise
     */
    private static bool InWhitespaceSet(char c) {
        switch (c) {
        case ' ':
        case '\t':
        case '\n':
        case '\f':
        case '\r':
        case '\v':
            // '\v' is the vertical tab, U+000B (decimal 11)
            return true;
        default:
            return false;
        }
    }

    /**
     * Checks if the specified character is a word character, i.e.
     * an ASCII letter, an ASCII digit or an underscore. This method
     * does not consider the inverted flag.
     *
     * @param c              the character to check
     *
     * @return true if the character is a word character, or
     *         false otherwise
     */
    private static bool InWordSet(char c) {
        return ('a' <= c && c <= 'z')
            || ('A' <= c && c <= 'Z')
            || ('0' <= c && c <= '9')
            || c == '_';
    }

    /**
     * Checks if the specified character is present in the user-
     * defined set. This method does not consider the inverted flag
     * of this set, but nested character sets are checked with their
     * own inverted flag applied.
     *
     * @param value          the character to check
     *
     * @return true if the character is present, or
     *         false otherwise
     */
    private bool InUserSet(char value) {
        foreach (object item in contents) {
            if (item is char) {
                if ((char) item == value) {
                    return true;
                }
            } else if (item is Range) {
                if (((Range) item).Inside(value)) {
                    return true;
                }
            } else if (item is CharacterSetElement) {
                if (((CharacterSetElement) item).InSet(value)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Prints this element to the specified output stream.
     *
     * @param output         the output stream to use
     * @param indent         the current indentation
     */
    public override void PrintTo(TextWriter output, string indent) {
        output.WriteLine(indent + ToString());
    }

    /**
     * Returns a string description of this character set. The
     * predefined sets are printed as their regular expression
     * shorthand, and user-defined sets as "[...]" or "[^...]"
     * (when inverted).
     *
     * @return a string description of this character set
     */
    public override string ToString() {
        StringBuilder buffer;

        // Handle predefined character sets
        if (this == DOT) {
            return ".";
        } else if (this == DIGIT) {
            return "\\d";
        } else if (this == NON_DIGIT) {
            return "\\D";
        } else if (this == WHITESPACE) {
            return "\\s";
        } else if (this == NON_WHITESPACE) {
            return "\\S";
        } else if (this == WORD) {
            return "\\w";
        } else if (this == NON_WORD) {
            return "\\W";
        }

        // Handle user-defined character sets. The negation marker
        // belongs inside the bracket in regular expression syntax.
        buffer = new StringBuilder(inverted ? "[^" : "[");
        foreach (object item in contents) {
            buffer.Append(item);
        }
        buffer.Append("]");
        return buffer.ToString();
    }

    /**
     * A character range class. The range includes both of its end
     * points.
     */
    private class Range {

        /**
         * The minimum character value.
         */
        private readonly char min;

        /**
         * The maximum character value.
         */
        private readonly char max;

        /**
         * Creates a new character range.
         *
         * @param min        the minimum character value
         * @param max        the maximum character value
         */
        public Range(char min, char max) {
            this.min = min;
            this.max = max;
        }

        /**
         * Checks if the specified character is inside the range.
         *
         * @param c          the character to check
         *
         * @return true if the character is in the range, or
         *         false otherwise
         */
        public bool Inside(char c) {
            return min <= c && c <= max;
        }

        /**
         * Returns a string representation of this object, on the
         * form "min-max".
         *
         * @return a string representation of this object
         */
        public override string ToString() {
            return min + "-" + max;
        }
    }
}
}