Regex parser which follows the following rule
.
(dot) means a any single character match*
(star) means 0 or more character match?
(question) means previous char is an option i.ecolou?r
matchescolor
andcolour
.
A large feedback has been taken from here.
I'm looking for code review, optimizations, best practices. I also still get confused with complexity. Let's say regex string has length of m
and string has length of n
. Then it appears the complexity would be O(n! * (n-1)! * ... (n-m)!)
public final class Regex {
private Regex() {};
private static enum Wildcards {
STAR('*'), DOT('.'), QUESTION('?');
private final char wildCard;
Wildcards (char wildCard) {
this.wildCard = wildCard;
}
public char getWildCard() {
return this.wildCard;
}
}
/**
* The index of first non-star character following first occurence of star
* eg: if regex is a**b, then index is startIndex is 1 and return value is 3 (index of b)
*
* @param regex : the regex string.
* @param firstStarIndex : index of the first occurene of * after a non *, eg: a**b, then starIndex would be 1
* @return : the index of first non-star character following first occurence of star
*/
private static final int getNonWildCardIndex(String regex, int firstQuestionIndex, Wildcards foo) {
for (int k = firstQuestionIndex; k < regex.length(); k++) {
if (regex.charAt(k) != foo.getWildCard()) {
return k;
}
}
return regex.length();
}
/**
* A valid wildcard tail is a tail of ?' s and *'s.
* Its a tail which matches 0 characters
*
* @param regex The input regex
* @param index The index from which the wild card check must be initiated.
* @return true is the resulting wildcard matches empty string.
*/
private static final boolean validWildCardTail(String regex, int index) {
assert regex != null;
for (int i = index; i < regex.length(); i++) {
if (regex.charAt(i) != Wildcards.QUESTION.getWildCard() &&
regex.charAt(i) != Wildcards.STAR.getWildCard()) {
return false;
}
}
return true;
}
/**
* Checks if we have scanned the entire string for regex match.
* If not, it means some characters from strPos in the input string do not match regex.
*
* @param str The input string.
* @param strPos The current string index
* @return true if entire string has been scanned for regex match.
*/
private static final boolean endOfStringCheck(String str, int strPos) {
return str.length() == strPos;
}
/**
* Checks if the regex matches matches the empty string. At this point input string has been traversed.
* Only the regex needs to be travelled to.
*
* @param regex The input regex
* @param regexPos The current regex index.
* @return true if regex matches the input string
*/
private static final boolean endOfRegexCheck (String regex, int regexPos) {
if (regexPos == regex.length()) return true;
// example: regex: "colou?", string: "colou"
if (regex.charAt(regexPos) == Wildcards.QUESTION.getWildCard() ||
// example: regex: "colou?", string: "colo"
(regexPos < (regex.length() - 1)) && regex.charAt(regexPos + 1) == Wildcards.QUESTION.getWildCard() ||
// example: regex: "colo*", string: "colour"
regex.charAt(regexPos) == Wildcards.STAR.getWildCard()) {
// regex : colou?**.***, string : colou
// check the `regexPos + 1 argument`, its very sensible. regixPos is already checked. thus we start check from the next position.
return validWildCardTail(regex, regexPos + 1);
} else {
return false;
}
}
private static final boolean matcher(String regex, int regexPos, String str, int strPos) {
assert regexPos >= 0 && regexPos <= regex.length();
assert strPos >= 0 && strPos <= str.length(); // it should break when its == strPos, > strpos is an impossible condition.
while ((regexPos < regex.length() && strPos < str.length())) {
if (regex.charAt(regexPos) == str.charAt(strPos) || regex.charAt(regexPos) == Wildcards.DOT.getWildCard()) {
regexPos++;
strPos++;
}
/*
* nested if else is better than non-nested.
* http://www.macs.hw.ac.uk/~pjbk/pathways/cpp1/node99.html
*/
else if (regex.charAt(regexPos) == Wildcards.STAR.getWildCard()) {
int nextNonStarIndex = getNonWildCardIndex(regex, regexPos, Wildcards.STAR);
/*
* For a case where regex and string, both have not yet reached the last char but regex is all *'s till end.
* Eg: Regext abc*** && String is abcxyz
*/
if (nextNonStarIndex == regex.length()) return true;
while (strPos < str.length()) {
if (matcher(regex, nextNonStarIndex, str, strPos)) return true;
strPos = strPos + 1;
}
return false;
} else {
// regex : colou?r
// string : colour
if (regex.charAt(regexPos) == Wildcards.QUESTION.getWildCard()) {
return matcher(regex, getNonWildCardIndex(regex, regexPos, Wildcards.QUESTION), str, strPos);
}
// regex: colou?r
// regex: color
if (regexPos < (regex.length() - 1) && regex.charAt(regexPos + 1) == Wildcards.QUESTION.getWildCard()) {
return matcher(regex, getNonWildCardIndex(regex, regexPos + 1, Wildcards.QUESTION), str, strPos);
}
return false;
}
}
return endOfStringCheck(str, strPos) && endOfRegexCheck(regex, regexPos);
}
/**
* Matches the regex with the string with the following rules.
* * means 0 or more number of any char matches
* . means just a single any char match.
* ? means the immediate preceding character can exist or not. eg: colour? matches color and colour
*
* @param regex : The regex to match the string again
* @param str : The string to be matched.
* @return : true / false
*/
public static final boolean matches(String regex, String str) {
return matcher(regex, 0, str, 0);
}
public static void main(String[] args) {
Assert.assertTrue(Regex.matches("colou?**rs", "colours"));
Assert.assertTrue(Regex.matches("colou?**rs", "colors"));
Assert.assertTrue(Regex.matches("colou**?rs", "colours"));
// Assert.assertTrue(RegexToy.matches("colou**?rs", "colors")); <---- exlusive case.
Assert.assertTrue(Regex.matches("colou?**r", "colour"));
Assert.assertTrue(Regex.matches("colou?**r", "color"));
Assert.assertTrue(Regex.matches("colou?*", "colou"));
Assert.assertTrue(Regex.matches("colou?*", "colo"));
Assert.assertTrue(Regex.matches("colou?r", "colour"));
Assert.assertTrue(Regex.matches("colou?r", "color"));
Assert.assertTrue(Regex.matches("colou?*?r", "colour"));
Assert.assertTrue(Regex.matches("colou?*?r", "color"));
// Success cases
Assert.assertTrue(Regex.matches("", ""));
Assert.assertTrue((Regex.matches("***", "")));
String[] regexList = { "abc****", "abc", "a*b*c", "****abc", "**a**b**c****", "abc*" };
String str1 = "abc";
for (String regex : regexList) {
Assert.assertTrue(Regex.matches(regex, str1));
}
String regex = "abc****";
String[] strList1 = { "abcxyz", "abcx", "abc" };
for (String str : strList1) {
Assert.assertTrue(Regex.matches(regex, str));
}
regex = "***abc";
String[] strList2 = { "xyzabc", "xabc", "abc" };
for (String str : strList2) {
Assert.assertTrue(Regex.matches(regex, str));
}
Assert.assertTrue(Regex.matches("a.c", "abc"));
Assert.assertTrue(Regex.matches("a*.*c", "abc"));
Assert.assertTrue(Regex.matches("a*.b.*c", "axxbxxbxxc"));
// Fail cases.
Assert.assertFalse(Regex.matches("abc", "abcd"));
Assert.assertFalse(Regex.matches("*a", "abcd"));
Assert.assertFalse(Regex.matches("a", ""));
Assert.assertFalse(Regex.matches(".a*c", "abc"));
Assert.assertFalse(Regex.matches("a.*b", "abc"));
Assert.assertFalse(Regex.matches("..", "abc"));
Assert.assertFalse(Regex.matches("", "abc"));
// exceptions
try {
Regex.matches(null, "abc");
Assert.fail();
} catch (NullPointerException e) {
}
try {
Regex.matches("abc", null);
Assert.fail();
} catch (NullPointerException e) {
}
}
}