package org.apache.oro.text.regex; /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2000 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" * must not be used to endorse or promote products derived from this * software without prior written permission. For written * permission, please contact apache@apache.org. * * 5. Products derived from this software may not be called "Apache" * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their * name, without prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * . * * Portions of this software are based upon software originally written * by Daniel F. Savarese. We appreciate his contributions. */ import java.io.IOException; import java.util.*; /** * The Perl5Matcher class is used to match regular expressions * (conforming to the Perl5 regular expression syntax) generated by * Perl5Compiler. @author Daniel F. 
Savarese @version $Id: Perl5Matcher.java,v 1.1 2004/01/10 00:58:23 mikedemmer Exp $ * @see PatternMatcher * @see Perl5Compiler */ public final class Perl5Matcher implements PatternMatcher { private static final char __EOS = Character.MAX_VALUE; private static final int __INITIAL_NUM_OFFSETS = 20; private boolean __multiline = false, __lastSuccess = false; private char __previousChar, __input[], __originalInput[]; private Perl5Repetition __currentRep; private int __numParentheses, __bol, __eol, __currentOffset, __endOffset; private char[] __program; private int __expSize, __inputOffset, __lastParen; private int[] __beginMatchOffsets, __endMatchOffsets; private Stack __stack = new Stack(); private Perl5MatchResult __lastMatchResult = null; private static boolean __compare(char[] s1, int s1Offs, char[] s2, int s2Offs, int n) { int cnt; for(cnt = 0; cnt < n; cnt++, s1Offs++, s2Offs++) { if(s1Offs >= s1.length) return false; if(s2Offs >= s2.length) return false; if(s1[s1Offs] != s2[s2Offs]) return false; } return true; } private static int __findFirst(char[] input, int current, int endOffset, char[] mustString) { int count, saveCurrent; char ch; if(input.length == 0) return endOffset; ch = mustString[0]; // Find the offset of the first character of the must string while(current < endOffset) { if(ch == input[current]){ saveCurrent = current; count = 0; while(current < endOffset && count < mustString.length) { if(mustString[count] != input[current]) break; ++count; ++current; } current = saveCurrent; if(count >= mustString.length) break; } ++current; } return current; } private void __pushState(int parenFloor) { int[] state; int stateEntries, paren; stateEntries = 3*(__expSize - parenFloor); if(stateEntries <= 0) state = new int[3]; else state = new int[stateEntries + 3]; state[0] = __expSize; state[1] = __lastParen; state[2] = __inputOffset; for(paren = __expSize; paren > parenFloor; paren-=3, stateEntries-=3) { state[stateEntries] = __endMatchOffsets[paren]; 
state[stateEntries + 1] = __beginMatchOffsets[paren]; state[stateEntries + 2] = paren; } __stack.push(state); } private void __popState() { int[] state; int entry, paren; state = (int[])__stack.pop(); __expSize = state[0]; __lastParen = state[1]; __inputOffset = state[2]; for(entry = 3; entry < state.length; entry+=3) { paren = state[entry + 2]; __beginMatchOffsets[paren] = state[entry + 1]; if(paren <= __lastParen) __endMatchOffsets[paren] = state[entry]; } for(paren = __lastParen + 1; paren <= __numParentheses; paren++) { if(paren > __expSize) __beginMatchOffsets[paren] = OpCode._NULL_OFFSET; __endMatchOffsets[paren] = OpCode._NULL_OFFSET; } } // Initialize globals needed before calling __tryExpression for first time private void __initInterpreterGlobals(Perl5Pattern expression, char[] input, int beginOffset, int endOffset) { __input = input; __endOffset = endOffset; __currentRep = new Perl5Repetition(); __currentRep._numInstances = 0; __currentRep._lastRepetition = null; __program = expression._program; __stack.setSize(0); if(beginOffset == 0) __previousChar = '\n'; else { __previousChar = input[beginOffset - 1]; if(!__multiline && __previousChar == '\n') __previousChar = '\0'; } __numParentheses = expression._numParentheses; __currentOffset = beginOffset; __bol = beginOffset; __eol = endOffset; // Ok, here we're using endOffset as a temporary variable. endOffset = __numParentheses + 1; if(__beginMatchOffsets == null || endOffset > __beginMatchOffsets.length) { if(endOffset < __INITIAL_NUM_OFFSETS) endOffset = __INITIAL_NUM_OFFSETS; __beginMatchOffsets = new int[endOffset]; __endMatchOffsets = new int[endOffset]; } } // Set the match result information. Only call this if we successfully // matched. 
private void __setLastMatchResult() { int offs; //endOffset+=dontTry; __lastMatchResult = new Perl5MatchResult(__numParentheses + 1); // This can happen when using Perl5StreamInput if(__endMatchOffsets[0] > __originalInput.length) throw new ArrayIndexOutOfBoundsException(); __lastMatchResult._match = new String(__originalInput, __beginMatchOffsets[0], __endMatchOffsets[0] - __beginMatchOffsets[0]); __lastMatchResult._matchBeginOffset = __beginMatchOffsets[0]; while(__numParentheses >= 0) { offs = __beginMatchOffsets[__numParentheses]; if(offs >= 0) __lastMatchResult._beginGroupOffset[__numParentheses] = offs - __lastMatchResult._matchBeginOffset; else __lastMatchResult._beginGroupOffset[__numParentheses] = OpCode._NULL_OFFSET; offs = __endMatchOffsets[__numParentheses]; if(offs >= 0) __lastMatchResult._endGroupOffset[__numParentheses] = offs - __lastMatchResult._matchBeginOffset; else __lastMatchResult._endGroupOffset[__numParentheses] = OpCode._NULL_OFFSET; --__numParentheses; } // Free up for garbage collection __originalInput = null; } // Expects to receive a valid regular expression program. No checking // is done to ensure validity. // __originalInput must be set before calling this method for // __lastMatchResult to be set correctly. 
private boolean __interpret(Perl5Pattern expression, char[] input, int beginOffset, int endOffset) { boolean success; int minLength = 0, dontTry = 0, offset; char ch, mustString[]; __initInterpreterGlobals(expression, input, beginOffset, endOffset); success = false; mustString = expression._mustString; _mainLoop: while(true) { if(mustString != null && ((expression._anchor & Perl5Pattern._OPT_ANCH) == 0 || (__multiline && expression._back >= 0))) { __currentOffset = __findFirst(__input, __currentOffset, endOffset, mustString); if(__currentOffset >= endOffset) { if((expression._options & Perl5Compiler.READ_ONLY_MASK) == 0) expression._mustUtility++; success = false; break _mainLoop; } else if(expression._back >= 0) { __currentOffset-=expression._back; if(__currentOffset < beginOffset) __currentOffset = beginOffset; minLength = expression._back + mustString.length; } else if(!expression._isExpensive && (expression._options & Perl5Compiler.READ_ONLY_MASK) == 0 && (--expression._mustUtility < 0)) { // Be careful! The preceding logical expression is constructed // so that mustUtility is only decremented if the expression is // compiled without READ_ONLY_MASK. 
mustString = expression._mustString = null; __currentOffset = beginOffset; } else { __currentOffset = beginOffset; minLength = mustString.length; } } if((expression._anchor & Perl5Pattern._OPT_ANCH) != 0) { if(__tryExpression(expression, beginOffset)) { success = true; break _mainLoop; } else if(__multiline || (expression._anchor & Perl5Pattern._OPT_IMPLICIT) != 0) { if(minLength > 0) dontTry = minLength - 1; endOffset-=dontTry; if(__currentOffset > beginOffset) --__currentOffset; while(__currentOffset < endOffset) { if(__input[__currentOffset++] == '\n') { if(__currentOffset < endOffset && __tryExpression(expression, __currentOffset)) { success = true; break _mainLoop; } } } } break _mainLoop; } if(expression._startString != null) { mustString = expression._startString; if((expression._anchor & Perl5Pattern._OPT_SKIP) != 0) { ch = mustString[0]; while(__currentOffset < endOffset) { if(ch == __input[__currentOffset]) { if(__tryExpression(expression, __currentOffset)){ success = true; break _mainLoop; } ++__currentOffset; while(__currentOffset < endOffset && __input[__currentOffset] == ch) ++__currentOffset; } ++__currentOffset; } } else { while((__currentOffset = __findFirst(__input, __currentOffset, endOffset, mustString)) < endOffset){ if(__tryExpression(expression, __currentOffset)) { success = true; break _mainLoop; } ++__currentOffset; } } break _mainLoop; } if((offset = expression._startClassOffset) != OpCode._NULL_OFFSET) { boolean doEvery, tmp; doEvery = ((expression._anchor & Perl5Pattern._OPT_SKIP) == 0); if(minLength > 0) dontTry = minLength - 1; endOffset -= dontTry; tmp = true; switch(__program[offset]) { case OpCode._ANYOF: offset = OpCode._getOperand(offset); while(__currentOffset < endOffset) { ch = __input[__currentOffset]; if(ch < 256 && (__program[offset + (ch >> 4)] & (1 << (ch & 0xf))) == 0) { if(tmp && __tryExpression(expression, __currentOffset)) { success = true; break _mainLoop; } else tmp = doEvery; } else tmp = true; ++__currentOffset; } 
break; case OpCode._BOUND: if(minLength > 0) { ++dontTry; --endOffset; } if(__currentOffset != beginOffset) { ch = __input[__currentOffset - 1]; tmp = OpCode._isWordCharacter(ch); } else tmp = OpCode._isWordCharacter(__previousChar); while(__currentOffset < endOffset) { ch = __input[__currentOffset]; if(tmp != OpCode._isWordCharacter(ch)){ tmp = !tmp; if(__tryExpression(expression, __currentOffset)) { success = true; break _mainLoop; } } ++__currentOffset; } if((minLength > 0 || tmp) && __tryExpression(expression, __currentOffset)) { success = true; break _mainLoop; } break; case OpCode._NBOUND: if(minLength > 0) { ++dontTry; --endOffset; } if(__currentOffset != beginOffset) { ch = __input[__currentOffset - 1]; tmp = OpCode._isWordCharacter(ch); } else tmp = OpCode._isWordCharacter(__previousChar); while(__currentOffset < endOffset) { ch = __input[__currentOffset]; if(tmp != OpCode._isWordCharacter(ch)) tmp = !tmp; else if(__tryExpression(expression, __currentOffset)) { success = true; break _mainLoop; } ++__currentOffset; } if((minLength > 0 || !tmp) && __tryExpression(expression, __currentOffset)) { success = true; break _mainLoop; } break; case OpCode._ALNUM: while(__currentOffset < endOffset) { ch = __input[__currentOffset]; if(OpCode._isWordCharacter(ch)) { if(tmp && __tryExpression(expression, __currentOffset)) { success = true; break _mainLoop; } else tmp = doEvery; } else tmp = true; ++__currentOffset; } break; case OpCode._NALNUM: while(__currentOffset < endOffset) { ch = __input[__currentOffset]; if(!OpCode._isWordCharacter(ch)) { if(tmp && __tryExpression(expression, __currentOffset)) { success = true; break _mainLoop; } else tmp = doEvery; } else tmp = true; ++__currentOffset; } break; case OpCode._SPACE: while(__currentOffset < endOffset) { if(Character.isWhitespace(__input[__currentOffset])) { if(tmp && __tryExpression(expression, __currentOffset)) { success = true; break _mainLoop; } else tmp = doEvery; } else tmp = true; ++__currentOffset; } break; 
case OpCode._NSPACE: while(__currentOffset < endOffset) { if(!Character.isWhitespace(__input[__currentOffset])) { if(tmp && __tryExpression(expression, __currentOffset)) { success = true; break _mainLoop; } else tmp = doEvery; } else tmp = true; ++__currentOffset; } break; case OpCode._DIGIT: while(__currentOffset < endOffset) { if(Character.isDigit(__input[__currentOffset])) { if(tmp && __tryExpression(expression, __currentOffset)) { success = true; break _mainLoop; } else tmp = doEvery; } else tmp = true; ++__currentOffset; } break; case OpCode._NDIGIT: while(__currentOffset < endOffset) { if(!Character.isDigit(__input[__currentOffset])) { if(tmp && __tryExpression(expression, __currentOffset)) { success = true; break _mainLoop; } else tmp = doEvery; } else tmp = true; ++__currentOffset; } break; } // end switch } else { if(minLength > 0) dontTry = minLength - 1; endOffset-=dontTry; do { if(__tryExpression(expression, __currentOffset)) { success = true; break _mainLoop; } } while(__currentOffset++ < endOffset); } break _mainLoop; } // end while __lastSuccess = success; __lastMatchResult = null; return success; } private boolean __tryExpression(Perl5Pattern expression, int offset) { int count; __inputOffset = offset; __lastParen = 0; __expSize = 0; if(__numParentheses > 0) { for(count=0; count <= __numParentheses; count++) { __beginMatchOffsets[count] = OpCode._NULL_OFFSET; __endMatchOffsets[count] = OpCode._NULL_OFFSET; } } if(__match(1)){ __beginMatchOffsets[0] = offset; __endMatchOffsets[0] = __inputOffset; return true; } return false; } private int __repeat(int offset, int max) { int scan, eol, operand, ret; char ch; scan = __inputOffset; eol = __eol; if(max != Character.MAX_VALUE && max < eol - scan) eol = scan + max; operand = OpCode._getOperand(offset); switch(__program[offset]) { case OpCode._ANY: while(scan < eol && __input[scan] != '\n') ++scan; break; case OpCode._SANY: scan = eol; break; case OpCode._EXACTLY: ++operand; while(scan < eol && 
__program[operand] == __input[scan]) ++scan; break; case OpCode._ANYOF: if(scan < eol && (ch = __input[scan]) < 256) { while((__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) { if(++scan < eol) ch = __input[scan]; else break; } } break; case OpCode._ALNUM: while(scan < eol && OpCode._isWordCharacter(__input[scan])) ++scan; break; case OpCode._NALNUM: while(scan < eol && !OpCode._isWordCharacter(__input[scan])) ++scan; break; case OpCode._SPACE: while(scan < eol && Character.isWhitespace(__input[scan])) ++scan; break; case OpCode._NSPACE: while(scan < eol && !Character.isWhitespace(__input[scan])) ++scan; break; case OpCode._DIGIT: while(scan < eol && Character.isDigit(__input[scan])) ++scan; break; case OpCode._NDIGIT: while(scan < eol && !Character.isDigit(__input[scan])) ++scan; break; default: break; } ret = scan - __inputOffset; __inputOffset = scan; return ret; } private boolean __match(int offset) { char nextChar, op; int scan, next, input, maxScan, current, line, arg; boolean inputRemains = true, minMod = false; Perl5Repetition rep; input = __inputOffset; inputRemains = (input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); scan = offset; maxScan = __program.length; while(scan < maxScan /*&& scan > 0*/){ next = OpCode._getNext(__program, scan); switch(op = __program[scan]) { case OpCode._BOL: if(input == __bol ? __previousChar == '\n' : (__multiline && (inputRemains || input < __eol) && __input[input - 1] == '\n')) break; return false; case OpCode._MBOL: if(input == __bol ? 
__previousChar == '\n' : ((inputRemains || input < __eol) && __input[input - 1] == '\n')) break; return false; case OpCode._SBOL: if(input == __bol && __previousChar == '\n') break; return false; case OpCode._GBOL: if(input == __bol) break; return true; case OpCode._EOL : if((inputRemains || input < __eol) && nextChar != '\n') return false; if(!__multiline && __eol - input > 1) return false; break; case OpCode._MEOL: if((inputRemains || input < __eol) && nextChar != '\n') return false; break; case OpCode._SEOL: if((inputRemains || input < __eol) && nextChar != '\n') return false; if(__eol - input > 1) return false; break; case OpCode._SANY: if(!inputRemains && input >= __eol) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._ANY: if((!inputRemains && input >= __eol) || nextChar == '\n') return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._EXACTLY: current = OpCode._getOperand(scan); line = __program[current++]; if(__program[current] != nextChar) return false; if(__eol - input < line) return false; if(line > 1 && !__compare(__program, current, __input, input, line)) return false; input+=line; inputRemains = (input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._ANYOF: current = OpCode._getOperand(scan); if(nextChar == __EOS && inputRemains) nextChar = __input[input]; if(nextChar >= 256 || (__program[current + (nextChar >> 4)] & (1 << (nextChar & 0xf))) != 0) return false; if(!inputRemains && input >= __eol) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._ALNUM: if(!inputRemains) return false; if(!OpCode._isWordCharacter(nextChar)) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? 
__input[input] : __EOS); break; case OpCode._NALNUM: if(!inputRemains && input >= __eol) return false; if(OpCode._isWordCharacter(nextChar)) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._NBOUND: case OpCode._BOUND: boolean a, b; if(input == __bol) a = OpCode._isWordCharacter(__previousChar); else a = OpCode._isWordCharacter(__input[input - 1]); b = OpCode._isWordCharacter(nextChar); if((a == b) == (__program[scan] == OpCode._BOUND)) return false; break; case OpCode._SPACE: if(!inputRemains && input >= __eol) return false; if(!Character.isWhitespace(nextChar)) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._NSPACE: if(!inputRemains) return false; if(Character.isWhitespace(nextChar)) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._DIGIT: if(!Character.isDigit(nextChar)) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._NDIGIT: if(!inputRemains && input >= __eol) return false; if(Character.isDigit(nextChar)) return false; inputRemains = (++input < __endOffset); nextChar = (inputRemains ? __input[input] : __EOS); break; case OpCode._REF: arg = OpCode._getArg1(__program, scan); current = __beginMatchOffsets[arg]; if(current == OpCode._NULL_OFFSET) return false; if(__endMatchOffsets[arg] == OpCode._NULL_OFFSET) return false; if(current == __endMatchOffsets[arg]) break; if(__input[current] != nextChar) return false; line = __endMatchOffsets[arg] - current; if(input + line > __eol) return false; if(line > 1 && !__compare(__input, current, __input, input, line)) return false; input+=line; inputRemains = (input < __endOffset); nextChar = (inputRemains ? 
__input[input] : __EOS); break; case OpCode._NOTHING: break; case OpCode._BACK: break; case OpCode._OPEN: arg = OpCode._getArg1(__program, scan); __beginMatchOffsets[arg] = input; if(arg > __expSize) __expSize = arg; break; case OpCode._CLOSE: arg = OpCode._getArg1(__program, scan); __endMatchOffsets[arg] = input; if(arg > __lastParen) __lastParen = arg; break; case OpCode._CURLYX: rep = new Perl5Repetition(); rep._lastRepetition = __currentRep; __currentRep = rep; rep._parenFloor = __lastParen; rep._numInstances = -1; rep._min = OpCode._getArg1(__program, scan); rep._max = OpCode._getArg2(__program, scan); rep._scan = OpCode._getNextOperator(scan) + 2; rep._next = next; rep._minMod = minMod; // Must initialize to -1 because if we initialize to 0 and are // at the beginning of the input the OpCode._WHILEM case will // not work right. rep._lastLocation = -1; __inputOffset = input; // use minMod as temporary minMod = __match(OpCode._getPrevOperator(next)); // leave scope call not pertinent? 
__currentRep = rep._lastRepetition; return minMod; case OpCode._WHILEM: rep = __currentRep; arg = rep._numInstances + 1; __inputOffset = input; if(input == rep._lastLocation) { __currentRep = rep._lastRepetition; line = __currentRep._numInstances; if(__match(rep._next)) return true; __currentRep._numInstances = line; __currentRep = rep; return false; } if(arg < rep._min) { rep._numInstances = arg; rep._lastLocation = input; if(__match(rep._scan)) return true; rep._numInstances = arg - 1; return false; } if(rep._minMod) { __currentRep = rep._lastRepetition; line = __currentRep._numInstances; if(__match(rep._next)) return true; __currentRep._numInstances = line; __currentRep = rep; if(arg >= rep._max) return false; __inputOffset = input; rep._numInstances = arg; rep._lastLocation = input; if(__match(rep._scan)) return true; rep._numInstances = arg - 1; return false; } if(arg < rep._max) { __pushState(rep._parenFloor); rep._numInstances = arg; rep._lastLocation = input; if(__match(rep._scan)) return true; __popState(); __inputOffset = input; } __currentRep = rep._lastRepetition; line = __currentRep._numInstances; if(__match(rep._next)) return true; rep._numInstances = line; __currentRep = rep; rep._numInstances = arg - 1; return false; case OpCode._BRANCH: if(__program[next] != OpCode._BRANCH) next = OpCode._getNextOperator(scan); else { int lastParen; lastParen = __lastParen; do { __inputOffset = input; if(__match(OpCode._getNextOperator(scan))) return true; for(arg = __lastParen; arg > lastParen; --arg) //__endMatchOffsets[arg] = 0; __endMatchOffsets[arg] = OpCode._NULL_OFFSET; __lastParen = arg; scan = OpCode._getNext(__program, scan); } while(scan != OpCode._NULL_OFFSET && __program[scan] == OpCode._BRANCH); return false; } break; case OpCode._MINMOD: minMod = true; break; case OpCode._CURLY: case OpCode._STAR: case OpCode._PLUS: if(op == OpCode._CURLY) { line = OpCode._getArg1(__program, scan); arg = OpCode._getArg2(__program, scan); scan = 
OpCode._getNextOperator(scan) + 2; } else if(op == OpCode._STAR) { line = 0; arg = Character.MAX_VALUE; scan = OpCode._getNextOperator(scan); } else { line = 1; arg = Character.MAX_VALUE; scan = OpCode._getNextOperator(scan); } if(__program[next] == OpCode._EXACTLY) { nextChar = __program[OpCode._getOperand(next) + 1]; current = 0; } else { nextChar = __EOS; current = -1000; } __inputOffset = input; if(minMod) { minMod = false; if(line > 0 && __repeat(scan, line) < line) return false; while(arg >= line || (arg == Character.MAX_VALUE && line > 0)) { // there may be a bug here with respect to // __inputOffset >= __input.length, but it seems to be right for // now. the issue is with __inputOffset being reset later. // is this test really supposed to happen here? if(current == -1000 || __inputOffset >= __endOffset || __input[__inputOffset] == nextChar) { if(__match(next)) return true; } __inputOffset = input + line; if(__repeat(scan, 1) != 0) { ++line; __inputOffset = input + line; } else return false; } } else { arg = __repeat(scan, arg); if(line < arg && OpCode._opType[__program[next]] == OpCode._EOL && (!__multiline || __program[next] == OpCode._SEOL)) line = arg; while(arg >= line) { // there may be a bug here with respect to // __inputOffset >= __input.length, but it seems to be right for // now. the issue is with __inputOffset being reset later. // is this test really supposed to happen here? if(current == -1000 || __inputOffset >= __endOffset || __input[__inputOffset] == nextChar) { if(__match(next)) return true; } --arg; __inputOffset = input + arg; } } return false; case OpCode._SUCCEED: case OpCode._END: __inputOffset = input; // This enforces the rule that two consecutive matches cannot have // the same end offset. 
if(__inputOffset == __lastMatchInputEndOffset) return false; return true; case OpCode._IFMATCH: __inputOffset = input; scan = OpCode._getNextOperator(scan); if(!__match(scan)) return false; break; case OpCode._UNLESSM: __inputOffset = input; scan = OpCode._getNextOperator(scan); if(__match(scan)) return false; break; default: // todo: Need to throw an exception here. } // end switch //scan = (next > 0 ? next : 0); scan = next; } // end while scan return false; } /** * Set whether or not subsequent calls to {@link #matches matches()} * or {@link #contains contains()} should treat the input as * consisting of multiple lines. The default behavior is for * input to be treated as consisting of multiple lines. This method * should only be called if the Perl5Pattern used for a match was * compiled without either of the Perl5Compiler.MULTILINE_MASK or * Perl5Compiler.SINGLELINE_MASK flags, and you want to alter the * behavior of how the ^, $, and . metacharacters are * interpreted on the fly. The compilation options used when compiling * a pattern ALWAYS override the behavior specified by setMultiline(). See * {@link Perl5Compiler} for more details. *

* @param multiline If set to true treats the input as consisting of * multiple lines with respect to the ^ and $ * metacharacters. If set to false treats the input as consisting * of a single line with respect to the ^ and $ * metacharacters. */ public void setMultiline(boolean multiline) { __multiline = multiline; } /** * @return True if the matcher is treating input as consisting of multiple * lines with respect to the ^ and $ metacharacters, * false otherwise. */ public boolean isMultiline() { return __multiline; } char[] _toLower(char[] input) { int current; char[] inp; // todo: // Certainly not the best way to do case insensitive matching. // Must definitely change this in some way, but for now we // do what Perl does and make a copy of the input, converting // it all to lowercase. This is truly better handled in the // compilation phase. inp = new char[input.length]; System.arraycopy(input, 0, inp, 0, input.length); input = inp; // todo: Need to inline toLowerCase() for(current = 0; current < input.length; current++) if(Character.isUpperCase(input[current])) input[current] = Character.toLowerCase(input[current]); return input; } /** * Determines if a prefix of a string (represented as a char[]) * matches a given pattern, starting from a given offset into the string. * If a prefix of the string matches the pattern, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. *

* This method is useful for certain common token identification tasks * that are made more difficult without this functionality. *

* @param input  The char[] to test for a prefix match.
 * @param pattern  The Pattern to be matched.
 * @param offset The offset at which to start searching for the prefix.
 * @return True if a prefix of input, starting at offset, matches pattern,
 *         false otherwise.
 */
public boolean matchesPrefix(char[] input, Pattern pattern, int offset) {
  final Perl5Pattern expression = (Perl5Pattern)pattern;

  // The untouched input is kept so the match text can be copied from it.
  __originalInput = input;

  // Case insensitive patterns are run against a lowercased copy.
  final char[] subject =
    (expression._isCaseInsensitive ? _toLower(input) : input);

  __initInterpreterGlobals(expression, subject, offset, subject.length);

  final boolean matched = __tryExpression(expression, offset);
  __lastSuccess = matched;
  __lastMatchResult = null;

  return matched;
}

/**
 * Determines if a prefix of a string (represented as a char[])
 * matches a given pattern.
 * If a prefix of the string matches the pattern, a MatchResult instance
 * representing the match is made accessible via
 * {@link #getMatch()}.
 *

* This method is useful for certain common token identification tasks * that are made more difficult without this functionality. *

* @param input  The char[] to test for a prefix match.
 * @param pattern  The Pattern to be matched.
 * @return True if a prefix of input matches pattern, false otherwise.
 */
public boolean matchesPrefix(char[] input, Pattern pattern) {
  // Same as the three argument form with the search starting at offset 0.
  final boolean matched = matchesPrefix(input, pattern, 0);
  return matched;
}

/**
 * Determines if a prefix of a string matches a given pattern.
 * If a prefix of the string matches the pattern, a MatchResult instance
 * representing the match is made accessible via
 * {@link #getMatch()}.
 *

* This method is useful for certain common token identification tasks * that are made more difficult without this functionality. *

* @param input  The String to test for a prefix match.
 * @param pattern  The Pattern to be matched.
 * @return True if a prefix of input matches pattern, false otherwise.
 */
public boolean matchesPrefix(String input, Pattern pattern) {
  // The matcher works on char arrays, so the String is copied into one
  // and tested from offset 0.
  final char[] characters = input.toCharArray();
  return matchesPrefix(characters, pattern, 0);
}

/**
 * Determines if a prefix of a PatternMatcherInput instance
 * matches a given pattern.  If there is a match, a MatchResult instance
 * representing the match is made accessible via
 * {@link #getMatch()}.  Unlike the
 * {@link #contains(PatternMatcherInput, Pattern)}
 * method, the current offset of the PatternMatcherInput argument
 * is not updated.  However, unlike the
 * {@link #matches matches(PatternMatcherInput, Pattern)} method,
 * matchesPrefix() will start its search from the current offset
 * rather than the begin offset of the PatternMatcherInput.
 *

* This method is useful for certain common token identification tasks * that are made more difficult without this functionality. *

* @param input The PatternMatcherInput to test for a prefix match. * @param pattern The Pattern to be matched. * @return True if input matches pattern, false otherwise. */ public boolean matchesPrefix(PatternMatcherInput input, Pattern pattern) { char[] inp; Perl5Pattern expression; expression = (Perl5Pattern)pattern; __originalInput = input._originalBuffer; if(expression._isCaseInsensitive) { if(input._toLowerBuffer == null) input._toLowerBuffer = _toLower(__originalInput); inp = input._toLowerBuffer; } else inp = __originalInput; __initInterpreterGlobals(expression, inp, input._currentOffset, input._endOffset); __lastSuccess = __tryExpression(expression, input._currentOffset); __lastMatchResult = null; return __lastSuccess; } /** * Determines if a string (represented as a char[]) exactly * matches a given pattern. If * there is an exact match, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. The pattern must be * a Perl5Pattern instance, otherwise a ClassCastException will * be thrown. You are not required to, and indeed should NOT try to * (for performance reasons), catch a ClassCastException because it * will never be thrown as long as you use a Perl5Pattern as the pattern * parameter. *

* Note: matches() is not the same as sticking a ^ in front of * your expression and a $ at the end of your expression in Perl5 * and using the =~ operator, even though in many cases it will be * equivalent. matches() literally looks for an exact match according * to the rules of Perl5 expression matching. Therefore, if you have * a pattern foo|foot and are matching the input foot * it will not produce an exact match. But foot|foo will * produce an exact match for either foot or foo. * Remember, Perl5 regular expressions do not match the longest * possible match. From the perlre manpage: *

* Alternatives are tried from left to right, so the first * alternative found for which the entire expression matches, * is the one that is chosen. This means that alternatives * are not necessarily greedy. For example: when matching * foo|foot against "barefoot", only the "foo" part will * match, as that is the first alternative tried, and it * successfully matches the target string. *
*

* @param input The char[] to test for an exact match. * @param pattern The Perl5Pattern to be matched. * @return True if input matches pattern, false otherwise. * @exception ClassCastException If a Pattern instance other than a * Perl5Pattern is passed as the pattern parameter. */ public boolean matches(char[] input, Pattern pattern) { Perl5Pattern expression; expression = (Perl5Pattern)pattern; __originalInput = input; if(expression._isCaseInsensitive) input = _toLower(input); /* if(__interpret(expression, input, 0, input.length)) { if(__lastMatchResult.beginOffset(0) == 0 && __lastMatchResult.endOffset(0) == input.length) return true; } */ __initInterpreterGlobals(expression, input, 0, input.length); __lastSuccess = (__tryExpression(expression, 0) && __endMatchOffsets[0] == input.length); __lastMatchResult = null; return __lastSuccess; } /** * Determines if a string exactly matches a given pattern. If * there is an exact match, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. The pattern must be * a Perl5Pattern instance, otherwise a ClassCastException will * be thrown. You are not required to, and indeed should NOT try to * (for performance reasons), catch a ClassCastException because it * will never be thrown as long as you use a Perl5Pattern as the pattern * parameter. *

* Note: matches() is not the same as sticking a ^ in front of * your expression and a $ at the end of your expression in Perl5 * and using the =~ operator, even though in many cases it will be * equivalent. matches() literally looks for an exact match according * to the rules of Perl5 expression matching. Therefore, if you have * a pattern foo|foot and are matching the input foot * it will not produce an exact match. But foot|foo will * produce an exact match for either foot or foo. * Remember, Perl5 regular expressions do not match the longest * possible match. From the perlre manpage: *

* Alternatives are tried from left to right, so the first * alternative found for which the entire expression matches, * is the one that is chosen. This means that alternatives * are not necessarily greedy. For example: when matching * foo|foot against "barefoot", only the "foo" part will * match, as that is the first alternative tried, and it * successfully matches the target string. *
*

* @param input The String to test for an exact match. * @param pattern The Perl5Pattern to be matched. * @return True if input matches pattern, false otherwise. * @exception ClassCastException If a Pattern instance other than a * Perl5Pattern is passed as the pattern parameter. */ public boolean matches(String input, Pattern pattern) { return matches(input.toCharArray(), pattern); } /** * Determines if the contents of a PatternMatcherInput instance * exactly matches a given pattern. If * there is an exact match, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. Unlike the * {@link #contains(PatternMatcherInput, Pattern)} * method, the current offset of the PatternMatcherInput argument * is not updated. You should remember that the region between * the begin (NOT the current) and end offsets of the PatternMatcherInput * will be tested for an exact match. *

* The pattern must be a Perl5Pattern instance, otherwise a * ClassCastException will be thrown. You are not required to, and * indeed should NOT try to (for performance reasons), catch a * ClassCastException because it will never be thrown as long as you use * a Perl5Pattern as the pattern parameter. *

* Note: matches() is not the same as sticking a ^ in front of * your expression and a $ at the end of your expression in Perl5 * and using the =~ operator, even though in many cases it will be * equivalent. matches() literally looks for an exact match according * to the rules of Perl5 expression matching. Therefore, if you have * a pattern foo|foot and are matching the input foot * it will not produce an exact match. But foot|foo will * produce an exact match for either foot or foo. * Remember, Perl5 regular expressions do not match the longest * possible match. From the perlre manpage: *

* Alternatives are tried from left to right, so the first * alternative found for which the entire expression matches, * is the one that is chosen. This means that alternatives * are not necessarily greedy. For example: when matching * foo|foot against "barefoot", only the "foo" part will * match, as that is the first alternative tried, and it * successfully matches the target string. *
*

* @param input The PatternMatcherInput to test for a match. * @param pattern The Perl5Pattern to be matched. * @return True if input matches pattern, false otherwise. * @exception ClassCastException If a Pattern instance other than a * Perl5Pattern is passed as the pattern parameter. */ public boolean matches(PatternMatcherInput input, Pattern pattern) { char[] inp; Perl5Pattern expression; expression = (Perl5Pattern)pattern; __originalInput = input._originalBuffer; if(expression._isCaseInsensitive) { if(input._toLowerBuffer == null) input._toLowerBuffer = _toLower(__originalInput); inp = input._toLowerBuffer; } else inp = __originalInput; /* if(__interpret(expression, inp, input._beginOffset, input._endOffset)) { // debug //System.err.println("contains: " + getMatch()); //System.err.println(__lastMatchResult.beginOffset(0) + "-" + //__lastMatchResult.endOffset(0)); //System.err.println(input._beginOffset + "-" + //input._endOffset); if(__lastMatchResult.beginOffset(0) == input._beginOffset && __lastMatchResult.endOffset(0) == input._endOffset) return true; // Handle special case. if(input.length() == 0 || (input._beginOffset == input._endOffset)) return true; } */ __initInterpreterGlobals(expression, inp, input._beginOffset, input._endOffset); __lastMatchResult = null; if(__tryExpression(expression, input._beginOffset)) { if(__endMatchOffsets[0] == input._endOffset || input.length() == 0 || input._beginOffset == input._endOffset) { __lastSuccess = true; return true; } } __lastSuccess = false; return false; } /** * Determines if a string contains a pattern. If the pattern is * matched by some substring of the input, a MatchResult instance * representing the first such match is made acessible via * {@link #getMatch()}. If you want to access * subsequent matches you should either use a PatternMatcherInput object * or use the offset information in the MatchResult to create a substring * representing the remaining input. 
Using the MatchResult offset * information is the recommended method of obtaining the parts of the * string preceding the match and following the match. *

* The pattern must be a Perl5Pattern instance, otherwise a * ClassCastException will be thrown. You are not required to, and * indeed should NOT try to (for performance reasons), catch a * ClassCastException because it will never be thrown as long as you use * a Perl5Pattern as the pattern parameter. *

* @param input The String to test for a match. * @param pattern The Perl5Pattern to be matched. * @return True if the input contains a pattern match, false otherwise. * @exception ClassCastException If a Pattern instance other than a * Perl5Pattern is passed as the pattern parameter. */ public boolean contains(String input, Pattern pattern) { /* char[] inp; Perl5Pattern expression; expression = (Perl5Pattern)pattern; __originalInput = inp = input.toCharArray(); if(expression._isCaseInsensitive) //_toLower(inp, false); inp = _toLower(inp, false); return __interpret(expression, inp, 0, inp.length); */ return contains(input.toCharArray(), pattern); } /** * Determines if a string (represented as a char[]) contains a pattern. * If the pattern is * matched by some substring of the input, a MatchResult instance * representing the first such match is made acessible via * {@link #getMatch()}. If you want to access * subsequent matches you should either use a PatternMatcherInput object * or use the offset information in the MatchResult to create a substring * representing the remaining input. Using the MatchResult offset * information is the recommended method of obtaining the parts of the * string preceeding the match and following the match. *

* The pattern must be a Perl5Pattern instance, otherwise a * ClassCastException will be thrown. You are not required to, and * indeed should NOT try to (for performance reasons), catch a * ClassCastException because it will never be thrown as long as you use * a Perl5Pattern as the pattern parameter. *

* @param input The char[] to test for a match. * @param pattern The Perl5Pattern to be matched. * @return True if the input contains a pattern match, false otherwise. * @exception ClassCastException If a Pattern instance other than a * Perl5Pattern is passed as the pattern parameter. */ public boolean contains(char[] input, Pattern pattern) { Perl5Pattern expression; expression = (Perl5Pattern)pattern; __originalInput = input; if(expression._isCaseInsensitive) input = _toLower(input); return __interpret(expression, input, 0, input.length); } private static final int __DEFAULT_LAST_MATCH_END_OFFSET = -100; private int __lastMatchInputEndOffset = __DEFAULT_LAST_MATCH_END_OFFSET; /** * Determines if the contents of a PatternMatcherInput, starting from the * current offset of the input contains a pattern. * If a pattern match is found, a MatchResult * instance representing the first such match is made acessible via * {@link #getMatch()}. The current offset of the * PatternMatcherInput is set to the offset corresponding to the end * of the match, so that a subsequent call to this method will continue * searching where the last call left off. You should remember that the * region between the begin and end offsets of the PatternMatcherInput are * considered the input to be searched, and that the current offset * of the PatternMatcherInput reflects where a search will start from. * Matches extending beyond the end offset of the PatternMatcherInput * will not be matched. In other words, a match must occur entirely * between the begin and end offsets of the input. See * {@link PatternMatcherInput} for more details. *

* As a side effect, if a match is found, the PatternMatcherInput match * offset information is updated. See the * {@link PatternMatcherInput#setMatchOffsets(int, int)} * method for more details. *

* The pattern must be a Perl5Pattern instance, otherwise a * ClassCastException will be thrown. You are not required to, and * indeed should NOT try to (for performance reasons), catch a * ClassCastException because it will never be thrown as long as you use * a Perl5Pattern as the pattern parameter. *

* This method is usually used in a loop as follows: *

   * PatternMatcher matcher;
   * PatternCompiler compiler;
   * Pattern pattern;
   * PatternMatcherInput input;
   * MatchResult result;
   *
   * compiler = new Perl5Compiler();
   * matcher  = new Perl5Matcher();
   *
   * try {
   *   pattern = compiler.compile(somePatternString);
   * } catch(MalformedPatternException e) {
   *   System.err.println("Bad pattern.");
   *   System.err.println(e.getMessage());
   *   return;
   * }
   *
   * input   = new PatternMatcherInput(someStringInput);
   *
   * while(matcher.contains(input, pattern)) {
   *   result = matcher.getMatch();  
   *   // Perform whatever processing on the result you want.
   * }
   *
   * 
*

* @param input The PatternMatcherInput to test for a match. * @param pattern The Pattern to be matched. * @return True if the input contains a pattern match, false otherwise. * @exception ClassCastException If a Pattern instance other than a * Perl5Pattern is passed as the pattern parameter. */ public boolean contains(PatternMatcherInput input, Pattern pattern) { char[] inp; Perl5Pattern expression; boolean matchFound; //if(input.length() > 0) { // We want to allow a null string to match at the end of the input // which is why we don't check endOfInput. Not sure if this is a // safe thing to do or not. if(input._currentOffset > input._endOffset) return false; //} /* else if(input._endOfInput()) return false; */ expression = (Perl5Pattern)pattern; __originalInput = input._originalBuffer; // Todo: // Really should only reduce to lowercase that part of the // input that is necessary, instead of the whole thing. // Adjust MatchResult offsets accordingly. Actually, pass an adjustment // value to __interpret. __originalInput = input._originalBuffer; if(expression._isCaseInsensitive) { if(input._toLowerBuffer == null) input._toLowerBuffer = _toLower(__originalInput); inp = input._toLowerBuffer; } else inp = __originalInput; __lastMatchInputEndOffset = input.getMatchEndOffset(); matchFound = __interpret(expression, inp, input._currentOffset, input._endOffset); if(matchFound) { input.setCurrentOffset(__endMatchOffsets[0]); input.setMatchOffsets(__beginMatchOffsets[0], __endMatchOffsets[0]); } else { input.setCurrentOffset(input._endOffset + 1); } // Restore so it doesn't interfere with other unrelated matches. __lastMatchInputEndOffset = __DEFAULT_LAST_MATCH_END_OFFSET; return matchFound; } /** * Fetches the last match found by a call to a matches() or contains() * method. If you plan on modifying the original search input, you * must call this method BEFORE you modify the original search input, * as a lazy evaluation technique is used to create the MatchResult. 
* This reduces the cost of pattern matching when you don't care about * the actual match and only care if the pattern occurs in the input. * Otherwise, a MatchResult would be created for every match found, * whether or not the MatchResult was later used by a call to getMatch(). *

* @return A MatchResult instance containing the pattern match found
   *         by the last call to any one of the matches() or contains()
   *         methods.  If no match was found by the last call, returns
   *         null.
   */
  public MatchResult getMatch() {
    if (__lastSuccess) {
      // The result object is only materialized on first request.
      if (__lastMatchResult == null) {
        __setLastMatchResult();
      }
      return __lastMatchResult;
    }

    // The most recent match attempt failed, so there is nothing to report.
    return null;
  }
}