/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

/**
 * Tokenizes a string based on delimiters (separators)
 * and supporting quoting and ignored character concepts.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the <code>ListIterator</code> interface. By default, it is set up
 * like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next String by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 *
 * <table>
 *  <caption>StrTokenizer properties and options</caption>
 *  <tr>
 *   <th>Property</th><th>Type</th><th>Default</th>
 *  </tr>
 *  <tr>
 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 *  </tr>
 *  <tr>
 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 *  </tr>
 *  <tr>
 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 *  </tr>
 *  <tr>
 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 *  </tr>
 * </table>
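 *
 * <p>
 * A short usage sketch (illustrative input only):
 * <pre>
 * StrTokenizer tok = new StrTokenizer("alpha beta  gamma");
 * while (tok.hasNext()) {
 *     String token = tok.next();   // "alpha", then "beta", then "gamma"
 * }
 * </pre>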
 *
 * @since 1.0
 * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
 */
@Deprecated
public class StrTokenizer implements ListIterator<String>, Cloneable {

    /** Comma separated values tokenizer internal variable. */
    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    /** Tab separated values tokenizer internal variable. */
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens. */
    private String[] tokens;
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    //-----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
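     * <p>
     * A minimal illustration (hypothetical input line):
     * <pre>
     * // yields the tokens "a", "b,c" and "d"; the quoted comma is not a delimiter
     * List&lt;String&gt; fields = StrTokenizer.getCSVInstance("a, \"b,c\" ,d").getTokenList();
     * </pre>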
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     *
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmer method).
     *
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    //-----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StrTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
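     * <p>
     * For example (illustrative input): <code>new StrTokenizer("a:b:c", ':')</code>
     * yields the tokens <code>"a"</code>, <code>"b"</code> and <code>"c"</code>.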
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed, cloned internally
     */
    public StrTokenizer(final char[] input) {
        super();
        if (input == null) {
            this.chars = null;
        } else {
            this.chars = input.clone();
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input the string which is to be parsed, cloned internally
     * @param delim the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input the string which is to be parsed, cloned internally
     * @param delim the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed, cloned internally
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
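     * <p>
     * For example (illustrative input): <code>new StrTokenizer("a;'b;c'".toCharArray(), ';', '\'')</code>
     * yields the tokens <code>"a"</code> and <code>"b;c"</code>.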
     *
     * @param input the string which is to be parsed, cloned internally
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed, cloned internally
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    //-----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a List of String
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        Collections.addAll(list, tokens);
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
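     * <p>
     * A small reuse sketch (the <code>lines</code> array of char arrays is hypothetical):
     * <pre>
     * StrTokenizer tok = new StrTokenizer();          // default whitespace splitting
     * for (char[] line : lines) {
     *     List&lt;String&gt; fields = tok.reset(line).getTokenList();
     * }
     * </pre>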
     *
     * @param input the new character array to tokenize, cloned internally, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        if (input != null) {
            this.chars = input.clone();
        } else {
            this.chars = null;
        }
        return this;
    }

    // ListIterator
    //-----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there is no previous element
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    //-----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not then does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * <p>
     * <code>StrTokenizer</code> will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
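     * <p>
     * An illustrative sketch of a filtering subclass (not part of this API):
     * <pre>
     * StrTokenizer upper = new StrTokenizer("a b c") {
     *     protected List&lt;String&gt; tokenize(final char[] srcChars, final int offset, final int count) {
     *         final List&lt;String&gt; tokens = new ArrayList&lt;&gt;();
     *         for (final String token : super.tokenize(srcChars, offset, count)) {
     *             tokens.add(token.toUpperCase());   // post-process each parsed token
     *         }
     *         return tokens;
     *     }
     * };
     * </pre>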
     *
     * @param srcChars the character array being tokenized, may be null
     * @param offset the start position within the character array, must be valid
     * @param count the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, "");
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list the list to add to
     * @param tok the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
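     * <p>
     * The sequence is: skip any ignored or trimmed characters, then test for a
     * delimiter (which produces an empty token), then for an opening quote, and
     * finally read the token itself via <code>readWithQuotes</code>.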
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars,
                              int start,
                              final int len,
                              final StrBuilder workArea,
                              final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0
                    || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, "");
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, "");
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars the character array being tokenized
     * @param pos the position to check for a quote
     * @param len the length of the character array being tokenized
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars,
                            final int pos,
                            final int len,
                            final int quoteStart,
                            final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    //-----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    // Quote
    //-----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use quoting (the none matcher).
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     *
     * @param quote the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    //-----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     *
     * @param ignored the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     *
     * @param ignored the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    // Trimmer
    //-----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
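     * <p>
     * For example (illustrative input), with a comma delimiter and
     * <code>StrMatcher.trimMatcher()</code> as the trimmer, the input
     * <code>" a , b "</code> yields the tokens <code>"a"</code> and <code>"b"</code>.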
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     *
     * @param trimmer the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    //-----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    //-----------------------------------------------------------------------
    /**
     * Gets a String representation of the tokenizer, including the parsed
     * token list if tokenization has already occurred.
     *
     * @return the string representation
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}