search.h 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575
  1. /*
  2. **********************************************************************
  3. * Copyright (C) 2001-2011 IBM and others. All rights reserved.
  4. **********************************************************************
  5. * Date Name Description
  6. * 03/22/2000 helena Creation.
  7. **********************************************************************
  8. */
  9. #ifndef SEARCH_H
  10. #define SEARCH_H
  11. #include "unicode/utypes.h"
  12. /**
  13. * \file
  14. * \brief C++ API: SearchIterator object.
  15. */
  16. #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
  17. #include "unicode/uobject.h"
  18. #include "unicode/unistr.h"
  19. #include "unicode/chariter.h"
  20. #include "unicode/brkiter.h"
  21. #include "unicode/usearch.h"
  22. /**
  23. * @stable ICU 2.0
  24. */
  25. struct USearch;
  26. /**
  27. * @stable ICU 2.0
  28. */
  29. typedef struct USearch USearch;
  30. U_NAMESPACE_BEGIN
  31. /**
  32. *
  33. * <tt>SearchIterator</tt> is an abstract base class that provides
  34. * methods to search for a pattern within a text string. Instances of
  35. * <tt>SearchIterator</tt> maintain a current position and scan over the
  36. * target text, returning the indices at which the pattern is matched and the length
  37. * of each match.
  38. * <p>
  39. * <tt>SearchIterator</tt> defines a protocol for text searching.
  40. * Subclasses provide concrete implementations of various search algorithms.
  41. * For example, <tt>StringSearch</tt> implements language-sensitive pattern
  42. * matching based on the comparison rules defined in a
  43. * <tt>RuleBasedCollator</tt> object.
  44. * <p>
  45. * Other options for searching include using a BreakIterator to restrict
  46. * the points at which matches are detected.
  47. * <p>
  48. * <tt>SearchIterator</tt> provides an API that is similar to that of
  49. * other text iteration classes such as <tt>BreakIterator</tt>. Using
  50. * this class, it is easy to scan through text looking for all occurrences of
  51. * a given pattern. The following example uses a <tt>StringSearch</tt>
  52. * object to find all instances of "fox" in the target string. Any other
  53. * subclass of <tt>SearchIterator</tt> can be used in an identical
  54. * manner.
  55. * <pre><code>
  56. * UnicodeString target("The quick brown fox jumped over the lazy fox");
  57. * UnicodeString pattern("fox");
  58. *
  59. * SearchIterator *iter = new StringSearch(pattern, target);
  60. * UErrorCode error = U_ZERO_ERROR;
  61. * for (int pos = iter->first(error); pos != USEARCH_DONE;
  62. * pos = iter->next(error)) {
  63. * printf("Found match at %d pos, length is %d\n", pos,
  64. * iter->getMatchedLength());
  65. * }
  66. * </code></pre>
  67. *
  68. * @see StringSearch
  69. * @see RuleBasedCollator
  70. */
  71. class U_I18N_API SearchIterator : public UObject {
  72. public:
  73. // public constructors and destructors -------------------------------
  74. /**
  75. * Copy constructor that creates a SearchIterator instance with the same
  76. * behavior, and iterating over the same text.
  77. * @param other the SearchIterator instance to be copied.
  78. * @stable ICU 2.0
  79. */
  80. SearchIterator(const SearchIterator &other);
  81. /**
  82. * Destructor. Cleans up the search iterator data struct.
  83. * @stable ICU 2.0
  84. */
  85. virtual ~SearchIterator();
  86. // public get and set methods ----------------------------------------
  87. /**
  88. * Sets the index to point to the given position, and clears any state
  89. * that's affected.
  90. * <p>
  91. * This method takes the argument index and sets the position in the text
  92. * string accordingly without checking if the index is pointing to a
  93. * valid starting point to begin searching.
  94. * @param position within the text to be set. If position is less
  95. * than or greater than the text range for searching,
  96. * an U_INDEX_OUTOFBOUNDS_ERROR will be returned
  97. * @param status for errors if it occurs
  98. * @stable ICU 2.0
  99. */
  100. virtual void setOffset(int32_t position, UErrorCode &status) = 0;
  101. /**
  102. * Return the current index in the text being searched.
  103. * If the iteration has gone past the end of the text
  104. * (or past the beginning for a backwards search), USEARCH_DONE
  105. * is returned.
  106. * @return current index in the text being searched.
  107. * @stable ICU 2.0
  108. */
  109. virtual int32_t getOffset(void) const = 0;
  110. /**
  111. * Sets the text searching attributes located in the enum
  112. * USearchAttribute with values from the enum USearchAttributeValue.
  113. * USEARCH_DEFAULT can be used for all attributes for resetting.
  114. * @param attribute text attribute (enum USearchAttribute) to be set
  115. * @param value text attribute value
  116. * @param status for errors if it occurs
  117. * @stable ICU 2.0
  118. */
  119. void setAttribute(USearchAttribute attribute,
  120. USearchAttributeValue value,
  121. UErrorCode &status);
  122. /**
  123. * Gets the text searching attributes
  124. * @param attribute text attribute (enum USearchAttribute) to be retrieved
  125. * @return text attribute value
  126. * @stable ICU 2.0
  127. */
  128. USearchAttributeValue getAttribute(USearchAttribute attribute) const;
  129. /**
  130. * Returns the index to the match in the text string that was searched.
  131. * This call returns a valid result only after a successful call to
  132. * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
  133. * Just after construction, or after a searching method returns
  134. * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
  135. * <p>
  136. * Use getMatchedLength to get the matched string length.
  137. * @return index of a substring within the text string that is being
  138. * searched.
  139. * @see #first
  140. * @see #next
  141. * @see #previous
  142. * @see #last
  143. * @stable ICU 2.0
  144. */
  145. int32_t getMatchedStart(void) const;
  146. /**
  147. * Returns the length of text in the string which matches the search
  148. * pattern. This call returns a valid result only after a successful call
  149. * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
  150. * Just after construction, or after a searching method returns
  151. * <tt>USEARCH_DONE</tt>, this method will return 0.
  152. * @return The length of the match in the target text, or 0 if there
  153. * is no match currently.
  154. * @see #first
  155. * @see #next
  156. * @see #previous
  157. * @see #last
  158. * @stable ICU 2.0
  159. */
  160. int32_t getMatchedLength(void) const;
  161. /**
  162. * Returns the text that was matched by the most recent call to
  163. * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
  164. * If the iterator is not pointing at a valid match (e.g. just after
  165. * construction or after <tt>USEARCH_DONE</tt> has been returned),
  166. * returns an empty string.
  167. * @param result stores the matched string or an empty string if a match
  168. * is not found.
  169. * @see #first
  170. * @see #next
  171. * @see #previous
  172. * @see #last
  173. * @stable ICU 2.0
  174. */
  175. void getMatchedText(UnicodeString &result) const;
  176. /**
  177. * Set the BreakIterator that will be used to restrict the points
  178. * at which matches are detected. The user is responsible for deleting
  179. * the BreakIterator.
  180. * @param breakiter A BreakIterator that will be used to restrict the
  181. * points at which matches are detected. If a match is
  182. * found, but the match's start or end index is not a
  183. * boundary as determined by the <tt>BreakIterator</tt>,
  184. * the match will be rejected and another will be searched
  185. * for. If this parameter is <tt>NULL</tt>, no break
  186. * detection is attempted.
  187. * @param status for errors if it occurs
  188. * @see BreakIterator
  189. * @stable ICU 2.0
  190. */
  191. void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
  192. /**
  193. * Returns the BreakIterator that is used to restrict the points at
  194. * which matches are detected. This will be the same object that was
  195. * passed to the constructor or to <tt>setBreakIterator</tt>.
  196. * Note that <tt>NULL</tt> is a legal value; it means that break
  197. * detection should not be attempted.
  198. * @return BreakIterator used to restrict matchings.
  199. * @see #setBreakIterator
  200. * @stable ICU 2.0
  201. */
  202. const BreakIterator * getBreakIterator(void) const;
  203. /**
  204. * Set the string text to be searched. Text iteration will hence begin at
  205. * the start of the text string. This method is useful if you want to
  206. * re-use an iterator to search for the same pattern within a different
  207. * body of text. The user is responsible for deleting the text.
  208. * @param text string to be searched.
  209. * @param status for errors. If the text length is 0,
  210. * an U_ILLEGAL_ARGUMENT_ERROR is returned.
  211. * @stable ICU 2.0
  212. */
  213. virtual void setText(const UnicodeString &text, UErrorCode &status);
  214. /**
  215. * Set the string text to be searched. Text iteration will hence begin at
  216. * the start of the text string. This method is useful if you want to
  217. * re-use an iterator to search for the same pattern within a different
  218. * body of text.
  219. * <p>
  220. * Note: No parsing of the text within the <tt>CharacterIterator</tt>
  221. * will be done during searching for this version. The block of text
  222. * in <tt>CharacterIterator</tt> will be used as it is.
  223. * The user is responsible for deleting the text.
  224. * @param text string iterator to be searched.
  225. * @param status for errors if any. If the text length is 0 then an
  226. * U_ILLEGAL_ARGUMENT_ERROR is returned.
  227. * @stable ICU 2.0
  228. */
  229. virtual void setText(CharacterIterator &text, UErrorCode &status);
  230. /**
  231. * Return the string text to be searched.
  232. * @return text string to be searched.
  233. * @stable ICU 2.0
  234. */
  235. const UnicodeString & getText(void) const;
  236. // operator overloading ----------------------------------------------
  237. /**
  238. * Equality operator.
  239. * @param that SearchIterator instance to be compared.
  240. * @return TRUE if both SearchIterators are of the same class, have the
  241. * same behavior, iterate over the same text and have the same
  242. * attributes. FALSE otherwise.
  243. * @stable ICU 2.0
  244. */
  245. virtual UBool operator==(const SearchIterator &that) const;
  246. /**
  247. * Not-equal operator.
  248. * @param that SearchIterator instance to be compared.
  249. * @return FALSE if operator== returns TRUE, and vice versa.
  250. * @stable ICU 2.0
  251. */
  252. UBool operator!=(const SearchIterator &that) const;
  253. // public methods ----------------------------------------------------
  254. /**
  255. * Returns a copy of SearchIterator with the same behavior, and
  256. * iterating over the same text, as this one. Note that all data will be
  257. * replicated, except for the text string to be searched.
  258. * @return cloned object
  259. * @stable ICU 2.0
  260. */
  261. virtual SearchIterator* safeClone(void) const = 0;
  262. /**
  263. * Returns the first index at which the string text matches the search
  264. * pattern. The iterator is adjusted so that its current index (as
  265. * returned by <tt>getOffset</tt>) is the match position if one
  266. * was found.
  267. * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
  268. * the iterator will be adjusted to the index USEARCH_DONE.
  269. * @param status for errors if it occurs
  270. * @return The character index of the first match, or
  271. * <tt>USEARCH_DONE</tt> if there are no matches.
  272. * @see #getOffset
  273. * @stable ICU 2.0
  274. */
  275. int32_t first(UErrorCode &status);
  276. /**
  277. * Returns the first index equal or greater than <tt>position</tt> at which the
  278. * string text matches the search pattern. The iterator is adjusted so
  279. * that its current index (as returned by <tt>getOffset</tt>) is the
  280. * match position if one was found.
  281. * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the
  282. * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
  283. * @param position where search is to start from. If position is less
  284. * than or greater than the text range for searching,
  285. * an U_INDEX_OUTOFBOUNDS_ERROR will be returned
  286. * @param status for errors if it occurs
  287. * @return The character index of the first match following
  288. * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no
  289. * matches.
  290. * @see #getOffset
  291. * @stable ICU 2.0
  292. */
  293. int32_t following(int32_t position, UErrorCode &status);
  294. /**
  295. * Returns the last index in the target text at which it matches the
  296. * search pattern. The iterator is adjusted so that its current index
  297. * (as returned by <tt>getOffset</tt>) is the match position if one was
  298. * found.
  299. * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
  300. * the iterator will be adjusted to the index USEARCH_DONE.
  301. * @param status for errors if it occurs
  302. * @return The index of the last match, or <tt>USEARCH_DONE</tt> if
  303. * there are no matches.
  304. * @see #getOffset
  305. * @stable ICU 2.0
  306. */
  307. int32_t last(UErrorCode &status);
  308. /**
  309. * Returns the first index less than <tt>position</tt> at which the string
  310. * text matches the search pattern. The iterator is adjusted so that its
  311. * current index (as returned by <tt>getOffset</tt>) is the match
  312. * position if one was found. If a match is not found,
  313. * <tt>USEARCH_DONE</tt> will be returned and the iterator will be
  314. * adjusted to the index USEARCH_DONE.
  315. * <p>
  316. * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
  317. * result match is always less than <tt>position</tt>.
  318. * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across
  319. * <tt>position</tt>.
  320. *
  321. * @param position where search is to start from. If position is less
  322. * than or greater than the text range for searching,
  323. * an U_INDEX_OUTOFBOUNDS_ERROR will be returned
  324. * @param status for errors if it occurs
  325. * @return The character index of the first match preceding
  326. * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are
  327. * no matches.
  328. * @see #getOffset
  329. * @stable ICU 2.0
  330. */
  331. int32_t preceding(int32_t position, UErrorCode &status);
  332. /**
  333. * Returns the index of the next point at which the text matches the
  334. * search pattern, starting from the current position.
  335. * The iterator is adjusted so that its current index (as returned by
  336. * <tt>getOffset</tt>) is the match position if one was found.
  337. * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
  338. * the iterator will be adjusted to a position after the end of the text
  339. * string.
  340. * @param status for errors if it occurs
  341. * @return The index of the next match after the current position,
  342. * or <tt>USEARCH_DONE</tt> if there are no more matches.
  343. * @see #getOffset
  344. * @stable ICU 2.0
  345. */
  346. int32_t next(UErrorCode &status);
  347. /**
  348. * Returns the index of the previous point at which the string text
  349. * matches the search pattern, starting at the current position.
  350. * The iterator is adjusted so that its current index (as returned by
  351. * <tt>getOffset</tt>) is the match position if one was found.
  352. * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
  353. * the iterator will be adjusted to the index USEARCH_DONE.
  354. * @param status for errors if it occurs
  355. * @return The index of the previous match before the current position,
  356. * or <tt>USEARCH_DONE</tt> if there are no more matches.
  357. * @see #getOffset
  358. * @stable ICU 2.0
  359. */
  360. int32_t previous(UErrorCode &status);
  361. /**
  362. * Resets the iteration.
  363. * Search will begin at the start of the text string if a forward
  364. * iteration is initiated before a backwards iteration. Otherwise if a
  365. * backwards iteration is initiated before a forwards iteration, the
  366. * search will begin at the end of the text string.
  367. * @stable ICU 2.0
  368. */
  369. virtual void reset();
  370. protected:
  371. // protected data members ---------------------------------------------
  372. /**
  373. * C search data struct
  374. * @stable ICU 2.0
  375. */
  376. USearch *m_search_;
  377. /**
  378. * Break iterator.
  379. * Currently the C++ breakiterator does not have getRules etc to reproduce
  380. * another in C. Hence we keep the original around and do the verification
  381. * at the end of the match. The user is responsible for deleting this
  382. * break iterator.
  383. * @stable ICU 2.0
  384. */
  385. BreakIterator *m_breakiterator_;
  386. /**
  387. * Unicode string version of the search text
  388. * @stable ICU 2.0
  389. */
  390. UnicodeString m_text_;
  391. // protected constructors and destructors -----------------------------
  392. /**
  393. * Default constructor.
  394. * Initializes data to the default values.
  395. * @stable ICU 2.0
  396. */
  397. SearchIterator();
  398. /**
  399. * Constructor for use by subclasses.
  400. * @param text The target text to be searched.
  401. * @param breakiter A {@link BreakIterator} that is used to restrict the
  402. * points at which matches are detected. If
  403. * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
  404. * match, but the match's start or end index is not a
  405. * boundary as determined by the <tt>BreakIterator</tt>,
  406. * the match is rejected and <tt>handleNext</tt> or
  407. * <tt>handlePrev</tt> is called again. If this parameter
  408. * is <tt>NULL</tt>, no break detection is attempted.
  409. * @see #handleNext
  410. * @see #handlePrev
  411. * @stable ICU 2.0
  412. */
  413. SearchIterator(const UnicodeString &text,
  414. BreakIterator *breakiter = NULL);
  415. /**
  416. * Constructor for use by subclasses.
  417. * <p>
  418. * Note: No parsing of the text within the <tt>CharacterIterator</tt>
  419. * will be done during searching for this version. The block of text
  420. * in <tt>CharacterIterator</tt> will be used as it is.
  421. * @param text The target text to be searched.
  422. * @param breakiter A {@link BreakIterator} that is used to restrict the
  423. * points at which matches are detected. If
  424. * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
  425. * match, but the match's start or end index is not a
  426. * boundary as determined by the <tt>BreakIterator</tt>,
  427. * the match is rejected and <tt>handleNext</tt> or
  428. * <tt>handlePrev</tt> is called again. If this parameter
  429. * is <tt>NULL</tt>, no break detection is attempted.
  430. * @see #handleNext
  431. * @see #handlePrev
  432. * @stable ICU 2.0
  433. */
  434. SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL);
  435. // protected methods --------------------------------------------------
  436. /**
  437. * Assignment operator. Sets this iterator to have the same behavior,
  438. * and iterate over the same text, as the one passed in.
  439. * @param that instance to be copied.
  440. * @stable ICU 2.0
  441. */
  442. SearchIterator & operator=(const SearchIterator &that);
  443. /**
  444. * Abstract method which subclasses override to provide the mechanism
  445. * for finding the next match in the target text. This allows different
  446. * subclasses to provide different search algorithms.
  447. * <p>
  448. * If a match is found, the implementation should return the index at
  449. * which the match starts and should call
  450. * <tt>setMatchLength</tt> with the number of characters
  451. * in the target text that make up the match. If no match is found, the
  452. * method should return USEARCH_DONE.
  453. * <p>
  454. * @param position The index in the target text at which the search
  455. * should start.
  456. * @param status for error codes if it occurs.
  457. * @return index at which the match starts, else if match is not found
  458. * USEARCH_DONE is returned
  459. * @see #setMatchLength
  460. * @stable ICU 2.0
  461. */
  462. virtual int32_t handleNext(int32_t position, UErrorCode &status)
  463. = 0;
  464. /**
  465. * Abstract method which subclasses override to provide the mechanism for
  466. * finding the previous match in the target text. This allows different
  467. * subclasses to provide different search algorithms.
  468. * <p>
  469. * If a match is found, the implementation should return the index at
  470. * which the match starts and should call
  471. * <tt>setMatchLength</tt> with the number of characters
  472. * in the target text that make up the match. If no match is found, the
  473. * method should return USEARCH_DONE.
  474. * <p>
  475. * @param position The index in the target text at which the search
  476. * should start.
  477. * @param status for error codes if it occurs.
  478. * @return index at which the match starts, else if match is not found
  479. * USEARCH_DONE is returned
  480. * @see #setMatchLength
  481. * @stable ICU 2.0
  482. */
  483. virtual int32_t handlePrev(int32_t position, UErrorCode &status)
  484. = 0;
  485. /**
  486. * Sets the length of the currently matched string in the text string to
  487. * be searched.
  488. * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
  489. * methods should call this when they find a match in the target text.
  490. * @param length length of the matched text.
  491. * @see #handleNext
  492. * @see #handlePrev
  493. * @stable ICU 2.0
  494. */
  495. virtual void setMatchLength(int32_t length);
  496. /**
  497. * Sets the offset of the currently matched string in the text string to
  498. * be searched.
  499. * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
  500. * methods should call this when they find a match in the target text.
  501. * @param position start offset of the matched text.
  502. * @see #handleNext
  503. * @see #handlePrev
  504. * @stable ICU 2.0
  505. */
  506. virtual void setMatchStart(int32_t position);
  507. /**
  508. * Sets match not found.
  509. * @stable ICU 2.0
  510. */
  511. void setMatchNotFound();
  512. };
  513. inline UBool SearchIterator::operator!=(const SearchIterator &that) const
  514. {
  515. return !(this->operator==(that)); // inequality is the negation of the virtual operator==
  516. }
  517. U_NAMESPACE_END
  518. #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */
  519. #endif