chariter.h 24 KB


  1. /*
  2. ********************************************************************
  3. *
  4. * Copyright (C) 1997-2011, International Business Machines
  5. * Corporation and others. All Rights Reserved.
  6. *
  7. ********************************************************************
  8. */
  9. #ifndef CHARITER_H
  10. #define CHARITER_H
  11. #include "unicode/utypes.h"
  12. #include "unicode/uobject.h"
  13. #include "unicode/unistr.h"
  14. /**
  15. * \file
  16. * \brief C++ API: Character Iterator
  17. */
  18. U_NAMESPACE_BEGIN
  19. /**
  20. * Abstract class that defines an API for forward-only iteration
  21. * on text objects.
  22. * This is a minimal interface for iteration without random access
  23. * or backwards iteration. It is especially useful for wrapping
  24. * streams with converters into an object for collation or
  25. * normalization.
  26. *
  27. * <p>Characters can be accessed in two ways: as code units or as
  28. * code points.
  29. * Unicode code points are 21-bit integers and are the scalar values
  30. * of Unicode characters. ICU uses the type UChar32 for them.
  31. * Unicode code units are the storage units of a given
  32. * Unicode/UCS Transformation Format (a character encoding scheme).
  33. * With UTF-16, all code points can be represented with either one
  34. * or two code units ("surrogates").
  35. * String storage is typically based on code units, while properties
  36. * of characters are typically determined using code point values.
  37. * Some processes may be designed to work with sequences of code units,
  38. * or it may be known that all characters that are important to an
  39. * algorithm can be represented with single code units.
  40. * Other processes will need to use the code point access functions.</p>
  41. *
  42. * <p>ForwardCharacterIterator provides nextPostInc() to access
  43. * a code unit and advance an internal position into the text object,
  44. * similar to a <code>return text[position++]</code>.<br>
  45. * It provides next32PostInc() to access a code point and advance an internal
  46. * position.</p>
  47. *
  48. * <p>next32PostInc() assumes that the current position is that of
  49. * the beginning of a code point, i.e., of its first code unit.
  50. * After next32PostInc(), this will be true again.
  51. * In general, access to code units and code points in the same
  52. * iteration loop should not be mixed. In UTF-16, if the current position
  53. * is on a second code unit (Low Surrogate), then only that code unit
  54. * is returned even by next32PostInc().</p>
  55. *
  56. * <p>For iteration with either function, there are two ways to
  57. * check for the end of the iteration. When there are no more
  58. * characters in the text object:
  59. * <ul>
  60. * <li>The hasNext() function returns FALSE.</li>
  61. * <li>nextPostInc() and next32PostInc() return DONE
  62. * when one attempts to read beyond the end of the text object.</li>
  63. * </ul>
  64. *
  65. * Example:
  66. * \code
  67. * void function1(ForwardCharacterIterator &it) {
  68. * UChar32 c;
  69. * while(it.hasNext()) {
  70. * c=it.next32PostInc();
  71. * // use c
  72. * }
  73. * }
  74. *
  75. * void function2(ForwardCharacterIterator &it) {
  76. * UChar c;
  77. * while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
  78. * // use c
  79. * }
  80. * }
  81. * \endcode
  82. * </p>
  83. *
  84. * @stable ICU 2.0
  85. */
  86. class U_COMMON_API ForwardCharacterIterator : public UObject {
  87. public:
  88. /**
  89. * Value returned by most of ForwardCharacterIterator's functions
  90. * when the iterator has reached the limits of its iteration.
  91. * @stable ICU 2.0
  92. */
  93. enum { DONE = 0xffff };
  94. /**
  95. * Destructor.
  96. * @stable ICU 2.0
  97. */
  98. virtual ~ForwardCharacterIterator();
  99. /**
  100. * Returns true when both iterators refer to the same
  101. * character in the same character-storage object.
  102. * @param that The ForwardCharacterIterator to be compared for equality
  103. * @return true when both iterators refer to the same
  104. * character in the same character-storage object
  105. * @stable ICU 2.0
  106. */
  107. virtual UBool operator==(const ForwardCharacterIterator& that) const = 0;
  108. /**
  109. * Returns true when the iterators refer to different
  110. * text-storage objects, or to different characters in the
  111. * same text-storage object.
  112. * @param that The ForwardCharacterIterator to be compared for inequality
  113. * @return true when the iterators refer to different
  114. * text-storage objects, or to different characters in the
  115. * same text-storage object
  116. * @stable ICU 2.0
  117. */
  118. inline UBool operator!=(const ForwardCharacterIterator& that) const;
  119. /**
  120. * Generates a hash code for this iterator.
  121. * @return the hash code.
  122. * @stable ICU 2.0
  123. */
  124. virtual int32_t hashCode(void) const = 0;
  125. /**
  126. * Returns a UClassID for this ForwardCharacterIterator ("poor man's
  127. * RTTI").<P> Despite the fact that this function is public,
  128. * DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!
  129. * @return a UClassID for this ForwardCharacterIterator
  130. * @stable ICU 2.0
  131. */
  132. virtual UClassID getDynamicClassID(void) const = 0;
  133. /**
  134. * Gets the current code unit for returning and advances to the next code unit
  135. * in the iteration range
  136. * (toward endIndex()). If there are
  137. * no more code units to return, returns DONE.
  138. * @return the current code unit.
  139. * @stable ICU 2.0
  140. */
  141. virtual UChar nextPostInc(void) = 0;
  142. /**
  143. * Gets the current code point for returning and advances to the next code point
  144. * in the iteration range
  145. * (toward endIndex()). If there are
  146. * no more code points to return, returns DONE.
  147. * @return the current code point.
  148. * @stable ICU 2.0
  149. */
  150. virtual UChar32 next32PostInc(void) = 0;
  151. /**
  152. * Returns FALSE if there are no more code units or code points
  153. * at or after the current position in the iteration range.
  154. * This is used with nextPostInc() or next32PostInc() in forward
  155. * iteration.
  156. * @returns FALSE if there are no more code units or code points
  157. * at or after the current position in the iteration range.
  158. * @stable ICU 2.0
  159. */
  160. virtual UBool hasNext() = 0;
  161. protected:
  162. /** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
  163. ForwardCharacterIterator();
  164. /** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
  165. ForwardCharacterIterator(const ForwardCharacterIterator &other);
  166. /**
  167. * Assignment operator to be overridden in the implementing class.
  168. * @stable ICU 2.0
  169. */
  170. ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; }
  171. };
  172. /**
  173. * Abstract class that defines an API for iteration
  174. * on text objects.
  175. * This is an interface for forward and backward iteration
  176. * and random access into a text object.
  177. *
  178. * <p>The API provides backward compatibility to the Java and older ICU
  179. * CharacterIterator classes but extends them significantly:
  180. * <ol>
  181. * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
  182. * <li>While the old API functions provided forward iteration with
  183. * "pre-increment" semantics, the new one also provides functions
  184. * with "post-increment" semantics. They are more efficient and should
  185. * be the preferred iterator functions for new implementations.
  186. * The backward iteration always had "pre-decrement" semantics, which
  187. * are efficient.</li>
  188. * <li>Just like ForwardCharacterIterator, it provides access to
  189. * both code units and code points. Code point access versions are available
  190. * for the old and the new iteration semantics.</li>
  191. * <li>There are new functions for setting and moving the current position
  192. * without returning a character, for efficiency.</li>
  193. * </ol>
  194. *
  195. * See ForwardCharacterIterator for examples for using the new forward iteration
  196. * functions. For backward iteration, there is also a hasPrevious() function
  197. * that can be used analogously to hasNext().
  198. * The old functions work as before and are shown below.</p>
  199. *
  200. * <p>Examples for some of the new functions:</p>
  201. *
  202. * Forward iteration with hasNext():
  203. * \code
  204. * void forward1(CharacterIterator &it) {
  205. * UChar32 c;
  206. * for(it.setToStart(); it.hasNext();) {
  207. * c=it.next32PostInc();
  208. * // use c
  209. * }
  210. * }
  211. * \endcode
  212. * Forward iteration more similar to loops with the old forward iteration,
  213. * showing a way to convert simple for() loops:
  214. * \code
  215. * void forward2(CharacterIterator &it) {
  216. * UChar c;
  217. * for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
  218. * // use c
  219. * }
  220. * }
  221. * \endcode
  222. * Backward iteration with setToEnd() and hasPrevious():
  223. * \code
  224. * void backward1(CharacterIterator &it) {
  225. * UChar32 c;
  226. * for(it.setToEnd(); it.hasPrevious();) {
  227. * c=it.previous32();
  228. * // use c
  229. * }
  230. * }
  231. * \endcode
  232. * Backward iteration with a more traditional for() loop:
  233. * \code
  234. * void backward2(CharacterIterator &it) {
  235. * UChar c;
  236. * for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
  237. * // use c
  238. * }
  239. * }
  240. * \endcode
  241. *
  242. * Example for random access:
  243. * \code
  244. * void random(CharacterIterator &it) {
  245. * // set to the third code point from the beginning
  246. * it.move32(3, CharacterIterator::kStart);
  247. * // get a code point from here without moving the position
  248. * UChar32 c=it.current32();
  249. * // get the position
  250. * int32_t pos=it.getIndex();
  251. * // get the previous code unit
  252. * UChar u=it.previous();
  253. * // move back one more code unit
  254. * it.move(-1, CharacterIterator::kCurrent);
  255. * // set the position back to where it was
  256. * // and read the same code point c and move beyond it
  257. * it.setIndex(pos);
  258. * if(c!=it.next32PostInc()) {
  259. * exit(1); // CharacterIterator inconsistent
  260. * }
  261. * }
  262. * \endcode
  263. *
  264. * <p>Examples, especially for the old API:</p>
  265. *
  266. * Function processing characters, in this example simple output
  267. * <pre>
  268. * \code
  269. * void processChar( UChar c )
  270. * {
  271. * cout << " " << c;
  272. * }
  273. * \endcode
  274. * </pre>
  275. * Traverse the text from start to finish
  276. * <pre>
  277. * \code
  278. * void traverseForward(CharacterIterator& iter)
  279. * {
  280. * for(UChar c = iter.first(); c != CharacterIterator::DONE; c = iter.next()) {
  281. * processChar(c);
  282. * }
  283. * }
  284. * \endcode
  285. * </pre>
  286. * Traverse the text backwards, from end to start
  287. * <pre>
  288. * \code
  289. * void traverseBackward(CharacterIterator& iter)
  290. * {
  291. * for(UChar c = iter.last(); c != CharacterIterator::DONE; c = iter.previous()) {
  292. * processChar(c);
  293. * }
  294. * }
  295. * \endcode
  296. * </pre>
  297. * Traverse both forward and backward from a given position in the text.
  298. * The calls to Unicode::isLetter() and Unicode::isDigit() in this example represent some additional stopping criteria.
  299. * <pre>
  300. * \code
  301. * void traverseOut(CharacterIterator& iter, int32_t pos)
  302. * {
  303. * UChar c;
  304. * for (c = iter.setIndex(pos);
  305. * c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
  306. * c = iter.next()) {}
  307. * int32_t end = iter.getIndex();
  308. * for (c = iter.setIndex(pos);
  309. * c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
  310. * c = iter.previous()) {}
  311. * int32_t start = iter.getIndex() + 1;
  312. *
  313. * cout << "start: " << start << " end: " << end << endl;
  314. * for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
  315. * processChar(c);
  316. * }
  317. * }
  318. * \endcode
  319. * </pre>
  320. * Creating a StringCharacterIterator and calling the test functions
  321. * <pre>
  322. * \code
  323. * void CharacterIterator_Example( void )
  324. * {
  325. * cout << endl << "===== CharacterIterator_Example: =====" << endl;
  326. * UnicodeString text("Ein kleiner Satz.");
  327. * StringCharacterIterator iterator(text);
  328. * cout << "----- traverseForward: -----------" << endl;
  329. * traverseForward( iterator );
  330. * cout << endl << endl << "----- traverseBackward: ----------" << endl;
  331. * traverseBackward( iterator );
  332. * cout << endl << endl << "----- traverseOut: ---------------" << endl;
  333. * traverseOut( iterator, 7 );
  334. * cout << endl << endl << "-----" << endl;
  335. * }
  336. * \endcode
  337. * </pre>
  338. *
  339. * @stable ICU 2.0
  340. */
  341. class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {
  342. public:
  343. /**
  344. * Origin enumeration for the move() and move32() functions.
  345. * @stable ICU 2.0
  346. */
  347. enum EOrigin { kStart, kCurrent, kEnd };
  348. /**
  349. * Destructor.
  350. * @stable ICU 2.0
  351. */
  352. virtual ~CharacterIterator();
  353. /**
  354. * Returns a pointer to a new CharacterIterator of the same
  355. * concrete class as this one, and referring to the same
  356. * character in the same text-storage object as this one. The
  357. * caller is responsible for deleting the new clone.
  358. * @return a pointer to a new CharacterIterator
  359. * @stable ICU 2.0
  360. */
  361. virtual CharacterIterator* clone(void) const = 0;
  362. /**
  363. * Sets the iterator to refer to the first code unit in its
  364. * iteration range, and returns that code unit.
  365. * This can be used to begin an iteration with next().
  366. * @return the first code unit in its iteration range.
  367. * @stable ICU 2.0
  368. */
  369. virtual UChar first(void) = 0;
  370. /**
  371. * Sets the iterator to refer to the first code unit in its
  372. * iteration range, returns that code unit, and moves the position
  373. * to the second code unit. This is an alternative to setToStart()
  374. * for forward iteration with nextPostInc().
  375. * @return the first code unit in its iteration range.
  376. * @stable ICU 2.0
  377. */
  378. virtual UChar firstPostInc(void);
  379. /**
  380. * Sets the iterator to refer to the first code point in its
  381. * iteration range, and returns that code unit,
  382. * This can be used to begin an iteration with next32().
  383. * Note that an iteration with next32PostInc(), beginning with,
  384. * e.g., setToStart() or firstPostInc(), is more efficient.
  385. * @return the first code point in its iteration range.
  386. * @stable ICU 2.0
  387. */
  388. virtual UChar32 first32(void) = 0;
  389. /**
  390. * Sets the iterator to refer to the first code point in its
  391. * iteration range, returns that code point, and moves the position
  392. * to the second code point. This is an alternative to setToStart()
  393. * for forward iteration with next32PostInc().
  394. * @return the first code point in its iteration range.
  395. * @stable ICU 2.0
  396. */
  397. virtual UChar32 first32PostInc(void);
  398. /**
  399. * Sets the iterator to refer to the first code unit or code point in its
  400. * iteration range. This can be used to begin a forward
  401. * iteration with nextPostInc() or next32PostInc().
  402. * @return the start position of the iteration range
  403. * @stable ICU 2.0
  404. */
  405. inline int32_t setToStart();
  406. /**
  407. * Sets the iterator to refer to the last code unit in its
  408. * iteration range, and returns that code unit.
  409. * This can be used to begin an iteration with previous().
  410. * @return the last code unit.
  411. * @stable ICU 2.0
  412. */
  413. virtual UChar last(void) = 0;
  414. /**
  415. * Sets the iterator to refer to the last code point in its
  416. * iteration range, and returns that code unit.
  417. * This can be used to begin an iteration with previous32().
  418. * @return the last code point.
  419. * @stable ICU 2.0
  420. */
  421. virtual UChar32 last32(void) = 0;
  422. /**
  423. * Sets the iterator to the end of its iteration range, just behind
  424. * the last code unit or code point. This can be used to begin a backward
  425. * iteration with previous() or previous32().
  426. * @return the end position of the iteration range
  427. * @stable ICU 2.0
  428. */
  429. inline int32_t setToEnd();
  430. /**
  431. * Sets the iterator to refer to the "position"-th code unit
  432. * in the text-storage object the iterator refers to, and
  433. * returns that code unit.
  434. * @param position the "position"-th code unit in the text-storage object
  435. * @return the "position"-th code unit.
  436. * @stable ICU 2.0
  437. */
  438. virtual UChar setIndex(int32_t position) = 0;
  439. /**
  440. * Sets the iterator to refer to the beginning of the code point
  441. * that contains the "position"-th code unit
  442. * in the text-storage object the iterator refers to, and
  443. * returns that code point.
  444. * The current position is adjusted to the beginning of the code point
  445. * (its first code unit).
  446. * @param position the "position"-th code unit in the text-storage object
  447. * @return the "position"-th code point.
  448. * @stable ICU 2.0
  449. */
  450. virtual UChar32 setIndex32(int32_t position) = 0;
  451. /**
  452. * Returns the code unit the iterator currently refers to.
  453. * @return the current code unit.
  454. * @stable ICU 2.0
  455. */
  456. virtual UChar current(void) const = 0;
  457. /**
  458. * Returns the code point the iterator currently refers to.
  459. * @return the current code point.
  460. * @stable ICU 2.0
  461. */
  462. virtual UChar32 current32(void) const = 0;
  463. /**
  464. * Advances to the next code unit in the iteration range
  465. * (toward endIndex()), and returns that code unit. If there are
  466. * no more code units to return, returns DONE.
  467. * @return the next code unit.
  468. * @stable ICU 2.0
  469. */
  470. virtual UChar next(void) = 0;
  471. /**
  472. * Advances to the next code point in the iteration range
  473. * (toward endIndex()), and returns that code point. If there are
  474. * no more code points to return, returns DONE.
  475. * Note that iteration with "pre-increment" semantics is less
  476. * efficient than iteration with "post-increment" semantics
  477. * that is provided by next32PostInc().
  478. * @return the next code point.
  479. * @stable ICU 2.0
  480. */
  481. virtual UChar32 next32(void) = 0;
  482. /**
  483. * Advances to the previous code unit in the iteration range
  484. * (toward startIndex()), and returns that code unit. If there are
  485. * no more code units to return, returns DONE.
  486. * @return the previous code unit.
  487. * @stable ICU 2.0
  488. */
  489. virtual UChar previous(void) = 0;
  490. /**
  491. * Advances to the previous code point in the iteration range
  492. * (toward startIndex()), and returns that code point. If there are
  493. * no more code points to return, returns DONE.
  494. * @return the previous code point.
  495. * @stable ICU 2.0
  496. */
  497. virtual UChar32 previous32(void) = 0;
  498. /**
  499. * Returns FALSE if there are no more code units or code points
  500. * before the current position in the iteration range.
  501. * This is used with previous() or previous32() in backward
  502. * iteration.
  503. * @return FALSE if there are no more code units or code points
  504. * before the current position in the iteration range, return TRUE otherwise.
  505. * @stable ICU 2.0
  506. */
  507. virtual UBool hasPrevious() = 0;
  508. /**
  509. * Returns the numeric index in the underlying text-storage
  510. * object of the character returned by first(). Since it's
  511. * possible to create an iterator that iterates across only
  512. * part of a text-storage object, this number isn't
  513. * necessarily 0.
  514. * @returns the numeric index in the underlying text-storage
  515. * object of the character returned by first().
  516. * @stable ICU 2.0
  517. */
  518. inline int32_t startIndex(void) const;
  519. /**
  520. * Returns the numeric index in the underlying text-storage
  521. * object of the position immediately BEYOND the character
  522. * returned by last().
  523. * @return the numeric index in the underlying text-storage
  524. * object of the position immediately BEYOND the character
  525. * returned by last().
  526. * @stable ICU 2.0
  527. */
  528. inline int32_t endIndex(void) const;
  529. /**
  530. * Returns the numeric index in the underlying text-storage
  531. * object of the character the iterator currently refers to
  532. * (i.e., the character returned by current()).
  533. * @return the numberic index in the text-storage object of
  534. * the character the iterator currently refers to
  535. * @stable ICU 2.0
  536. */
  537. inline int32_t getIndex(void) const;
  538. /**
  539. * Returns the length of the entire text in the underlying
  540. * text-storage object.
  541. * @return the length of the entire text in the text-storage object
  542. * @stable ICU 2.0
  543. */
  544. inline int32_t getLength() const;
  545. /**
  546. * Moves the current position relative to the start or end of the
  547. * iteration range, or relative to the current position itself.
  548. * The movement is expressed in numbers of code units forward
  549. * or backward by specifying a positive or negative delta.
  550. * @param delta the position relative to origin. A positive delta means forward;
  551. * a negative delta means backward.
  552. * @param origin Origin enumeration {kStart, kCurrent, kEnd}
  553. * @return the new position
  554. * @stable ICU 2.0
  555. */
  556. virtual int32_t move(int32_t delta, EOrigin origin) = 0;
  557. /**
  558. * Moves the current position relative to the start or end of the
  559. * iteration range, or relative to the current position itself.
  560. * The movement is expressed in numbers of code points forward
  561. * or backward by specifying a positive or negative delta.
  562. * @param delta the position relative to origin. A positive delta means forward;
  563. * a negative delta means backward.
  564. * @param origin Origin enumeration {kStart, kCurrent, kEnd}
  565. * @return the new position
  566. * @stable ICU 2.0
  567. */
  568. virtual int32_t move32(int32_t delta, EOrigin origin) = 0;
  569. /**
  570. * Copies the text under iteration into the UnicodeString
  571. * referred to by "result".
  572. * @param result Receives a copy of the text under iteration.
  573. * @stable ICU 2.0
  574. */
  575. virtual void getText(UnicodeString& result) = 0;
  576. protected:
  577. /**
  578. * Empty constructor.
  579. * @stable ICU 2.0
  580. */
  581. CharacterIterator();
  582. /**
  583. * Constructor, just setting the length field in this base class.
  584. * @stable ICU 2.0
  585. */
  586. CharacterIterator(int32_t length);
  587. /**
  588. * Constructor, just setting the length and position fields in this base class.
  589. * @stable ICU 2.0
  590. */
  591. CharacterIterator(int32_t length, int32_t position);
  592. /**
  593. * Constructor, just setting the length, start, end, and position fields in this base class.
  594. * @stable ICU 2.0
  595. */
  596. CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);
  597. /**
  598. * Copy constructor.
  599. *
  600. * @param that The CharacterIterator to be copied
  601. * @stable ICU 2.0
  602. */
  603. CharacterIterator(const CharacterIterator &that);
  604. /**
  605. * Assignment operator. Sets this CharacterIterator to have the same behavior,
  606. * as the one passed in.
  607. * @param that The CharacterIterator passed in.
  608. * @return the newly set CharacterIterator.
  609. * @stable ICU 2.0
  610. */
  611. CharacterIterator &operator=(const CharacterIterator &that);
  612. /**
  613. * Base class text length field.
  614. * Necessary this for correct getText() and hashCode().
  615. * @stable ICU 2.0
  616. */
  617. int32_t textLength;
  618. /**
  619. * Base class field for the current position.
  620. * @stable ICU 2.0
  621. */
  622. int32_t pos;
  623. /**
  624. * Base class field for the start of the iteration range.
  625. * @stable ICU 2.0
  626. */
  627. int32_t begin;
  628. /**
  629. * Base class field for the end of the iteration range.
  630. * @stable ICU 2.0
  631. */
  632. int32_t end;
  633. };
  634. inline UBool
  635. ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {
  636. return !operator==(that);
  637. }
  638. inline int32_t
  639. CharacterIterator::setToStart() {
  640. return move(0, kStart);
  641. }
  642. inline int32_t
  643. CharacterIterator::setToEnd() {
  644. return move(0, kEnd);
  645. }
  646. inline int32_t
  647. CharacterIterator::startIndex(void) const {
  648. return begin;
  649. }
  650. inline int32_t
  651. CharacterIterator::endIndex(void) const {
  652. return end;
  653. }
  654. inline int32_t
  655. CharacterIterator::getIndex(void) const {
  656. return pos;
  657. }
  658. inline int32_t
  659. CharacterIterator::getLength(void) const {
  660. return textLength;
  661. }
  662. U_NAMESPACE_END
  663. #endif