text boundaries



Hi,

This patch does grapheme, word, and sentence boundaries. Line breaks
are still scaring me, so for now they are still implemented in the
original lame way. Comments welcome.

Havoc

Index: ChangeLog
===================================================================
RCS file: /cvs/gnome/pango/ChangeLog,v
retrieving revision 1.170
diff -u -u -r1.170 ChangeLog
--- ChangeLog	2000/11/17 21:38:01	1.170
+++ ChangeLog	2000/11/19 17:46:14
@@ -1,3 +1,46 @@
+2000-11-18  Havoc Pennington  <hp pobox com>
+
+	* pango/pango-context.c (pango_itemize): use pango_item_new(),
+	assert that items added to the list are sane.
+
+	* pango/break.c (pango_break): Try for a real implementation of
+	the Unicode text boundary algorithms
+	
+	* pango/pango-layout.c (pango_layout_check_lines): Reimplement 
+	to honor the paragraph boundaries from pango_break()
+
+	* pango/pango-layout.c (process_item): use pango_item_split() here
+
+	* pango/pango-item.c (pango_item_split): New function to split an
+	item into two items
+
+	* modules/arabic/basic.c (basic_engine_break): Remove; default 
+	break implementation should work
+	
+	* modules/arabic/arabic.c (arabic_engine_break): Remove; default 
+	break implementation should work
+
+	* modules/hangul/hangul.c (hangul_engine_lang_new): Remove
+	unimplemented break routine.
+	
+	* modules/tamil/tamil.c (tamil_engine_break): Remove, use default
+	break
+
+	* modules/indic/gujarati.c (pango_indic_engine_break): Remove; use
+	default break
+
+	* modules/indic/devanagari.c (pango_indic_engine_break): Remove;
+	use default break
+
+	* modules/indic/bengali.c (pango_indic_engine_break): Remove; use
+	default break
+	
+	* modules/indic/gurmukhi.c (pango_indic_engine_break): Remove; use
+	default break
+
+	* modules/indic/myanmar.c (pango_engine_break): Remove; use
+	default break
+
 Fri Nov 17 16:12:34 2000  Owen Taylor  <otaylor redhat com>
 
 	* Released 0.13
Index: pango/break.c
===================================================================
RCS file: /cvs/gnome/pango/pango/break.c,v
retrieving revision 1.7
diff -u -u -r1.7 break.c
--- pango/break.c	2000/07/19 16:36:30	1.7
+++ pango/break.c	2000/11/19 17:46:14
@@ -22,6 +22,780 @@
 #include "pango.h"
 #include "pango-modules.h"
 
+#define LEADING_JAMO(wc) ((wc) >= 0x1100 && (wc) <= 0x115F)
+#define VOWEL_JAMO(wc)   ((wc) >= 0x1160 && (wc) <= 0x11A2)
+#define TRAILING_JAMO(wc)   ((wc) >= 0x11A8 && (wc) <= 0x11F9)
+#define JAMO(wc)         ((wc) >= 0x1100 && (wc) <= 0x11F9) /* any of the above */
+/* FIXME add Thai, Tibetan, Myanmar, Khmer: VIRAMA() lists code points above this range */
+#define VIRAMA_SCRIPT(wc)        ((wc) >= 0x0901 && (wc) <= 0x0DF4)
+#define VIRAMA(wc) ((wc) == 0x094D || \
+                    (wc) == 0x09CD || \
+                    (wc) == 0x0A4D || \
+                    (wc) == 0x0ACD || \
+                    (wc) == 0x0B4D || \
+                    (wc) == 0x0BCD || \
+                    (wc) == 0x0C4D || \
+                    (wc) == 0x0CCD || \
+                    (wc) == 0x0D4D || \
+                    (wc) == 0x0DCA || \
+                    (wc) == 0x0E3A || \
+                    (wc) == 0x0F84 || \
+                    (wc) == 0x1039 || \
+                    (wc) == 0x17D2)
+/* Types of Japanese characters; all stubbed to FALSE, so the Japanese word rules never fire */
+#define JAPANESE(wc) (FALSE)
+#define HIRAGANA(wc) (FALSE)
+#define KATAKANA(wc) (FALSE)
+#define HAN(wc)      (FALSE)
+
+/* default_break() sentence states; see Unicode spec p. 132-133, table 5-6 */
+typedef enum
+{
+  STATE_SENTENCE_OUTSIDE,          /* not inside any sentence */
+  STATE_SENTENCE_BODY,             /* inside a sentence */
+  STATE_SENTENCE_TERM,             /* previous char was '?' or '!' */
+  STATE_SENTENCE_POST_TERM_CLOSE,  /* close punctuation after '?' or '!' */
+  STATE_SENTENCE_POST_TERM_SPACE,  /* space separator after '?' or '!' */
+  STATE_SENTENCE_POST_TERM_SEP,    /* line/para separator after '?' or '!' */
+  STATE_SENTENCE_DOT,              /* previous char was '.' */
+  STATE_SENTENCE_POST_DOT_CLOSE,   /* close punctuation after '.' */
+  STATE_SENTENCE_POST_DOT_SPACE,   /* space separator after '.' */
+  STATE_SENTENCE_POST_DOT_OPEN,    /* open punctuation after '.' and spaces */
+  /* never include line/para separators in a sentence for now */
+  /* This isn't in the spec, but I can't figure out why they'd include
+   * one line/para separator in lines ending with Term but not with
+   * period-terminated lines, so I'm doing it for the dot lines also
+   */
+  STATE_SENTENCE_POST_DOT_SEP      /* NOTE: no transition in default_break() enters this state */
+} SentenceState;
+
+/* We call "123" and "foobar" words, but "123foo" is two words;
+ * the Unicode spec just calls "123" a non-word
+ */
+typedef enum
+{
+  WordNone,     /* not currently inside a word */
+  WordLetters,  /* inside a word made of letters */
+  WordNumbers   /* inside a word made of numbers */
+} WordType;
+
+static void
+default_break (const gchar   *text,
+               gint           length,
+               PangoAnalysis *analysis,
+               PangoLogAttr  *attrs)
+{
+  /* Default implementation of the text boundary rules from section
+   * 5.15 of the Unicode 3.0 book; pango_break() uses it whenever the
+   * language engine supplies no script_break of its own.
+   *
+   * Fills attrs[i] for the i-th character of text. At most length
+   * bytes are scanned; a NUL byte stops the scan, and an invalid
+   * character warns and returns early. analysis is not used here.
+   *
+   * FIXME cheesy optimization: memset attrs to 0, never assign FALSE
+   */
+  const gchar *next = text;
+  const gchar *end = text + length;
+  gint i = 0;
+  gunichar prev_wc;
+  gunichar next_wc;
+  GUnicodeType prev_type;
+  WordType current_word_type = WordNone;
+  gunichar last_word_letter = 0;
+  SentenceState sentence_state = STATE_SENTENCE_OUTSIDE;
+  /* Tracks what will be the end of the sentence if a period is
+   * determined to actually be a sentence-ending period.
+   */
+  gint possible_sentence_end = -1;
+  /* possible sentence break before Open* after a period-ended sentence */
+  gint possible_sentence_boundary = -1;
+
+  if (next == end)
+    return;
+
+  prev_type = (GUnicodeType) -1;
+  prev_wc = 0;
+
+  next_wc = g_utf8_get_char (next);
+
+  if (next_wc == (gunichar)-1)
+    {
+      g_warning ("Invalid Unicode character passed to pango_break()!");
+      return;
+    }
+  /* test the bound first so the byte at text + length is never read */
+  while (next < end && *next)
+    {
+      GUnicodeType type;
+      gunichar wc;
+
+      wc = next_wc;
+
+      next = g_utf8_next_char (next);
+
+      if (next >= end)
+        next_wc = 0;
+      else
+        next_wc = g_utf8_get_char (next);
+
+      if (next_wc == (gunichar)-1)
+        {
+          g_warning ("Invalid Unicode character passed to pango_break()!");
+          return;
+        }
+
+      type = g_unichar_type (wc);
+
+      /* Can't just use the type here since isspace() doesn't
+       * correspond to a Unicode character type
+       */
+      attrs[i].is_white = g_unichar_isspace (wc);
+
+      /* ---- Paragraph breaking ---- */
+
+      /* unlike line breaks, we don't put a break on the first char,
+       * unless the first char is a paragraph delimiter of course.
+       * is_paragraph_boundary has the same semantics as is_sentence_boundary
+       * in that it divides up the text into sections rather than
+       * finding regions of text. The first char in some text
+       * can never be a paragraph boundary.
+       */
+
+      attrs[i].is_paragraph_boundary = FALSE;
+
+      if (prev_type == G_UNICODE_PARAGRAPH_SEPARATOR ||
+          prev_wc == '\n')
+        attrs[i].is_paragraph_boundary = TRUE;
+      else if (prev_wc == '\r')
+        {
+          /* don't break between \r and \n */
+          if (wc != '\n')
+            attrs[i].is_paragraph_boundary = TRUE;
+        }
+
+      attrs[i].is_paragraph_delimiter = FALSE;
+
+      if (type == G_UNICODE_PARAGRAPH_SEPARATOR ||
+          wc == '\n' ||
+          wc == '\r')
+        attrs[i].is_paragraph_delimiter = TRUE;
+
+      /* ---- Line breaking (FIXME) ---- */
+
+      attrs[i].is_break = i == 0 || attrs[i-1].is_white || attrs[i].is_white;
+
+      /* ---- Cursor position breaks (Grapheme breaks) ---- */
+
+      if (wc == '\n')
+        {
+          /* Break before line feed unless prev char is a CR */
+
+          if (prev_wc != '\r')
+            attrs[i].is_cursor_position = TRUE;
+          else
+            attrs[i].is_cursor_position = FALSE;
+        }
+      else if (i == 0 ||
+               prev_type == G_UNICODE_CONTROL ||
+               prev_type == G_UNICODE_FORMAT)
+        {
+          /* Break at first position (must be special cased, or if the
+           * first char is say a combining mark there won't be a
+           * cursor position at the start, which seems wrong to me
+           * ???? - maybe it makes sense though, who knows)
+           */
+          /* break after all format or control characters */
+          attrs[i].is_cursor_position = TRUE;
+        }
+      else
+        {
+          switch (type)
+            {
+            case G_UNICODE_CONTROL:
+            case G_UNICODE_FORMAT:
+              /* Break before all format or control characters */
+              attrs[i].is_cursor_position = TRUE;
+              break;
+
+            case G_UNICODE_COMBINING_MARK:
+            case G_UNICODE_ENCLOSING_MARK:
+            case G_UNICODE_NON_SPACING_MARK:
+              /* Unicode spec includes "Combining marks plus Tibetan
+               * subjoined characters" as joining chars, but lists the
+               * Tibetan subjoined characters as combining marks, and
+               * g_unichar_type() returns NON_SPACING_MARK for the Tibetan
+               * subjoined characters. So who knows, beats me.
+               */
+
+              /* It's a joining character, break only if preceded by
+               * control or format; we already handled the case where
+               * it was preceded earlier, so here we know it wasn't,
+               * don't break
+               */
+              attrs[i].is_cursor_position = FALSE;
+              break;
+
+            case G_UNICODE_LOWERCASE_LETTER:
+            case G_UNICODE_MODIFIER_LETTER:
+            case G_UNICODE_OTHER_LETTER:
+            case G_UNICODE_TITLECASE_LETTER:
+            case G_UNICODE_UPPERCASE_LETTER:
+              if (JAMO (wc))
+                {
+                  /* Break before Jamo if they are in a broken sequence or
+                   * next to non-Jamo, otherwise don't
+                   */
+                  if (LEADING_JAMO (wc) &&
+                      !LEADING_JAMO (prev_wc))
+                    attrs[i].is_cursor_position = TRUE;
+                  else if (VOWEL_JAMO (wc) &&
+                           !LEADING_JAMO (prev_wc) &&
+                           !VOWEL_JAMO (prev_wc))
+                    attrs[i].is_cursor_position = TRUE;
+                  else if (TRAILING_JAMO (wc) &&
+                           !LEADING_JAMO (prev_wc) &&
+                           !VOWEL_JAMO (prev_wc) &&
+                           !TRAILING_JAMO (prev_wc))
+                    attrs[i].is_cursor_position = TRUE;
+                  else
+                    attrs[i].is_cursor_position = FALSE;
+                }
+              else
+                {
+                  /* Handle non-Jamo non-combining chars */
+
+                  /* Break if preceded by Jamo; don't break if a
+                   * letter is preceded by a virama; break in all
+                   * other cases. No need to check whether we're
+                   * preceded by Jamo explicitly, since a Jamo is not
+                   * a virama, we just break in all cases where we
+                   * aren't preceded by a virama. Don't fool with viramas
+                   * if we aren't part of a script that uses them.
+                   */
+
+                  if (VIRAMA_SCRIPT (wc))
+                    {
+                      /* Check whether we're preceded by a virama; this
+                       * could use some optimization.
+                       */
+                      if (VIRAMA (prev_wc))
+                        attrs[i].is_cursor_position = FALSE;
+                      else
+                        attrs[i].is_cursor_position = TRUE;
+                    }
+                  else
+                    {
+                      attrs[i].is_cursor_position = TRUE;
+                    }
+                }
+              break;
+
+            default:
+              /* Some weirdo char, just break here, why not */
+              attrs[i].is_cursor_position = TRUE;
+              break;
+            }
+        }
+
+      /* ---- Word breaks ---- */
+
+      /* default to not a word start/end */
+      attrs[i].is_word_start = FALSE;
+      attrs[i].is_word_end = FALSE;
+
+      if (current_word_type != WordNone)
+        {
+          /* Check for a word end */
+          switch (type)
+            {
+            case G_UNICODE_COMBINING_MARK:
+            case G_UNICODE_ENCLOSING_MARK:
+            case G_UNICODE_NON_SPACING_MARK:
+              /* nothing, we just eat these up as part of the word */
+              break;
+
+            case G_UNICODE_LOWERCASE_LETTER:
+            case G_UNICODE_MODIFIER_LETTER:
+            case G_UNICODE_OTHER_LETTER:
+            case G_UNICODE_TITLECASE_LETTER:
+            case G_UNICODE_UPPERCASE_LETTER:
+              if (current_word_type == WordLetters)
+                {
+                  /* Japanese special cases for ending the word */
+                  if (JAPANESE (last_word_letter) ||
+                      JAPANESE (wc))
+                    {
+                      if ((HIRAGANA (last_word_letter) &&
+                           !HIRAGANA (wc)) ||
+                          (KATAKANA (last_word_letter) &&
+                           !(KATAKANA (wc) || HIRAGANA (wc))) ||
+                          (HAN (last_word_letter) &&
+                           !(HIRAGANA (wc) || HAN (wc))) ||
+                          (JAPANESE (last_word_letter) &&
+                           !JAPANESE (wc)) ||
+                          (!JAPANESE (last_word_letter) &&
+                           JAPANESE (wc)))
+                        attrs[i].is_word_end = TRUE;
+                    }
+                }
+              else
+                {
+                  /* end the number word, start the letter word */
+                  attrs[i].is_word_end = TRUE;
+                  attrs[i].is_word_start = TRUE;
+                  current_word_type = WordLetters;
+                }
+
+              last_word_letter = wc;
+              break;
+
+            case G_UNICODE_DECIMAL_NUMBER:
+            case G_UNICODE_LETTER_NUMBER:
+            case G_UNICODE_OTHER_NUMBER:
+              if (current_word_type != WordNumbers)
+                {
+                  attrs[i].is_word_end = TRUE;
+                  attrs[i].is_word_start = TRUE;
+                  current_word_type = WordNumbers;
+                }
+
+              last_word_letter = wc;
+              break;
+
+            default:
+              /* Punctuation, control/format chars, etc. all end a word. */
+              attrs[i].is_word_end = TRUE;
+              break;
+            }
+          /* a char that also starts a new word leaves us inside that word */
+          if (attrs[i].is_word_end && !attrs[i].is_word_start)
+            current_word_type = WordNone;
+        }
+      else
+        {
+          /* Check for a word start */
+          switch (type)
+            {
+            case G_UNICODE_LOWERCASE_LETTER:
+            case G_UNICODE_MODIFIER_LETTER:
+            case G_UNICODE_OTHER_LETTER:
+            case G_UNICODE_TITLECASE_LETTER:
+            case G_UNICODE_UPPERCASE_LETTER:
+              current_word_type = WordLetters;
+              last_word_letter = wc;
+              attrs[i].is_word_start = TRUE;
+              break;
+
+            case G_UNICODE_DECIMAL_NUMBER:
+            case G_UNICODE_LETTER_NUMBER:
+            case G_UNICODE_OTHER_NUMBER:
+              current_word_type = WordNumbers;
+              last_word_letter = wc;
+              attrs[i].is_word_start = TRUE;
+              break;
+
+            default:
+              /* No word here */
+              break;
+            }
+        }
+
+      /* ---- Sentence breaks ---- */
+
+      /* The Unicode spec specifies sentence breakpoints, so that a piece of
+       * text would be partitioned into sentences, and all characters would
+       * be inside some sentence. This code implements that for is_sentence_boundary,
+       * but tries to keep leading/trailing whitespace out of sentences for
+       * the start/end flags
+       */
+
+      /* The Unicode spec seems to say that one trailing line/para
+       * separator can be tacked on to a sentence ending in ! or ?,
+       * but not a sentence ending in period; I think they're on crack
+       * so am allowing one to be tacked onto a sentence ending in period.
+       */
+
+      /* No sentence break at the start of the text */
+
+      /* default to not a sentence breakpoint */
+      attrs[i].is_sentence_boundary = FALSE;
+      attrs[i].is_sentence_start = FALSE;
+      attrs[i].is_sentence_end = FALSE;
+
+      /* FIXME the Unicode spec lumps control/format chars with
+       * line/para separators in descriptive text, but not in the
+       * character class specs, in table 5-6, so who knows whether you
+       * are actually supposed to break on control/format
+       * characters. Seems semi-broken to break on tabs...
+       */
+
+      /* Break after line/para separators except carriage return
+       * followed by newline
+       */
+      switch (prev_type)
+        {
+        case G_UNICODE_LINE_SEPARATOR:
+        case G_UNICODE_PARAGRAPH_SEPARATOR:
+        case G_UNICODE_CONTROL:
+        case G_UNICODE_FORMAT:
+          if (prev_wc == '\r')
+            {
+              if (wc != '\n')
+                attrs[i].is_sentence_boundary = TRUE;
+            }
+          else
+            attrs[i].is_sentence_boundary = TRUE;
+          break;
+
+        default:
+          break;
+        }
+
+      /* break before para/line separators except newline following
+       * carriage return
+       */
+      switch (type)
+        {
+        case G_UNICODE_LINE_SEPARATOR:
+        case G_UNICODE_PARAGRAPH_SEPARATOR:
+        case G_UNICODE_CONTROL:
+        case G_UNICODE_FORMAT:
+          if (wc == '\n')
+            {
+              if (prev_wc != '\r')
+                attrs[i].is_sentence_boundary = TRUE;
+            }
+          else
+            attrs[i].is_sentence_boundary = TRUE;
+          break;
+
+        default:
+          break;
+        }
+
+      switch (sentence_state)
+        {
+        case STATE_SENTENCE_OUTSIDE:
+          /* Start sentence if we have non-whitespace/format/control */
+          switch (type)
+            {
+            case G_UNICODE_LINE_SEPARATOR:
+            case G_UNICODE_PARAGRAPH_SEPARATOR:
+            case G_UNICODE_CONTROL:
+            case G_UNICODE_FORMAT:
+            case G_UNICODE_SPACE_SEPARATOR:
+              break;
+
+            default:
+              attrs[i].is_sentence_start = TRUE;
+              sentence_state = STATE_SENTENCE_BODY;
+              break;
+            }
+          break;
+
+        case STATE_SENTENCE_BODY:
+          /* If we already broke here due to separators, end the sentence. */
+          if (attrs[i].is_sentence_boundary)
+            {
+              attrs[i].is_sentence_end = TRUE;
+              sentence_state = STATE_SENTENCE_OUTSIDE;
+            }
+          else
+            {
+              if (wc == '.')
+                sentence_state = STATE_SENTENCE_DOT;
+              else if (wc == '?' || wc == '!')
+                sentence_state = STATE_SENTENCE_TERM;
+            }
+          break;
+
+        case STATE_SENTENCE_TERM:
+          /* End sentence on anything but close punctuation;
+           * follow Unicode rules for breaks
+           */
+          switch (type)
+            {
+            case G_UNICODE_CLOSE_PUNCTUATION:
+              sentence_state = STATE_SENTENCE_POST_TERM_CLOSE;
+              break;
+
+            case G_UNICODE_SPACE_SEPARATOR:
+              attrs[i].is_sentence_end = TRUE;
+              sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
+              break;
+
+            case G_UNICODE_LINE_SEPARATOR:
+            case G_UNICODE_PARAGRAPH_SEPARATOR:
+              attrs[i].is_sentence_end = TRUE;
+              sentence_state = STATE_SENTENCE_POST_TERM_SEP;
+              break;
+
+            default:
+              attrs[i].is_sentence_end = TRUE;
+              attrs[i].is_sentence_boundary = TRUE;
+              sentence_state = STATE_SENTENCE_OUTSIDE;
+              break;
+            }
+          break;
+
+        case STATE_SENTENCE_POST_TERM_CLOSE:
+          /* End sentence on anything besides more close punctuation; follow
+           * rules for breaks
+           */
+          switch (type)
+            {
+            case G_UNICODE_CLOSE_PUNCTUATION:
+              /* continue in this state */
+              break;
+
+            case G_UNICODE_SPACE_SEPARATOR:
+              attrs[i].is_sentence_end = TRUE;
+              sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
+              break;
+
+            case G_UNICODE_LINE_SEPARATOR:
+            case G_UNICODE_PARAGRAPH_SEPARATOR:
+              attrs[i].is_sentence_end = TRUE;
+              /* undo the unconditional break-at-all-line/para-separators
+               * from above; I'm not sure this is what the Unicode spec
+               * intends, but it seems right - we get to include
+               * a single line/para separator in the sentence according
+               * to their rules
+               */
+              attrs[i].is_sentence_boundary = FALSE;
+              sentence_state = STATE_SENTENCE_POST_TERM_SEP;
+              break;
+
+            default:
+              attrs[i].is_sentence_end = TRUE;
+              attrs[i].is_sentence_boundary = TRUE;
+              sentence_state = STATE_SENTENCE_OUTSIDE;
+              break;
+            }
+          break;
+
+        case STATE_SENTENCE_POST_TERM_SPACE:
+
+          /* Sentence is definitely already ended; to enter this state
+           * we had to see a space, which ends the sentence.
+           */
+
+          switch (type)
+            {
+            case G_UNICODE_SPACE_SEPARATOR:
+              /* continue in this state */
+              break;
+
+            case G_UNICODE_LINE_SEPARATOR:
+            case G_UNICODE_PARAGRAPH_SEPARATOR:
+              /* undo the unconditional break-at-all-line/para-separators
+               * from above; I'm not sure this is what the Unicode spec
+               * intends, but it seems right
+               */
+              attrs[i].is_sentence_boundary = FALSE;
+              sentence_state = STATE_SENTENCE_POST_TERM_SEP;
+              break;
+
+            default:
+              attrs[i].is_sentence_boundary = TRUE;
+              sentence_state = STATE_SENTENCE_OUTSIDE;
+              break;
+            }
+          break;
+
+        case STATE_SENTENCE_POST_TERM_SEP:
+          /* Break is forced at this point, unless we're a newline
+           * after a CR, then we will break after the newline on the
+           * next iteration. Only a single Sep can be in the
+           * sentence.
+           */
+          if (!(prev_wc == '\r' && wc == '\n'))
+            attrs[i].is_sentence_boundary = TRUE;
+          sentence_state = STATE_SENTENCE_OUTSIDE;
+          break;
+
+        case STATE_SENTENCE_DOT:
+          switch (type)
+            {
+            case G_UNICODE_CLOSE_PUNCTUATION:
+              sentence_state = STATE_SENTENCE_POST_DOT_CLOSE;
+              break;
+
+            case G_UNICODE_SPACE_SEPARATOR:
+              possible_sentence_end = i;
+              sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
+              break;
+
+            default:
+              /* If we broke on a control/format char, end the
+               * sentence; else this was not a sentence end, since
+               * we didn't enter the POST_DOT_SPACE state.
+               */
+              if (attrs[i].is_sentence_boundary)
+                {
+                  attrs[i].is_sentence_end = TRUE;
+
+                  sentence_state = STATE_SENTENCE_OUTSIDE;
+                }
+              else
+                sentence_state = STATE_SENTENCE_BODY;
+              break;
+            }
+          break;
+
+        case STATE_SENTENCE_POST_DOT_CLOSE:
+          switch (type)
+            {
+            case G_UNICODE_SPACE_SEPARATOR:
+              possible_sentence_end = i;
+              sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
+              break;
+
+            default:
+              /* If we broke on a control/format char, end the
+               * sentence; else this was not a sentence end, since
+               * we didn't enter the POST_DOT_SPACE state.
+               */
+              if (attrs[i].is_sentence_boundary)
+                {
+                  attrs[i].is_sentence_end = TRUE;
+
+                  sentence_state = STATE_SENTENCE_OUTSIDE;
+                }
+              else
+                sentence_state = STATE_SENTENCE_BODY;
+              break;
+            }
+          break;
+
+        case STATE_SENTENCE_POST_DOT_SPACE:
+
+          possible_sentence_boundary = i;
+
+          switch (type)
+            {
+            case G_UNICODE_SPACE_SEPARATOR:
+              /* remain in current state */
+              break;
+
+            case G_UNICODE_OPEN_PUNCTUATION:
+              sentence_state = STATE_SENTENCE_POST_DOT_OPEN;
+              break;
+
+            case G_UNICODE_LOWERCASE_LETTER:
+              /* wasn't a sentence-ending period; so re-enter the sentence
+               * body
+               */
+              sentence_state = STATE_SENTENCE_BODY;
+              break;
+
+            default:
+              /* End the sentence, break, maybe start a new one */
+
+              g_assert (possible_sentence_end >= 0);
+              g_assert (possible_sentence_boundary >= 0);
+
+              attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
+              attrs[possible_sentence_end].is_sentence_end = TRUE;
+
+              possible_sentence_end = -1;
+              possible_sentence_boundary = -1;
+
+              switch (type)
+                {
+                case G_UNICODE_LINE_SEPARATOR:
+                case G_UNICODE_PARAGRAPH_SEPARATOR:
+                case G_UNICODE_CONTROL:
+                case G_UNICODE_FORMAT:
+                  sentence_state = STATE_SENTENCE_OUTSIDE;
+                  break;
+
+                default:
+                  g_assert (type != G_UNICODE_SPACE_SEPARATOR);
+                  sentence_state = STATE_SENTENCE_BODY;
+                  attrs[i].is_sentence_start = TRUE;
+                  break;
+                }
+              break;
+            }
+          break;
+
+        case STATE_SENTENCE_POST_DOT_OPEN:
+          switch (type)
+            {
+            case G_UNICODE_OPEN_PUNCTUATION:
+              /* continue in current state */
+              break;
+
+            case G_UNICODE_LOWERCASE_LETTER:
+              /* wasn't a sentence-ending period; so re-enter the sentence
+               * body
+               */
+              sentence_state = STATE_SENTENCE_BODY;
+              break;
+
+            default:
+              /* End the sentence, break, maybe start a new one */
+
+              g_assert (possible_sentence_end >= 0);
+              g_assert (possible_sentence_boundary >= 0);
+
+              attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
+              attrs[possible_sentence_end].is_sentence_end = TRUE;
+
+              possible_sentence_end = -1;
+              possible_sentence_boundary = -1;
+
+              switch (type)
+                {
+                case G_UNICODE_LINE_SEPARATOR:
+                case G_UNICODE_PARAGRAPH_SEPARATOR:
+                case G_UNICODE_CONTROL:
+                case G_UNICODE_FORMAT:
+                  sentence_state = STATE_SENTENCE_OUTSIDE;
+                  break;
+
+                default:
+                  g_assert (type != G_UNICODE_SPACE_SEPARATOR);
+                  sentence_state = STATE_SENTENCE_BODY;
+                  attrs[i].is_sentence_start = TRUE;
+                  break;
+                }
+              break;
+            }
+          break;
+
+        case STATE_SENTENCE_POST_DOT_SEP:
+          /* Break is forced at this point, unless we're a newline
+           * after a CR, then we will break after the newline on the
+           * next iteration. Only a single Sep can be in the
+           * sentence.
+           */
+          if (!(prev_wc == '\r' && wc == '\n'))
+            attrs[i].is_sentence_boundary = TRUE;
+          sentence_state = STATE_SENTENCE_OUTSIDE;
+
+          g_assert (possible_sentence_end >= 0);
+          g_assert (possible_sentence_boundary >= 0);
+
+          attrs[possible_sentence_end].is_sentence_end = TRUE;
+
+          possible_sentence_end = -1;
+          possible_sentence_boundary = -1;
+          break;
+
+        default:
+          g_assert_not_reached ();
+          break;
+        }
+
+      prev_type = type;
+      prev_wc = wc;
+      ++i;
+    }
+}
+
 /**
  * pango_break:
  * @text:      the text to process
@@ -32,31 +806,17 @@
  * Determines possible line, word, and character breaks
  * for a string of Unicode text.
  */
-void pango_break (const gchar   *text, 
-		  gint           length, 
-		  PangoAnalysis *analysis, 
-		  PangoLogAttr  *attrs)
+void
+pango_break (const gchar   *text,
+             gint           length,
+             PangoAnalysis *analysis,
+             PangoLogAttr  *attrs)
 {
-  /* Pseudo-implementation */
-
-  const gchar *cur = text;
-  gint i = 0;
-  gunichar wc;
-  
-  while (*cur && cur - text < length)
-    {
-      wc = g_utf8_get_char (cur);
-      if (wc == (gunichar)-1)
-	break;			/* FIXME: ERROR */
-
-      attrs[i].is_white = (wc == ' ' || wc == '\t' || wc == '\n' || wc == 0x200b) ? 1 : 0;
-      attrs[i].is_break = i == 0 || attrs[i-1].is_white || attrs[i].is_white;
-      attrs[i].is_char_stop = 1;
-      attrs[i].is_word_stop = ((i == 0) || attrs[i-1].is_white) && !attrs[i].is_white;
-      
-      i++;
-      cur = g_utf8_next_char (cur);
-    }
+  if (analysis->lang_engine &&
+      analysis->lang_engine->script_break)
+    (* analysis->lang_engine->script_break) (text, length, analysis, attrs);
+  else
+    default_break (text, length, analysis, attrs);
 }
 
 /**
@@ -85,17 +845,17 @@
   const char *range_start;
   int chars_in_range;
   static guint engine_type_id = 0;
-  static guint render_type_id = 0;  
+  static guint render_type_id = 0;
   PangoAnalysis analysis = { NULL, NULL, NULL, 0 };
 
   analysis.level = level;
-  
+
   g_return_if_fail (length == 0 || text != NULL);
   g_return_if_fail (log_attrs != NULL);
-  
+
   if (length == 0)
     return;
-  
+
   if (engine_type_id == 0)
     {
       engine_type_id = g_quark_from_static_string (PANGO_ENGINE_TYPE_LANG);
@@ -105,23 +865,23 @@
   n_chars = g_utf8_strlen (text, length);
 
   lang_map = pango_find_map (language, engine_type_id, render_type_id);
-    
+
   range_start = text;
   range_engine = (PangoEngineLang*) pango_map_get_engine (lang_map,
                                                           g_utf8_get_char (text));
   analysis.lang_engine = range_engine;
   chars_broken = 0;
   chars_in_range = 1;
-  
+
   end = text + length;
   pos = g_utf8_next_char (text);
-  
+
   while (pos != end)
     {
       analysis.lang_engine =
         (PangoEngineLang*) pango_map_get_engine (lang_map,
                                                  g_utf8_get_char (pos));
-      
+
       if (range_engine != analysis.lang_engine)
         {
           /* Engine has changed; do the breaking for the current range,
@@ -133,7 +893,7 @@
                        log_attrs + chars_broken);
 
           chars_broken += chars_in_range;
-          
+
           range_start = pos;
           range_engine = analysis.lang_engine;
           chars_in_range = 1;
@@ -142,15 +902,15 @@
         {
           chars_in_range += 1;
         }
-      
+
       pos = g_utf8_next_char (pos);
     }
-    
+
     g_assert (chars_in_range > 0);
     g_assert (range_start != end);
     g_assert (pos == end);
     g_assert (range_engine == analysis.lang_engine);
-    
+
     pango_break (range_start,
                  end - range_start,
                  &analysis,
Index: pango/pango-context.c
===================================================================
RCS file: /cvs/gnome/pango/pango/pango-context.c,v
retrieving revision 1.30
diff -u -u -r1.30 pango-context.c
--- pango/pango-context.c	2000/11/13 12:28:27	1.30
+++ pango/pango-context.c	2000/11/19 17:46:14
@@ -510,7 +510,9 @@
  * @cached_iter:      Cached attribute iterator, or NULL
  *
  * Breaks a piece of text into segments with consistent
- * directional level and shaping engine.
+ * directional level and shaping engine. Each byte of @text will
+ * be contained in exactly one of the items in the returned list;
+ * the generated list of items will be in order.
  *
  * @cached_iter should be an iterator over @attrs currently positioned at a
  * range before or containing @start_index; @cached_iter will be advanced to
@@ -603,7 +605,11 @@
 	  fonts[i] != fonts[i-1] ||
 	  extra_attr_lists[i] != extra_attr_lists[i-1])
 	{
-	  item = g_new (PangoItem, 1);
+          /* assert that previous item got at least one char */
+          g_assert (item == NULL || item->length > 0);
+          g_assert (item == NULL || item->num_chars > 0);
+          
+	  item = pango_item_new ();
 	  item->offset = p - text;
 	  item->num_chars = 0;
 	  item->analysis.level = embedding_levels[i];
Index: pango/pango-item.c
===================================================================
RCS file: /cvs/gnome/pango/pango/pango-item.c,v
retrieving revision 1.2
diff -u -u -r1.2 pango-item.c
--- pango/pango-item.c	2000/05/28 01:23:41	1.2
+++ pango/pango-item.c	2000/11/19 17:46:14
@@ -88,3 +88,48 @@
   g_free (item);
 }
 
+/**
+ * pango_item_split:
+ * @orig: a #PangoItem
+ * @split_index: byte index of position to split item, relative to the start of the item
+ * @split_offset: number of chars between start of @orig and @split_index
+ * 
+ * Modifies @orig to cover only the text after @split_index, and
+ * returns a new item that covers the text before @split_index that
+ * used to be in @orig. You can think of @split_index as the length of
+ * the returned item. @split_index may not be 0, and it may not be
+ * greater than or equal to the length of @orig (that is, there must
+ * be at least one byte assigned to each item, you can't create a
+ * zero-length item). @split_offset is the length of the first item in
+ * chars, and must be provided because the text used to generate the
+ * item isn't available, so pango_item_split() can't count the char
+ * length of the split items itself.
+ * 
+ * Return value: new item representing text before @split_index, or %NULL if an argument check fails
+ **/
+PangoItem*
+pango_item_split (PangoItem  *orig,
+                  int         split_index,
+                  int         split_offset)
+{
+  PangoItem *new_item;
+
+  g_return_val_if_fail (orig != NULL, NULL);
+  g_return_val_if_fail (orig->length > 0, NULL);
+  g_return_val_if_fail (split_index > 0, NULL);
+  g_return_val_if_fail (split_index < orig->length, NULL);
+  g_return_val_if_fail (split_offset > 0, NULL);
+  g_return_val_if_fail (split_offset < orig->num_chars, NULL);
+  /* Copy only after validation, so a failed check cannot leak the copy (or hand pango_item_copy() a NULL item) */
+  new_item = pango_item_copy (orig);
+  new_item->length = split_index;
+  new_item->num_chars = split_offset;
+
+  orig->offset += split_index;
+  orig->length -= split_index;
+  orig->num_chars -= split_offset;
+
+  g_assert (new_item->length > 0);
+  g_assert (orig->length > 0);
+  return new_item;
+}
Index: pango/pango-item.h
===================================================================
RCS file: /cvs/gnome/pango/pango/pango-item.h,v
retrieving revision 1.2
diff -u -u -r1.2 pango-item.h
--- pango/pango-item.h	2000/06/19 03:02:06	1.2
+++ pango/pango-item.h	2000/11/19 17:46:14
@@ -49,9 +49,12 @@
   PangoAnalysis analysis;
 };
 
-PangoItem *pango_item_new  (void);
-PangoItem *pango_item_copy (PangoItem *item);
-void       pango_item_free (PangoItem *item);
+PangoItem *pango_item_new   (void);
+PangoItem *pango_item_copy  (PangoItem  *item);
+void       pango_item_free  (PangoItem  *item);
+PangoItem *pango_item_split (PangoItem  *orig,
+                             int         split_index,
+                             int         split_offset);
 
 #ifdef __cplusplus
 }
Index: pango/pango-layout.c
===================================================================
RCS file: /cvs/gnome/pango/pango/pango-layout.c,v
retrieving revision 1.47
diff -u -u -r1.47 pango-layout.c
--- pango/pango-layout.c	2000/11/16 00:09:29	1.47
+++ pango/pango-layout.c	2000/11/19 17:46:14
@@ -2236,15 +2236,10 @@
 	  else
 	    {
 	      PangoItem *new_item = pango_item_copy (item);
-	      
+
 	      length = g_utf8_offset_to_pointer (text + item->offset, break_num_chars) - (text + item->offset);
-	      
-	      new_item->length = length;
-	      new_item->num_chars = break_num_chars;
-	      
-	      item->offset += length;
-	      item->length -= length;
-	      item->num_chars -= break_num_chars;
+
+              new_item = pango_item_split (item, length, break_num_chars);
 	      
 	      if (shape_set)
 		imposed_shape (item->num_chars, &shape_ink, &shape_logical, glyphs);
@@ -2374,9 +2369,9 @@
 }
 
 static void
-get_para_log_attrs (const char   *text,
-		    GList        *items,
-		    PangoLogAttr *log_attrs)
+get_items_log_attrs (const char   *text,
+                     GList        *items,
+                     PangoLogAttr *log_attrs)
 {
   int offset = 0;
   int index = 0;
@@ -2407,7 +2402,7 @@
 
 	  items = items->next;
 	}
-
+      
       pango_break (text + index, tmp_item.length, &tmp_item.analysis, log_attrs + offset);
 
       offset += tmp_item.num_chars;
@@ -2417,14 +2412,333 @@
     }
 }
 
+/* Byte length of the first @offset chars of @item's text within @text */
+static int
+item_offset_to_index (PangoItem *item, const char *text, int offset)
+{
+  const char *item_text = text + item->offset;
+  const char *target = g_utf8_offset_to_pointer (item_text, offset);
+
+  return target - item_text;
+}
+/* Detach and return the leading items of *items that precede the next paragraph boundary */
+static GList*
+get_up_to_boundary (GList       **items,
+                    const char   *text,
+                    int          *start_offset,
+                    PangoLogAttr *log_attrs,
+                    GList       **tail)
+{
+  GList *tmp_list;
+  GList *last_node = NULL;
+  int found_boundary = -1;
+  int last_item_start = -1;
+  int last_item_end = -1;
+  GList *retval;
+  PangoItem *last_item;
+  /* *start_offset: in, char offset of the first item; out, char offset where the leftover *items begin */
+  g_assert (*items != NULL);
+  /* log_attrs is indexed with the same char offsets as *start_offset */
+  tmp_list = *items;
+  while (tmp_list != NULL)
+    {
+      PangoItem *item = tmp_list->data;
+      int start = *start_offset;
+      int i = *start_offset;
+      int end = *start_offset + item->num_chars;
+      /* Remember the item being scanned; the code after "found" may split it */
+      last_node = tmp_list;      
+      last_item_start = start;
+      last_item_end = end;
+
+      /* make sure we didn't corrupt any items */
+      g_assert (item->num_chars > 0);
+      
+      /* skip the first char in the item list, otherwise we'd
+       * never move past boundaries.
+       */
+      if (*items == tmp_list)
+        ++i;
+      
+      while (i < end)
+        {          
+          if (log_attrs[i].is_paragraph_boundary)
+            {
+              found_boundary = i;
+              goto found;
+            }
+          
+          ++i;
+        }
+      /* No boundary in this item; the next item starts where this one ends */
+      *start_offset = end;
+      
+      tmp_list = g_list_next (tmp_list);
+    }
+  /* Falling out of the loop means no boundary was found: found_boundary is still -1 */
+ found:
+
+  /* Chop list */
+  retval = *items;
+  
+  /* store remaining items */
+  *items = last_node->next;
+  if (*items)
+    (*items)->prev = NULL;
+
+  /* terminate retval */
+  last_node->next = NULL;
+  
+  if (found_boundary >= 0)
+    {
+      /* Item contained a paragraph boundary; we need to split it, and
+       * start_offset still refers to the start of the item so needs
+       * updating.
+       */
+      last_item = last_node->data;
+      
+      if (found_boundary == *start_offset)
+        {
+          /* boundary at start of item; remove item from the
+           * list to be returned.
+           */
+          GList *tmp_node;
+          tmp_node = last_node->prev;
+          retval = g_list_delete_link (retval, last_node);
+          last_node = tmp_node;
+          *items = g_list_prepend (*items, last_item);
+        }
+      else
+        {
+          /* Split item at boundary */
+          int byte_start;
+          PangoItem *first_half;
+          
+          /* assert that we're in the middle and not outside the item */
+          g_assert (found_boundary > last_item_start);
+          g_assert (found_boundary < last_item_end);
+      
+          byte_start = item_offset_to_index (last_item, text,
+                                             found_boundary - last_item_start);
+          
+          first_half = pango_item_split (last_item, byte_start,
+                                         found_boundary - last_item_start);
+
+          /* Put second half into *items to be used later */
+          *items = g_list_prepend (*items, last_item);
+
+          /* Put first half into tail of retval */
+          last_node->data = first_half;
+          
+          *start_offset = found_boundary;
+        }
+    }
+  /* Report the last node of the returned list to the caller */
+  *tail = last_node;
+  return retval;
+}
+
+/* Strip trailing paragraph-delimiter chars from an item list, freeing items that become empty */
 static void
+strip_delimiters (GList       **items,
+                  GList        *tail,
+                  const char   *text,
+                  int           end_offset,
+                  PangoLogAttr *log_attrs)
+{
+  GList *tmp_list;
+  int i;
+  int not_delimiter = -1;
+  int item_start;
+  PangoItem *item;
+  /* tail: last node of *items; end_offset: char offset just past the last item */
+  tmp_list = tail;
+  /* Invariant at the top of the loop: i is the char offset just past the item to scan */
+  i = end_offset;
+    
+  while (tmp_list != NULL)
+    {
+      item = tmp_list->data;
+      item_start = i - item->num_chars;
+      /* Walk this item's chars from last to first, looking for a non-delimiter */
+      --i;
+      while (i >= item_start)
+        {          
+          if (!log_attrs[i].is_paragraph_delimiter)
+            {
+              not_delimiter = i;
+              goto found;
+            }
+          
+          --i;
+        }
+      i = item_start; /* scan left i at item_start - 1; resync so the previous item's extent is right */
+      tmp_list = g_list_previous (tmp_list);
+    }
+
+  /* If we get here, we never found a non-delimiter, so just blow away
+   * all the items in the list
+   */
+
+  g_list_foreach (*items, (GFunc)pango_item_free, NULL);
+  g_list_free (*items);
+  *items = NULL;
+
+  return;
+  
+ found:
+  {
+    /* Chop delimiters off the end of item, then nuke all
+     * remaining elements of the list
+     */    
+    int delim_len_chars = item_start + item->num_chars - not_delimiter - 1;
+    int byte_start;
+    GList *junk;
+
+    g_assert (not_delimiter >= item_start);
+    g_assert (delim_len_chars < item->num_chars);
+
+    if (delim_len_chars > 0)
+      {
+        item->num_chars -= delim_len_chars;
+        
+        byte_start = item_offset_to_index (item, text, item->num_chars);
+        
+        item->length = byte_start;
+        
+        g_assert (item->length > 0);
+      }
+    
+    junk = tmp_list->next;
+    tmp_list->next = NULL;
+    if (junk)
+      {
+        junk->prev = NULL;
+        g_list_foreach (junk, (GFunc)pango_item_free, NULL);
+        g_list_free (junk);
+      }
+  }
+}
+/* Debugging aid: print each item's text and its paragraph boundary/delimiter flags (currently disabled) */
+static void
+spew_item_list (GList      *items,
+                const char *desc,
+                const char *text,
+                PangoLogAttr *log_attrs)
+{
+  GList *tmp_list;
+  /* The early return disables the dump; remove it to enable the printf output below */
+  return;
+  
+  printf (">%s\n", desc);
+  
+  tmp_list = items;
+  while (tmp_list != NULL)
+    {
+      PangoItem *item = tmp_list->data;
+      char *str = g_strndup (text + item->offset, item->length);
+      int i;
+      int start_char;
+      const char *p;
+      
+      start_char = g_utf8_strlen (text, item->offset);
+      
+      printf (" \t%d \t%d \t%s\n",
+              item->offset, item->length,
+              str);
+      printf (" \t  \t  \t");
+      /* One column per char: the byte offset where a paragraph boundary is flagged, else 0 */
+      i = start_char;
+      p = text + item->offset;
+      while (p != (text + item->offset + item->length))
+        {
+          if (log_attrs[i].is_paragraph_boundary)
+            printf ("%d", (int) (p - text)); /* pointer difference is not an int; cast to match %d */
+          else
+            printf ("0");
+          
+          p = g_utf8_next_char (p);
+          ++i;
+        }
+
+      printf ("\n");
+
+      /* Same layout for the paragraph delimiter flag */
+      printf (" \t  \t  \t");
+      
+      i = start_char;
+      p = text + item->offset;
+      while (p != (text + item->offset + item->length))
+        {
+          if (log_attrs[i].is_paragraph_delimiter)
+            printf ("%d", (int) (p - text)); /* pointer difference is not an int; cast to match %d */
+          else
+            printf ("0");
+          
+          p = g_utf8_next_char (p);
+          ++i;
+        }
+
+      printf ("\n");
+      
+      g_free (str);
+      
+      tmp_list = tmp_list->next;
+    }
+}
+
+/* Pull one paragraph worth of items off of the list of items; strip
+ * out paragraph separators; start_offset is an inout param that
+ * comes in as the char offset of the first item in the list,
+ * and goes out as the char offset of the first item in the modified
+ * list. *start_offset == layout->n_chars when we run out of items.
+ * get_para_items() can return NULL before that though if the input
+ * text has adjacent paragraph separators (empty paragraphs)
+ *
+ * This initially looks more complicated than it needs to be, but
+ * remember that you can have a bunch of empty paragraphs in series
+ * e.g. "\n\n\n" will have boundaries FALSE/TRUE/TRUE and delimiters
+ * TRUE/TRUE/TRUE, that's a nasty one, since the boundary of one
+ * paragraph is a delimiter for another. Also remember a block of
+ * delimiters such as \r\n can be split between two items.
+ * 
+ * Assumptions made about delimiters and boundaries:
+ *  - one or more delimiters occur in a contiguous series before
+ *    each boundary
+ *  - delimiters only occur before a boundary
+ *  - boundaries occur between paragraphs, so there is at least one
+ *    char on either side of a boundary, no boundary at the start of
+ *    the text
+ */
+static GList*
+get_para_items (GList       **items,
+                const char   *text,
+                int          *start_offset,
+                PangoLogAttr *log_attrs)
+{
+  GList *tail = NULL;
+  GList *retval;
+
+  spew_item_list (*items, "initial", text, log_attrs);
+  /* Step 1: cut the list at the next paragraph boundary; tail receives the last returned node */
+  retval = get_up_to_boundary (items, text, start_offset, log_attrs,
+                               &tail);
+
+  spew_item_list (retval, "this para", text, log_attrs);
+  spew_item_list (*items, "leftover", text, log_attrs);
+  /* Step 2: drop the delimiter chars at the end of this paragraph; *start_offset is now its end */
+  strip_delimiters (&retval, tail, text, *start_offset, log_attrs);
+
+  spew_item_list (retval, "stripped", text, log_attrs);
+  /* retval is NULL here if strip_delimiters() found nothing but delimiters */
+  return retval;
+}
+
+static void
 pango_layout_check_lines (PangoLayout *layout)
 {
-  const char *start;
-  gboolean done = FALSE;
-  int start_offset;
   PangoAttrList *attrs;
-  PangoAttrIterator *iter;
+  GList *all_items;
   
   if (layout->lines)
     return;
@@ -2460,66 +2774,52 @@
       pango_attr_list_insert_before (attrs, attr);
     }
 
-  iter = pango_attr_list_get_iterator (attrs);
+  all_items = pango_itemize (layout->context,
+                             layout->text,
+                             0,
+                             layout->length,
+                             attrs,
+                             NULL);
   
   layout->log_attrs = g_new (PangoLogAttr, layout->n_chars);
-  
-  start_offset = 0;
-  start = layout->text;
-  do
-    {
-      int para_chars = 0;
-      const char *end = start;
-      ParaBreakState state;
-  
-      while (end != layout->text + layout->length && *end != '\n')
-	{
-	  end = g_utf8_next_char (end);
-	  para_chars++;
-	}
-      
-      if (end == layout->text + layout->length)
-	done = TRUE;
 
-      state.items = pango_itemize (layout->context,
-				   layout->text,
-				   start - layout->text,
-				   end - start,
-				   attrs,
-				   iter);
+  get_items_log_attrs (layout->text, all_items, layout->log_attrs);
 
-      get_para_log_attrs (start, state.items, layout->log_attrs + start_offset);
+  if (all_items == NULL)
+    {
+      layout->lines = g_slist_prepend (layout->lines,
+                                       pango_layout_line_new (layout));
+    }
+  else
+    {
+      int start_offset = 0;
 
-      if (state.items)
-	{
-	  state.first_line = TRUE;
-	  state.start_offset = start_offset;
-	  state.text = start;
-	  
-	  while (state.items)
-	    process_line (layout, &state);
-	}
-      else
-	layout->lines = g_slist_prepend (layout->lines,
-					 pango_layout_line_new (layout));
+      while (start_offset != layout->n_chars)
+        {
+          ParaBreakState state;      
+          int this_para_start = start_offset;
       
-      start_offset += para_chars;
-
-      if (!done)
-	{
-	  /* Handle newline */
-	  layout->log_attrs[start_offset].is_break = TRUE;
-	  layout->log_attrs[start_offset].is_white = TRUE;
-	  layout->log_attrs[start_offset].is_char_stop = TRUE;
-	  layout->log_attrs[start_offset].is_word_stop = TRUE;
-	  start_offset += 1;
-
-	  start = end + 1;
-	}
+          state.items = get_para_items (&all_items,
+                                        layout->text,
+                                        &start_offset,
+                                        layout->log_attrs);
+
+          if (state.items)
+            {
+              PangoItem *first_item = state.items->data;
+          
+              state.first_line = TRUE;
+              state.start_offset = this_para_start;
+              state.text = layout->text + first_item->offset;
+	  
+              while (state.items)
+                process_line (layout, &state);
+            }
+          else
+            layout->lines = g_slist_prepend (layout->lines,
+                                             pango_layout_line_new (layout));
+        }
     }
-  while (!done);
-
-  pango_attr_iterator_destroy (iter);
   
   if (attrs != layout->attrs)
     pango_attr_list_unref (attrs);
Index: pango/pango.h
===================================================================
RCS file: /cvs/gnome/pango/pango/pango.h,v
retrieving revision 1.8
diff -u -u -r1.8 pango.h
--- pango/pango.h	2000/07/06 23:32:42	1.8
+++ pango/pango.h	2000/11/19 17:46:14
@@ -39,14 +39,44 @@
 #include <pango/pango-layout.h>
 #include <pango/pango-types.h>
 
-/* Logical attributes of a character
+/* Logical attributes of a character.
  */
 struct _PangoLogAttr
 {
-  guint is_break : 1;  /* Break in front of character */
-  guint is_white : 1;
-  guint is_char_stop : 1;
-  guint is_word_stop : 1;
+  guint is_break : 1;           /* Can break line in front of character */
+  guint is_white : 1;           /* Whitespace character */
+
+  /* cursor can appear in front of character (i.e. this is a grapheme
+   * boundary, or the first character in the text)
+   */
+  guint is_cursor_position : 1;
+
+  /* Boundary between paragraphs (paragraph ends in front of character) */
+  guint is_paragraph_boundary : 1;
+  /* Character is a paragraph delimiter, such as '\n' or '\r'; one or
+   * more of these will precede any paragraph boundary
+   */
+  guint is_paragraph_delimiter : 1;
+  
+  /* Note that in degenerate cases, you could have both start/end set on
+   * some text, most likely for sentences (e.g. no space after a period, so
+   * the next sentence starts right away)
+   */
+  
+  guint is_word_start : 1;      /* first character in a word */
+  guint is_word_end   : 1;      /* is first non-word char after a word */
+
+  /* There are two ways to divide sentences. The first assigns all
+   * intersentence whitespace/control/format chars to some sentence,
+   * so all chars are in some sentence; is_sentence_boundary denotes
+   * the boundaries there. The second way doesn't assign
+   * between-sentence spaces, etc. to any sentence, so
+   * is_sentence_start/is_sentence_end mark the boundaries of those
+   * sentences.
+   */
+  guint is_sentence_boundary : 1;
+  guint is_sentence_start : 1;  /* first character in a sentence */
+  guint is_sentence_end : 1;    /* first non-sentence char after a sentence */  
 };
 
 /* Determine information about cluster/word/line breaks in a string
Index: modules/arabic/arabic.c
===================================================================
RCS file: /cvs/gnome/pango/modules/arabic/arabic.c,v
retrieving revision 1.13
diff -u -u -r1.13 arabic.c
--- modules/arabic/arabic.c	2000/11/02 21:22:13	1.13
+++ modules/arabic/arabic.c	2000/11/19 17:46:14
@@ -50,36 +50,6 @@
  * Language script engine
  */
 
-static void 
-arabic_engine_break (const char    *text,
-                     int            len,
-                     PangoAnalysis *analysis,
-                     PangoLogAttr  *attrs)
-{
-    /* Most of the code comes from tamil_engine_break
-     */
-
-    const char *cur = text;
-    gint        i  = 0;
-    gunichar    wc;
-
-    while (*cur && cur - text < len)
-        {
-            wc = g_utf8_get_char (cur);
-            if (wc == (gunichar)-1)
-                break;           /* FIXME: ERROR */
-
-            attrs[i].is_white = (wc == ' ' || wc == '\t' || wc == 'n') ? 1 : 0;
-            attrs[i].is_break = (i > 0 && attrs[i-1].is_white) || attrs[i].is_white;
-            attrs[i].is_char_stop = 1;
-            attrs[i].is_word_stop = (i == 0) || attrs[i-1].is_white;
-            /* actually, is_word_stop in not correct, but simple and good enough. */
-            
-            i++;
-            cur = g_utf8_next_char (cur);
-        }
-}
-
 static PangoEngine *
 arabic_engine_lang_new ()
 {
@@ -90,7 +60,7 @@
     result->engine.id = "ArabicScriptEngine";
     result->engine.type = PANGO_ENGINE_TYPE_LANG;
     result->engine.length = sizeof (result);
-    result->script_break = arabic_engine_break;
+    result->script_break = NULL;
 
     return (PangoEngine *)result;
 }
Index: modules/basic/basic.c
===================================================================
RCS file: /cvs/gnome/pango/modules/basic/basic.c,v
retrieving revision 1.16
diff -u -u -r1.16 basic.c
--- modules/basic/basic.c	2000/11/12 23:45:11	1.16
+++ modules/basic/basic.c	2000/11/19 17:46:15
@@ -97,14 +97,6 @@
  * Language script engine
  */
 
-static void 
-basic_engine_break (const char     *text,
-		    gint            len,
-		    PangoAnalysis  *analysis,
-		    PangoLogAttr   *attrs)
-{
-}
-
 static PangoEngine *
 basic_engine_lang_new ()
 {
@@ -115,7 +107,7 @@
   result->engine.id = "BasicScriptEngine";
   result->engine.type = PANGO_ENGINE_TYPE_LANG;
   result->engine.length = sizeof (result);
-  result->script_break = basic_engine_break;
+  result->script_break = NULL;
 
   return (PangoEngine *)result;
 }
Index: modules/hangul/hangul.c
===================================================================
RCS file: /cvs/gnome/pango/modules/hangul/hangul.c,v
retrieving revision 1.9
diff -u -u -r1.9 hangul.c
--- modules/hangul/hangul.c	2000/11/13 02:44:41	1.9
+++ modules/hangul/hangul.c	2000/11/19 17:46:15
@@ -56,15 +56,6 @@
  * Language script engine
  */
 
-static void 
-hangul_engine_break (const char    *text,
-		     int            len,
-		     PangoAnalysis *analysis,
-		     PangoLogAttr  *attrs)
-{
-  /* (FIXME) */
-}
-
 static PangoEngine *
 hangul_engine_lang_new ()
 {
@@ -75,7 +66,7 @@
   result->engine.id = "HangulScriptEngine";
   result->engine.type = PANGO_ENGINE_TYPE_LANG;
   result->engine.length = sizeof (result);
-  result->script_break = hangul_engine_break;
+  result->script_break = NULL;
 
   return (PangoEngine *) result;
 }
Index: modules/indic/bengali.c
===================================================================
RCS file: /cvs/gnome/pango/modules/indic/bengali.c,v
retrieving revision 1.1
diff -u -u -r1.1 bengali.c
--- modules/indic/bengali.c	2000/11/08 06:07:46	1.1
+++ modules/indic/bengali.c	2000/11/19 17:46:15
@@ -285,34 +285,6 @@
   return (PangoEngine *) result;
 }
 
-static void
-pango_indic_engine_break (const char *text,
-			 int len,
-			 PangoAnalysis * analysis, PangoLogAttr * attrs)
-{
-  const char *cur = text;
-  gint i = 0;
-  gunichar wc;
-
-  while (*cur && cur - text < len)
-    {
-      wc = g_utf8_get_char (cur);
-      if (wc == (gunichar)-1)
-	break;			/* FIXME: ERROR */
-
-      attrs[i].is_white = (wc == ' ' || wc == '\t' || wc == 'n') ? 1 : 0;
-      attrs[i].is_break = (i > 0 && attrs[i - 1].is_white) ||
-	attrs[i].is_white;
-      attrs[i].is_char_stop = 1;
-      attrs[i].is_word_stop = (i == 0) || attrs[i - 1].is_white;
-      /* actually, is_word_stop in not correct, but simple and good enough. */
-
-      i++;
-      cur = g_utf8_next_char (cur);
-    }
-}
-
-
 static PangoEngine *
 pango_indic_engine_lang_new ()
 {
@@ -323,7 +295,7 @@
   result->engine.id = SCRIPT_STRING "ScriptEngine";
   result->engine.type = PANGO_ENGINE_TYPE_LANG;
   result->engine.length = sizeof (result);
-  result->script_break = pango_indic_engine_break;
+  result->script_break = NULL;
 
   return (PangoEngine *) result;
 }
Index: modules/indic/devanagari.c
===================================================================
RCS file: /cvs/gnome/pango/modules/indic/devanagari.c,v
retrieving revision 1.1
diff -u -u -r1.1 devanagari.c
--- modules/indic/devanagari.c	2000/11/08 06:07:46	1.1
+++ modules/indic/devanagari.c	2000/11/19 17:46:15
@@ -338,34 +338,6 @@
   return (PangoEngine *) result;
 }
 
-static void
-pango_indic_engine_break (const char *text,
-		    int len,
-		    PangoAnalysis * analysis, PangoLogAttr * attrs)
-{
-  const char *cur = text;
-  gint i = 0;
-  gunichar wc;
-
-  while (*cur && cur - text < len)
-    {
-      wc = g_utf8_get_char (cur);
-      if (wc == (gunichar)-1)
-	break;			/* FIXME: ERROR */
-
-      attrs[i].is_white = (wc == ' ' || wc == '\t' || wc == 'n') ? 1 : 0;
-      attrs[i].is_break = (i > 0 && attrs[i - 1].is_white) ||
-	attrs[i].is_white;
-      attrs[i].is_char_stop = 1;
-      attrs[i].is_word_stop = (i == 0) || attrs[i - 1].is_white;
-      /* actually, is_word_stop in not correct, but simple and good enough. */
-
-      i++;
-      cur = g_utf8_next_char (cur);
-    }
-}
-
-
 static PangoEngine *
 pango_indic_engine_lang_new ()
 {
@@ -376,7 +348,7 @@
   result->engine.id = SCRIPT_STRING "ScriptEngine";
   result->engine.type = PANGO_ENGINE_TYPE_LANG;
   result->engine.length = sizeof (result);
-  result->script_break = pango_indic_engine_break;
+  result->script_break = NULL;
 
   return (PangoEngine *) result;
 }
Index: modules/indic/gujarati.c
===================================================================
RCS file: /cvs/gnome/pango/modules/indic/gujarati.c,v
retrieving revision 1.1
diff -u -u -r1.1 gujarati.c
--- modules/indic/gujarati.c	2000/11/08 06:07:46	1.1
+++ modules/indic/gujarati.c	2000/11/19 17:46:15
@@ -314,34 +314,6 @@
   return (PangoEngine *) result;
 }
 
-static void
-pango_indic_engine_break (const char *text,
-			 int len,
-			 PangoAnalysis * analysis, PangoLogAttr * attrs)
-{
-  const char *cur = text;
-  gint i = 0;
-  gunichar wc;
-
-  while (*cur && cur - text < len)
-    {
-      wc = g_utf8_get_char (cur);
-      if (wc == (gunichar)-1)
-	break;			/* FIXME: ERROR */
-
-      attrs[i].is_white = (wc == ' ' || wc == '\t' || wc == 'n') ? 1 : 0;
-      attrs[i].is_break = (i > 0 && attrs[i - 1].is_white) ||
-	attrs[i].is_white;
-      attrs[i].is_char_stop = 1;
-      attrs[i].is_word_stop = (i == 0) || attrs[i - 1].is_white;
-      /* actually, is_word_stop in not correct, but simple and good enough. */
-
-      i++;
-      cur = g_utf8_next_char (cur);
-    }
-}
-
-
 static PangoEngine *
 pango_indic_engine_lang_new ()
 {
@@ -352,7 +324,7 @@
   result->engine.id = SCRIPT_STRING "ScriptEngine";
   result->engine.type = PANGO_ENGINE_TYPE_LANG;
   result->engine.length = sizeof (result);
-  result->script_break = pango_indic_engine_break;
+  result->script_break = NULL;
 
   return (PangoEngine *) result;
 }
Index: modules/indic/gurmukhi.c
===================================================================
RCS file: /cvs/gnome/pango/modules/indic/gurmukhi.c,v
retrieving revision 1.1
diff -u -u -r1.1 gurmukhi.c
--- modules/indic/gurmukhi.c	2000/11/08 06:07:46	1.1
+++ modules/indic/gurmukhi.c	2000/11/19 17:46:15
@@ -228,34 +228,6 @@
   return (PangoEngine *) result;
 }
 
-static void
-pango_indic_engine_break (const char *text,
-			 int len,
-			 PangoAnalysis * analysis, PangoLogAttr * attrs)
-{
-  const char *cur = text;
-  gint i = 0;
-  gunichar wc;
-
-  while (*cur && cur - text < len)
-    {
-      wc = g_utf8_get_char (cur);
-      if (wc == (gunichar)-1)
-	break;			/* FIXME: ERROR */
-
-      attrs[i].is_white = (wc == ' ' || wc == '\t' || wc == 'n') ? 1 : 0;
-      attrs[i].is_break = (i > 0 && attrs[i - 1].is_white) ||
-	attrs[i].is_white;
-      attrs[i].is_char_stop = 1;
-      attrs[i].is_word_stop = (i == 0) || attrs[i - 1].is_white;
-      /* actually, is_word_stop in not correct, but simple and good enough. */
-
-      i++;
-      cur = g_utf8_next_char (cur);
-    }
-}
-
-
 static PangoEngine *
 pango_indic_engine_lang_new ()
 {
@@ -266,7 +238,7 @@
   result->engine.id = SCRIPT_STRING "ScriptEngine";
   result->engine.type = PANGO_ENGINE_TYPE_LANG;
   result->engine.length = sizeof (result);
-  result->script_break = pango_indic_engine_break;
+  result->script_break = NULL;
 
   return (PangoEngine *) result;
 }
Index: modules/indic/myanmar.c
===================================================================
RCS file: /cvs/gnome/pango/modules/indic/myanmar.c,v
retrieving revision 1.1
diff -u -u -r1.1 myanmar.c
--- modules/indic/myanmar.c	2000/11/08 06:07:46	1.1
+++ modules/indic/myanmar.c	2000/11/19 17:46:15
@@ -208,34 +208,6 @@
   return (PangoEngine *) result;
 }
 
-static void
-pango_engine_break (const char *text,
-			 int len,
-			 PangoAnalysis * analysis, PangoLogAttr * attrs)
-{
-  const char *cur = text;
-  gint i = 0;
-  gunichar wc;
-
-  while (*cur && cur - text < len)
-    {
-      wc = g_utf8_get_char (cur);
-      if (wc == (gunichar)-1)
-	break;			/* FIXME: ERROR */
-
-      attrs[i].is_white = (wc == ' ' || wc == '\t' || wc == 'n') ? 1 : 0;
-      attrs[i].is_break = (i > 0 && attrs[i - 1].is_white) ||
-	attrs[i].is_white;
-      attrs[i].is_char_stop = 1;
-      attrs[i].is_word_stop = (i == 0) || attrs[i - 1].is_white;
-      /* actually, is_word_stop in not correct, but simple and good enough. */
-
-      i++;
-      cur = g_utf8_next_char (cur);
-    }
-}
-
-
 static PangoEngine *
 pango_engine_lang_new ()
 {
@@ -246,7 +218,7 @@
   result->engine.id = SCRIPT_STRING "ScriptEngine";
   result->engine.type = PANGO_ENGINE_TYPE_LANG;
   result->engine.length = sizeof (result);
-  result->script_break = pango_engine_break;
+  result->script_break = NULL;
 
   return (PangoEngine *) result;
 }
Index: modules/tamil/tamil.c
===================================================================
RCS file: /cvs/gnome/pango/modules/tamil/tamil.c,v
retrieving revision 1.7
diff -u -u -r1.7 tamil.c
--- modules/tamil/tamil.c	2000/06/21 16:13:40	1.7
+++ modules/tamil/tamil.c	2000/11/19 17:46:15
@@ -36,36 +36,6 @@
  * Language script engine
  */
 
-static void 
-tamil_engine_break (const char   *text,
-		    int            len,
-		    PangoAnalysis *analysis,
-		    PangoLogAttr  *attrs)
-{
-/* Most of the code comes from pango_break
- * only difference is char stop based on modifiers
- */
-
-  const char *cur = text;
-  gint i = 0;
-  gunichar wc;
-
-  while (*cur && cur - text < len)
-    {
-      wc = g_utf8_get_char (cur);
-      if (wc == (gunichar)-1)
-	break;           /* FIXME: ERROR */
-
-      attrs[i].is_white = (wc == ' ' || wc == '\t' || wc == 'n') ? 1 : 0;
-      attrs[i].is_break = (i > 0 && attrs[i-1].is_white) || attrs[i].is_white;
-      attrs[i].is_char_stop = (is_uni_modi(wc)) ? 0 : 1;
-      attrs[i].is_word_stop = (i == 0) || attrs[i-1].is_white;
-
-      i++;
-      cur = g_utf8_next_char (cur);
-    }
-}
-
 static PangoEngine *
 tamil_engine_lang_new ()
 {
@@ -76,7 +46,7 @@
   result->engine.id = "TamilScriptEngine";
   result->engine.type = PANGO_ENGINE_TYPE_LANG;
   result->engine.length = sizeof (result);
-  result->script_break = tamil_engine_break;
+  result->script_break = NULL;
 
   return (PangoEngine *)result;
 }




[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]