[pango] Update pango_default_break function for Sentence Boundary

From: Matthias Clasen <matthiasc src gnome org>
To: commits-list gnome org
Cc:
Subject: [pango] Update pango_default_break function for Sentence Boundary
Date: Mon, 31 Jul 2017 17:53:10 +0000 (UTC)
commit 284d357e3d6e29c1437ca18bab347c1af8330908
Author: Peng Wu <alexepico gmail com>
Date:   Wed Jul 5 15:05:16 2017 +0800

    Update pango_default_break function for Sentence Boundary
    
    Re-write the code for Sentence Boundary,
    and use the code style like Grapheme Boundary and Word Boundary.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=782813

 pango/break.c |  748 +++++++++++++++++++++++----------------------------------
 1 files changed, 305 insertions(+), 443 deletions(-)
---
diff --git a/pango/break.c b/pango/break.c
index 5b2128d..1c36d49 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -432,27 +432,6 @@ static const CharJamoProps HangulJamoProps[] = {
 #define HANGUL(wc) ((wc) >= 0xAC00 && (wc) <= 0xD7A3)
 #define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA(wc) && !HANGUL(wc))
 
-/* p. 132-133 of Unicode spec table 5-6 will help understand this */
-typedef enum
-{
-  STATE_SENTENCE_OUTSIDE,
-  STATE_SENTENCE_BODY,
-  STATE_SENTENCE_TERM,
-  STATE_SENTENCE_POST_TERM_CLOSE,
-  STATE_SENTENCE_POST_TERM_SPACE,
-  STATE_SENTENCE_POST_TERM_SEP,
-  STATE_SENTENCE_DOT,
-  STATE_SENTENCE_POST_DOT_CLOSE,
-  STATE_SENTENCE_POST_DOT_SPACE,
-  STATE_SENTENCE_POST_DOT_OPEN,
-  /* never include line/para separators in a sentence for now */
-  /* This isn't in the spec, but I can't figure out why they'd include
-   * one line/para separator in lines ending with Term but not with
-   * period-terminated lines, so I'm doing it for the dot lines also
-   */
-  STATE_SENTENCE_POST_DOT_SEP
-} SentenceState;
-
 /* Previously "123foo" was two words. But in UAX 29 of Unicode, 
  * we know don't break words between consecutive letters and numbers
  */
@@ -508,7 +487,6 @@ pango_default_break (const gchar   *text,
   JamoType prev_jamo;
 
   GUnicodeBreakType next_break_type;
-  GUnicodeType prev_type;
   GUnicodeBreakType prev_break_type; /* skips spaces */
   gboolean prev_was_break_space;
 
@@ -553,17 +531,34 @@ pango_default_break (const gchar   *text,
   WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
   gint prev_WB_i = -1;
 
+  /* See Sentence_Break Property Values table of UAX#29 */
+  typedef enum
+  {
+    SB_Other,
+    SB_ExtendFormat,
+    SB_ParaSep,
+    SB_Sp,
+    SB_Lower,
+    SB_Upper,
+    SB_OLetter,
+    SB_Numeric,
+    SB_ATerm,
+    SB_SContinue,
+    SB_STerm,
+    SB_Close,
+    /* Rules SB8 and SB8a */
+    SB_ATerm_Close_Sp,
+    SB_STerm_Close_Sp,
+  } SentenceBreakType;
+  SentenceBreakType prev_prev_SB_type = SB_Other, prev_SB_type = SB_Other;
+  gint prev_SB_i = -1;
+
   WordType current_word_type = WordNone;
   gunichar last_word_letter = 0;
   gunichar base_character = 0;
 
-  SentenceState sentence_state = STATE_SENTENCE_OUTSIDE;
-  /* Tracks what will be the end of the sentence if a period is
-   * determined to actually be a sentence-ending period.
-   */
-  gint possible_sentence_end = -1;
-  /* possible sentence break before Open* after a period-ended sentence */
-  gint possible_sentence_boundary = -1;
+  gint last_sentence_start = -1;
+
   gboolean almost_done = FALSE;
   gboolean done = FALSE;
 
@@ -572,7 +567,6 @@ pango_default_break (const gchar   *text,
 
   next = text;
 
-  prev_type = G_UNICODE_PARAGRAPH_SEPARATOR;
   prev_break_type = G_UNICODE_BREAK_UNKNOWN;
   prev_was_break_space = FALSE;
   prev_wc = 0;
@@ -601,6 +595,7 @@ pango_default_break (const gchar   *text,
       /* UAX#29 boundaries */
       gboolean is_grapheme_boundary;
       gboolean is_word_boundary;
+      gboolean is_sentence_boundary;
 
 
       wc = next_wc;
@@ -1078,6 +1073,276 @@ pango_default_break (const gchar   *text,
        attrs[i].is_word_boundary = is_word_boundary;
       }
 
+      /* ---- UAX#29 Sentence Boundaries ---- */
+      {
+       is_sentence_boundary = FALSE;
+       if (is_word_boundary ||
+           wc == '\r' || wc == '\n') /* Rules SB3 and SB5 */
+         {
+           SentenceBreakType SB_type;
+
+           /* Find the SentenceBreakType of wc */
+           SB_type = SB_Other;
+
+           if (break_type == G_UNICODE_BREAK_NUMERIC)
+             SB_type = SB_Numeric; /* Numeric */
+
+           if (SB_type == SB_Other)
+             switch ((int) type)
+               {
+               case G_UNICODE_CONTROL:
+                 if (wc == '\r' || wc == '\n')
+                   SB_type = SB_ParaSep;
+                 else if (wc == 0x0009 || wc == 0x000B || wc == 0x000C)
+                   SB_type = SB_Sp;
+                 else if (wc == 0x0085)
+                   SB_type = SB_ParaSep;
+                 break;
+
+               case G_UNICODE_SPACE_SEPARATOR:
+                 if (wc == 0x0020 || wc == 0x00A0 || wc == 0x1680 ||
+                     (wc >= 0x2000 && wc <= 0x200A) ||
+                     wc == 0x202F || wc == 0x205F || wc == 0x3000)
+                   SB_type = SB_Sp;
+                 break;
+
+               case G_UNICODE_LINE_SEPARATOR:
+               case G_UNICODE_PARAGRAPH_SEPARATOR:
+                 SB_type = SB_ParaSep;
+                 break;
+
+               case G_UNICODE_FORMAT:
+               case G_UNICODE_SPACING_MARK:
+               case G_UNICODE_ENCLOSING_MARK:
+               case G_UNICODE_NON_SPACING_MARK:
+                 SB_type = SB_ExtendFormat; /* Extend, Format */
+                 break;
+
+               case G_UNICODE_MODIFIER_LETTER:
+                 if (wc >= 0xFF9E && wc <= 0xFF9F)
+                   SB_type = SB_ExtendFormat; /* Other_Grapheme_Extend */
+                 break;
+
+               case G_UNICODE_TITLECASE_LETTER:
+                 SB_type = SB_Upper;
+                 break;
+
+               case G_UNICODE_DASH_PUNCTUATION:
+                 if (wc == 0x002D ||
+                     (wc >= 0x2013 && wc <= 0x2014) ||
+                     (wc >= 0xFE31 && wc <= 0xFE32) ||
+                     wc == 0xFE58 ||
+                     wc == 0xFE63 ||
+                     wc == 0xFF0D)
+                   SB_type = SB_SContinue;
+                 break;
+
+               case G_UNICODE_OTHER_PUNCTUATION:
+                 if (wc == 0x05F3)
+                   SB_type = SB_OLetter;
+                 else if (wc == 0x002E || wc == 0x2024 ||
+                     wc == 0xFE52 || wc == 0xFF0E)
+                   SB_type = SB_ATerm;
+
+                 if (wc == 0x002C ||
+                     wc == 0x003A ||
+                     wc == 0x055D ||
+                     (wc >= 0x060C && wc <= 0x060D) ||
+                     wc == 0x07F8 ||
+                     wc == 0x1802 ||
+                     wc == 0x1808 ||
+                     wc == 0x3001 ||
+                     (wc >= 0xFE10 && wc <= 0xFE11) ||
+                     wc == 0xFE13 ||
+                     (wc >= 0xFE50 && wc <= 0xFE51) ||
+                     wc == 0xFE55 ||
+                     wc == 0xFF0C ||
+                     wc == 0xFF1A ||
+                     wc == 0xFF64)
+                   SB_type = SB_SContinue;
+
+                 if (wc == 0x0021 ||
+                     wc == 0x003F ||
+                     wc == 0x0589 ||
+                     wc == 0x061F ||
+                     wc == 0x06D4 ||
+                     (wc >= 0x0700 && wc <= 0x0702) ||
+                     wc == 0x07F9 ||
+                     (wc >= 0x0964 && wc <= 0x0965) ||
+                     (wc >= 0x104A && wc <= 0x104B) ||
+                     wc == 0x1362 ||
+                     (wc >= 0x1367 && wc <= 0x1368) ||
+                     wc == 0x166E ||
+                     (wc >= 0x1735 && wc <= 0x1736) ||
+                     wc == 0x1803 ||
+                     wc == 0x1809 ||
+                     (wc >= 0x1944 && wc <= 0x1945) ||
+                     (wc >= 0x1AA8 && wc <= 0x1AAB) ||
+                     (wc >= 0x1B5A && wc <= 0x1B5B) ||
+                     (wc >= 0x1B5E && wc <= 0x1B5F) ||
+                     (wc >= 0x1C3B && wc <= 0x1C3C) ||
+                     (wc >= 0x1C7E && wc <= 0x1C7F) ||
+                     (wc >= 0x203C && wc <= 0x203D) ||
+                     (wc >= 0x2047 && wc <= 0x2049) ||
+                     wc == 0x2E2E ||
+                     wc == 0x2E3C ||
+                     wc == 0x3002 ||
+                     wc == 0xA4FF ||
+                     (wc >= 0xA60E && wc <= 0xA60F) ||
+                     wc == 0xA6F3 ||
+                     wc == 0xA6F7 ||
+                     (wc >= 0xA876 && wc <= 0xA877) ||
+                     (wc >= 0xA8CE && wc <= 0xA8CF) ||
+                     wc == 0xA92F ||
+                     (wc >= 0xA9C8 && wc <= 0xA9C9) ||
+                     (wc >= 0xAA5D && wc <= 0xAA5F) ||
+                     (wc >= 0xAAF0 && wc <= 0xAAF1) ||
+                     wc == 0xABEB ||
+                     (wc >= 0xFE56 && wc <= 0xFE57) ||
+                     wc == 0xFF01 ||
+                     wc == 0xFF1F ||
+                     wc == 0xFF61 ||
+                     (wc >= 0x10A56 && wc <= 0x10A57) ||
+                     (wc >= 0x11047 && wc <= 0x11048) ||
+                     (wc >= 0x110BE && wc <= 0x110C1) ||
+                     (wc >= 0x11141 && wc <= 0x11143) ||
+                     (wc >= 0x111C5 && wc <= 0x111C6) ||
+                     wc == 0x111CD ||
+                     (wc >= 0x111DE && wc <= 0x111DF) ||
+                     (wc >= 0x11238 && wc <= 0x11239) ||
+                     (wc >= 0x1123B && wc <= 0x1123C) ||
+                     wc == 0x112A9 ||
+                     (wc >= 0x1144B && wc <= 0x1144C) ||
+                     (wc >= 0x115C2 && wc <= 0x115C3) ||
+                     (wc >= 0x115C9 && wc <= 0x115D7) ||
+                     (wc >= 0x11641 && wc <= 0x11642) ||
+                     (wc >= 0x1173C && wc <= 0x1173E) ||
+                     (wc >= 0x11C41 && wc <= 0x11C42) ||
+                     (wc >= 0x16A6E && wc <= 0x16A6F) ||
+                     wc == 0x16AF5 ||
+                     (wc >= 0x16B37 && wc <= 0x16B38) ||
+                     wc == 0x16B44 ||
+                     wc == 0x1BC9F ||
+                     wc == 0x1DA88)
+                   SB_type = SB_STerm;
+
+                 break;
+               }
+
+           if (SB_type == SB_Other)
+             {
+               if (g_unichar_islower(wc))
+                 SB_type = SB_Lower;
+               else if (g_unichar_isupper(wc))
+                 SB_type = SB_Upper;
+               else if (g_unichar_isalpha(wc))
+                 SB_type = SB_OLetter;
+
+               if (type == G_UNICODE_OPEN_PUNCTUATION ||
+                   type == G_UNICODE_CLOSE_PUNCTUATION ||
+                   break_type == G_UNICODE_BREAK_QUOTATION)
+                 SB_type = SB_Close;
+             }
+
+           /* Sentence Boundary Rules */
+
+           /* We apply Rules SB1 and SB2 at the end of the function */
+
+#define IS_OTHER_TERM(SB_type)                                         \
+           /* not in (OLetter | Upper | Lower | ParaSep | SATerm) */   \
+             !(SB_type == SB_OLetter ||                                \
+               SB_type == SB_Upper || SB_type == SB_Lower ||           \
+               SB_type == SB_ParaSep ||                                \
+               SB_type == SB_ATerm || SB_type == SB_STerm ||           \
+               SB_type == SB_ATerm_Close_Sp ||                         \
+               SB_type == SB_STerm_Close_Sp)
+
+
+           if (wc == '\n' && prev_wc == '\r')
+             is_sentence_boundary = FALSE; /* Rule SB3 */
+           else if (prev_SB_type == SB_ParaSep && prev_SB_i + 1 == i)
+             {
+               /* The extra check for prev_SB_i is to correctly handle sequences like
+                * ParaSep ÷ Extend × Extend
+                * since we have not skipped ExtendFormat yet.
+                */
+
+               is_sentence_boundary = TRUE; /* Rule SB4 */
+             }
+           else if (SB_type == SB_ExtendFormat)
+             is_sentence_boundary = FALSE; /* Rule SB5? */
+           else if (prev_SB_type == SB_ATerm && SB_type == SB_Numeric)
+             is_sentence_boundary = FALSE; /* Rule SB6 */
+           else if ((prev_prev_SB_type == SB_Upper ||
+                     prev_prev_SB_type == SB_Lower) &&
+                    prev_SB_type == SB_ATerm &&
+                    SB_type == SB_Upper)
+             is_sentence_boundary = FALSE; /* Rule SB7 */
+           else if (prev_SB_type == SB_ATerm && SB_type == SB_Close)
+               SB_type = SB_ATerm;
+           else if (prev_SB_type == SB_STerm && SB_type == SB_Close)
+             SB_type = SB_STerm;
+           else if (prev_SB_type == SB_ATerm && SB_type == SB_Sp)
+             SB_type = SB_ATerm_Close_Sp;
+           else if (prev_SB_type == SB_STerm && SB_type == SB_Sp)
+             SB_type = SB_STerm_Close_Sp;
+           /* Rule SB8 */
+           else if ((prev_SB_type == SB_ATerm ||
+                     prev_SB_type == SB_ATerm_Close_Sp) &&
+                    SB_type == SB_Lower)
+             is_sentence_boundary = FALSE;
+           else if ((prev_prev_SB_type == SB_ATerm ||
+                     prev_prev_SB_type == SB_ATerm_Close_Sp) &&
+                    IS_OTHER_TERM(prev_SB_type) &&
+                    SB_type == SB_Lower)
+             attrs[prev_SB_i].is_sentence_boundary = FALSE;
+           else if ((prev_SB_type == SB_ATerm ||
+                     prev_SB_type == SB_ATerm_Close_Sp ||
+                     prev_SB_type == SB_STerm ||
+                     prev_SB_type == SB_STerm_Close_Sp) &&
+                    (SB_type == SB_SContinue ||
+                     SB_type == SB_ATerm || SB_type == SB_STerm))
+             is_sentence_boundary = FALSE; /* Rule SB8a */
+           else if ((prev_SB_type == SB_ATerm ||
+                     prev_SB_type == SB_STerm) &&
+                    (SB_type == SB_Close || SB_type == SB_Sp ||
+                     SB_type == SB_ParaSep))
+             is_sentence_boundary = FALSE; /* Rule SB9 */
+           else if ((prev_SB_type == SB_ATerm ||
+                     prev_SB_type == SB_ATerm_Close_Sp ||
+                     prev_SB_type == SB_STerm ||
+                     prev_SB_type == SB_STerm_Close_Sp) &&
+                    (SB_type == SB_Sp || SB_type == SB_ParaSep))
+             is_sentence_boundary = FALSE; /* Rule SB10 */
+           else if ((prev_SB_type == SB_ATerm ||
+                     prev_SB_type == SB_ATerm_Close_Sp ||
+                     prev_SB_type == SB_STerm ||
+                     prev_SB_type == SB_STerm_Close_Sp) &&
+                    SB_type != SB_ParaSep)
+             is_sentence_boundary = TRUE; /* Rule SB11 */
+           else
+             is_sentence_boundary = FALSE; /* Rule SB998 */
+
+           if (SB_type != SB_ExtendFormat &&
+               !((prev_prev_SB_type == SB_ATerm ||
+                  prev_prev_SB_type == SB_ATerm_Close_Sp) &&
+                 IS_OTHER_TERM(prev_SB_type) &&
+                 IS_OTHER_TERM(SB_type)))
+              {
+                prev_prev_SB_type = prev_SB_type;
+                prev_SB_type = SB_type;
+                prev_SB_i = i;
+              }
+
+#undef IS_OTHER_TERM
+
+         }
+
+       if (i == 0 || done)
+         is_sentence_boundary = TRUE; /* Rules SB1 and SB2 */
+
+       attrs[i].is_sentence_boundary = is_sentence_boundary;
+      }
 
       /* ---- Line breaking ---- */
 
@@ -1371,424 +1636,20 @@ pango_default_break (const gchar   *text,
 
       /* ---- Sentence breaks ---- */
 
-      /* The Unicode spec specifies sentence breakpoints, so that a piece of
-       * text would be partitioned into sentences, and all characters would
-       * be inside some sentence. This code implements that for is_sentence_boundary,
-       * but tries to keep leading/trailing whitespace out of sentences for
-       * the start/end flags
-       */
-
-      /* The Unicode spec seems to say that one trailing line/para
-       * separator can be tacked on to a sentence ending in ! or ?,
-       * but not a sentence ending in period; I think they're on crack
-       * so am allowing one to be tacked onto a sentence ending in period.
-       */
-
-#define MAYBE_START_NEW_SENTENCE                                \
-             switch ((int) type)                               \
-               {                                               \
-               case G_UNICODE_LINE_SEPARATOR:                  \
-               case G_UNICODE_PARAGRAPH_SEPARATOR:             \
-               case G_UNICODE_CONTROL:                         \
-               case G_UNICODE_FORMAT:                          \
-               case G_UNICODE_SPACE_SEPARATOR:                 \
-                 sentence_state = STATE_SENTENCE_OUTSIDE;      \
-                 break;                                        \
-                                                               \
-               default:                                        \
-                 sentence_state = STATE_SENTENCE_BODY;         \
-                 attrs[i].is_sentence_start = TRUE;            \
-                 break;                                        \
-               }
-
-      /* No sentence break at the start of the text */
-
-      /* default to not a sentence breakpoint */
-      attrs[i].is_sentence_boundary = FALSE;
+      /* default to not a sentence start/end */
       attrs[i].is_sentence_start = FALSE;
       attrs[i].is_sentence_end = FALSE;
 
-      /* FIXME the Unicode spec lumps control/format chars with
-       * line/para separators in descriptive text, but not in the
-       * character class specs, in table 5-6, so who knows whether you
-       * are actually supposed to break on control/format
-       * characters. Seems semi-broken to break on tabs...
-       */
-
-      /* Break after line/para separators except carriage return
-       * followed by newline
-       */
-      switch ((int) prev_type)
-       {
-       case G_UNICODE_LINE_SEPARATOR:
-       case G_UNICODE_PARAGRAPH_SEPARATOR:
-       case G_UNICODE_CONTROL:
-       case G_UNICODE_FORMAT:
-         if (wc == '\r')
-           {
-             if (next_wc != '\n')
-               attrs[i].is_sentence_boundary = TRUE;
-           }
-         else
-           attrs[i].is_sentence_boundary = TRUE;
-         break;
-
-       default:
-         break;
-       }
-
-      /* break before para/line separators except newline following
-       * carriage return
-       */
-      switch ((int) type)
-       {
-       case G_UNICODE_LINE_SEPARATOR:
-       case G_UNICODE_PARAGRAPH_SEPARATOR:
-       case G_UNICODE_CONTROL:
-       case G_UNICODE_FORMAT:
-         if (wc == '\n')
-           {
-             if (prev_wc != '\r')
-               attrs[i].is_sentence_boundary = TRUE;
-           }
-         else
-           attrs[i].is_sentence_boundary = TRUE;
-         break;
-
-       default:
-         break;
-       }
-
-      switch (sentence_state)
-       {
-       case STATE_SENTENCE_OUTSIDE:
-         /* Start sentence if we have non-whitespace/format/control */
-         switch ((int) type)
-           {
-           case G_UNICODE_LINE_SEPARATOR:
-           case G_UNICODE_PARAGRAPH_SEPARATOR:
-           case G_UNICODE_CONTROL:
-           case G_UNICODE_FORMAT:
-           case G_UNICODE_SPACE_SEPARATOR:
-             break;
-
-           default:
-             attrs[i].is_sentence_start = TRUE;
-             sentence_state = STATE_SENTENCE_BODY;
-             break;
-           }
-         break;
-
-       case STATE_SENTENCE_BODY:
-         /* If we already broke here due to separators, end the sentence. */
-         if (attrs[i].is_sentence_boundary)
-           {
-             attrs[i].is_sentence_end = TRUE;
-
-             MAYBE_START_NEW_SENTENCE;
-           }
-         else
-           {
-             if (wc == '.')
-               sentence_state = STATE_SENTENCE_DOT;
-             else if (wc == '?' || wc == '!')
-               sentence_state = STATE_SENTENCE_TERM;
-           }
-         break;
-
-       case STATE_SENTENCE_TERM:
-         /* End sentence on anything but close punctuation and some
-          * loosely-specified OTHER_PUNCTUATION such as period,
-          * comma, etc.; follow Unicode rules for breaks
-          */
-         switch ((int) type)
-           {
-           case G_UNICODE_OTHER_PUNCTUATION:
-           case G_UNICODE_CLOSE_PUNCTUATION:
-             if (type == G_UNICODE_CLOSE_PUNCTUATION ||
-                 wc == '.' ||
-                 wc == ',' ||
-                 wc == '?' ||
-                 wc == '!')
-               sentence_state = STATE_SENTENCE_POST_TERM_CLOSE;
-             else
-               {
-                 attrs[i].is_sentence_end = TRUE;
-                 attrs[i].is_sentence_boundary = TRUE;
-
-                 MAYBE_START_NEW_SENTENCE;
-               }
-             break;
-
-           case G_UNICODE_SPACE_SEPARATOR:
-             attrs[i].is_sentence_end = TRUE;
-             sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
-             break;
-
-           case G_UNICODE_LINE_SEPARATOR:
-           case G_UNICODE_PARAGRAPH_SEPARATOR:
-             attrs[i].is_sentence_end = TRUE;
-             sentence_state = STATE_SENTENCE_POST_TERM_SEP;
-             break;
-
-           default:
-             attrs[i].is_sentence_end = TRUE;
-             attrs[i].is_sentence_boundary = TRUE;
-
-             MAYBE_START_NEW_SENTENCE;
-
-             break;
-           }
-         break;
-
-       case STATE_SENTENCE_POST_TERM_CLOSE:
-         /* End sentence on anything besides more punctuation; follow
-          * rules for breaks
-          */
-         switch ((int) type)
-           {
-           case G_UNICODE_OTHER_PUNCTUATION:
-           case G_UNICODE_CLOSE_PUNCTUATION:
-             if (type == G_UNICODE_CLOSE_PUNCTUATION ||
-                 wc == '.' ||
-                 wc == ',' ||
-                 wc == '?' ||
-                 wc == '!')
-               /* continue in this state */
-               ;
-             else
-               {
-                 attrs[i].is_sentence_end = TRUE;
-                 attrs[i].is_sentence_boundary = TRUE;
-
-                 MAYBE_START_NEW_SENTENCE;
-               }
-             break;
-
-           case G_UNICODE_SPACE_SEPARATOR:
-             attrs[i].is_sentence_end = TRUE;
-             sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
-             break;
-
-           case G_UNICODE_LINE_SEPARATOR:
-           case G_UNICODE_PARAGRAPH_SEPARATOR:
-             attrs[i].is_sentence_end = TRUE;
-             /* undo the unconditional break-at-all-line/para-separators
-              * from above; I'm not sure this is what the Unicode spec
-              * intends, but it seems right - we get to include
-              * a single line/para separator in the sentence according
-              * to their rules
-              */
-             attrs[i].is_sentence_boundary = FALSE;
-             sentence_state = STATE_SENTENCE_POST_TERM_SEP;
-             break;
-
-           default:
-             attrs[i].is_sentence_end = TRUE;
-             attrs[i].is_sentence_boundary = TRUE;
-
-             MAYBE_START_NEW_SENTENCE;
-
-             break;
-           }
-         break;
-
-       case STATE_SENTENCE_POST_TERM_SPACE:
-
-         /* Sentence is definitely already ended; to enter this state
-          * we had to see a space, which ends the sentence.
-          */
-
-         switch ((int) type)
-           {
-           case G_UNICODE_SPACE_SEPARATOR:
-             /* continue in this state */
-             break;
-
-           case G_UNICODE_LINE_SEPARATOR:
-           case G_UNICODE_PARAGRAPH_SEPARATOR:
-             /* undo the unconditional break-at-all-line/para-separators
-              * from above; I'm not sure this is what the Unicode spec
-              * intends, but it seems right
-              */
-             attrs[i].is_sentence_boundary = FALSE;
-             sentence_state = STATE_SENTENCE_POST_TERM_SEP;
-             break;
-
-           default:
-             attrs[i].is_sentence_boundary = TRUE;
-
-             MAYBE_START_NEW_SENTENCE;
-
-             break;
-           }
-         break;
-
-       case STATE_SENTENCE_POST_TERM_SEP:
-         /* Break is forced at this point, unless we're a newline
-          * after a CR, then we will break after the newline on the
-          * next iteration. Only a single Sep can be in the
-          * sentence.
-          */
-         if (!(prev_wc == '\r' && wc == '\n'))
-           attrs[i].is_sentence_boundary = TRUE;
-
-         MAYBE_START_NEW_SENTENCE;
-
-         break;
-
-       case STATE_SENTENCE_DOT:
-         switch ((int) type)
-           {
-           case G_UNICODE_CLOSE_PUNCTUATION:
-             sentence_state = STATE_SENTENCE_POST_DOT_CLOSE;
-             break;
-
-           case G_UNICODE_SPACE_SEPARATOR:
-             possible_sentence_end = i;
-             sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
-             break;
-
-           default:
-             /* If we broke on a control/format char, end the
-              * sentence; else this was not a sentence end, since
-              * we didn't enter the POST_DOT_SPACE state.
-              */
-             if (attrs[i].is_sentence_boundary)
-               {
-                 attrs[i].is_sentence_end = TRUE;
-
-                 MAYBE_START_NEW_SENTENCE;
-               }
-             else
-               sentence_state = STATE_SENTENCE_BODY;
-             break;
-           }
-         break;
-
-       case STATE_SENTENCE_POST_DOT_CLOSE:
-         switch ((int) type)
-           {
-           case G_UNICODE_SPACE_SEPARATOR:
-             possible_sentence_end = i;
-             sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
-             break;
-
-           default:
-             /* If we broke on a control/format char, end the
-              * sentence; else this was not a sentence end, since
-              * we didn't enter the POST_DOT_SPACE state.
-              */
-             if (attrs[i].is_sentence_boundary)
-               {
-                 attrs[i].is_sentence_end = TRUE;
-
-                 MAYBE_START_NEW_SENTENCE;
-               }
-             else
-               sentence_state = STATE_SENTENCE_BODY;
-             break;
-           }
-         break;
-
-       case STATE_SENTENCE_POST_DOT_SPACE:
-
-         possible_sentence_boundary = i;
-
-         switch ((int) type)
-           {
-           case G_UNICODE_SPACE_SEPARATOR:
-             /* remain in current state */
-             break;
-
-           case G_UNICODE_OPEN_PUNCTUATION:
-             sentence_state = STATE_SENTENCE_POST_DOT_OPEN;
-             break;
-
-           case G_UNICODE_LOWERCASE_LETTER:
-             /* wasn't a sentence-ending period; so re-enter the sentence
-              * body
-              */
-             sentence_state = STATE_SENTENCE_BODY;
-             break;
-
-           default:
-             /* End the sentence, break, maybe start a new one */
-
-             g_assert (possible_sentence_end >= 0);
-             g_assert (possible_sentence_boundary >= 0);
-
-             attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
-             attrs[possible_sentence_end].is_sentence_end = TRUE;
-
-             possible_sentence_end = -1;
-             possible_sentence_boundary = -1;
-
-             MAYBE_START_NEW_SENTENCE;
-
-             break;
-           }
-         break;
-
-       case STATE_SENTENCE_POST_DOT_OPEN:
-         switch ((int) type)
-           {
-           case G_UNICODE_OPEN_PUNCTUATION:
-             /* continue in current state */
-             break;
-
-           case G_UNICODE_LOWERCASE_LETTER:
-             /* wasn't a sentence-ending period; so re-enter the sentence
-              * body
-              */
-             sentence_state = STATE_SENTENCE_BODY;
-             break;
-
-           default:
-             /* End the sentence, break, maybe start a new one */
-
-             g_assert (possible_sentence_end >= 0);
-             g_assert (possible_sentence_boundary >= 0);
-
-             attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
-             attrs[possible_sentence_end].is_sentence_end = TRUE;
-
-             possible_sentence_end = -1;
-             possible_sentence_boundary = -1;
-
-             MAYBE_START_NEW_SENTENCE;
-
-             break;
-           }
-         break;
-
-       case STATE_SENTENCE_POST_DOT_SEP:
-         /* Break is forced at this point, unless we're a newline
-          * after a CR, then we will break after the newline on the
-          * next iteration. Only a single Sep can be in the
-          * sentence.
-          */
-         if (!(prev_wc == '\r' && wc == '\n'))
-           attrs[i].is_sentence_boundary = TRUE;
-
-         g_assert (possible_sentence_end >= 0);
-         g_assert (possible_sentence_boundary >= 0);
-
-         attrs[possible_sentence_end].is_sentence_end = TRUE;
-
-         possible_sentence_end = -1;
-         possible_sentence_boundary = -1;
-
-         MAYBE_START_NEW_SENTENCE;
-
-         break;
+      if (last_sentence_start == -1 && !is_sentence_boundary) {
+       last_sentence_start = i - 1;
+       attrs[i - 1].is_sentence_start = TRUE;
+      }
 
-       default:
-         g_assert_not_reached ();
-         break;
-       }
+      if (last_sentence_start != -1 && is_sentence_boundary) {
+       last_sentence_start = -1;
+       attrs[i].is_sentence_end = TRUE;
+      }
 
-      prev_type = type;
       prev_wc = wc;
 
       /* wc might not be a valid Unicode base character, but really all we
@@ -1798,6 +1659,7 @@ pango_default_break (const gchar   *text,
          type != G_UNICODE_NON_SPACING_MARK)
        base_character = wc;
     }
+
   i--;
 
   attrs[i].is_cursor_position = TRUE;  /* Rule GB2 */
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]