[pango] Update pango_default_break function for Sentence Boundary
- From: Matthias Clasen <matthiasc src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [pango] Update pango_default_break function for Sentence Boundary
- Date: Mon, 31 Jul 2017 17:53:10 +0000 (UTC)
commit 284d357e3d6e29c1437ca18bab347c1af8330908
Author: Peng Wu <alexepico gmail com>
Date: Wed Jul 5 15:05:16 2017 +0800
Update pango_default_break function for Sentence Boundary
Rewrite the Sentence Boundary code,
following the same code style as the Grapheme Boundary and Word Boundary code.
https://bugzilla.gnome.org/show_bug.cgi?id=782813
pango/break.c | 748 +++++++++++++++++++++++----------------------------------
1 files changed, 305 insertions(+), 443 deletions(-)
---
diff --git a/pango/break.c b/pango/break.c
index 5b2128d..1c36d49 100644
--- a/pango/break.c
+++ b/pango/break.c
@@ -432,27 +432,6 @@ static const CharJamoProps HangulJamoProps[] = {
#define HANGUL(wc) ((wc) >= 0xAC00 && (wc) <= 0xD7A3)
#define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA(wc) && !HANGUL(wc))
-/* p. 132-133 of Unicode spec table 5-6 will help understand this */
-typedef enum
-{
- STATE_SENTENCE_OUTSIDE,
- STATE_SENTENCE_BODY,
- STATE_SENTENCE_TERM,
- STATE_SENTENCE_POST_TERM_CLOSE,
- STATE_SENTENCE_POST_TERM_SPACE,
- STATE_SENTENCE_POST_TERM_SEP,
- STATE_SENTENCE_DOT,
- STATE_SENTENCE_POST_DOT_CLOSE,
- STATE_SENTENCE_POST_DOT_SPACE,
- STATE_SENTENCE_POST_DOT_OPEN,
- /* never include line/para separators in a sentence for now */
- /* This isn't in the spec, but I can't figure out why they'd include
- * one line/para separator in lines ending with Term but not with
- * period-terminated lines, so I'm doing it for the dot lines also
- */
- STATE_SENTENCE_POST_DOT_SEP
-} SentenceState;
-
/* Previously "123foo" was two words. But in UAX 29 of Unicode,
* we now don't break words between consecutive letters and numbers
*/
@@ -508,7 +487,6 @@ pango_default_break (const gchar *text,
JamoType prev_jamo;
GUnicodeBreakType next_break_type;
- GUnicodeType prev_type;
GUnicodeBreakType prev_break_type; /* skips spaces */
gboolean prev_was_break_space;
@@ -553,17 +531,34 @@ pango_default_break (const gchar *text,
WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
gint prev_WB_i = -1;
+ /* See Sentence_Break Property Values table of UAX#29 */
+ typedef enum
+ {
+ SB_Other,
+ SB_ExtendFormat,
+ SB_ParaSep,
+ SB_Sp,
+ SB_Lower,
+ SB_Upper,
+ SB_OLetter,
+ SB_Numeric,
+ SB_ATerm,
+ SB_SContinue,
+ SB_STerm,
+ SB_Close,
+ /* Rules SB8 and SB8a */
+ SB_ATerm_Close_Sp,
+ SB_STerm_Close_Sp,
+ } SentenceBreakType;
+ SentenceBreakType prev_prev_SB_type = SB_Other, prev_SB_type = SB_Other;
+ gint prev_SB_i = -1;
+
WordType current_word_type = WordNone;
gunichar last_word_letter = 0;
gunichar base_character = 0;
- SentenceState sentence_state = STATE_SENTENCE_OUTSIDE;
- /* Tracks what will be the end of the sentence if a period is
- * determined to actually be a sentence-ending period.
- */
- gint possible_sentence_end = -1;
- /* possible sentence break before Open* after a period-ended sentence */
- gint possible_sentence_boundary = -1;
+ gint last_sentence_start = -1;
+
gboolean almost_done = FALSE;
gboolean done = FALSE;
@@ -572,7 +567,6 @@ pango_default_break (const gchar *text,
next = text;
- prev_type = G_UNICODE_PARAGRAPH_SEPARATOR;
prev_break_type = G_UNICODE_BREAK_UNKNOWN;
prev_was_break_space = FALSE;
prev_wc = 0;
@@ -601,6 +595,7 @@ pango_default_break (const gchar *text,
/* UAX#29 boundaries */
gboolean is_grapheme_boundary;
gboolean is_word_boundary;
+ gboolean is_sentence_boundary;
wc = next_wc;
@@ -1078,6 +1073,276 @@ pango_default_break (const gchar *text,
attrs[i].is_word_boundary = is_word_boundary;
}
+ /* ---- UAX#29 Sentence Boundaries ---- */
+ {
+ is_sentence_boundary = FALSE;
+ if (is_word_boundary ||
+ wc == '\r' || wc == '\n') /* Rules SB3 and SB5 */
+ {
+ SentenceBreakType SB_type;
+
+ /* Find the SentenceBreakType of wc */
+ SB_type = SB_Other;
+
+ if (break_type == G_UNICODE_BREAK_NUMERIC)
+ SB_type = SB_Numeric; /* Numeric */
+
+ if (SB_type == SB_Other)
+ switch ((int) type)
+ {
+ case G_UNICODE_CONTROL:
+ if (wc == '\r' || wc == '\n')
+ SB_type = SB_ParaSep;
+ else if (wc == 0x0009 || wc == 0x000B || wc == 0x000C)
+ SB_type = SB_Sp;
+ else if (wc == 0x0085)
+ SB_type = SB_ParaSep;
+ break;
+
+ case G_UNICODE_SPACE_SEPARATOR:
+ if (wc == 0x0020 || wc == 0x00A0 || wc == 0x1680 ||
+ (wc >= 0x2000 && wc <= 0x200A) ||
+ wc == 0x202F || wc == 0x205F || wc == 0x3000)
+ SB_type = SB_Sp;
+ break;
+
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ SB_type = SB_ParaSep;
+ break;
+
+ case G_UNICODE_FORMAT:
+ case G_UNICODE_SPACING_MARK:
+ case G_UNICODE_ENCLOSING_MARK:
+ case G_UNICODE_NON_SPACING_MARK:
+ SB_type = SB_ExtendFormat; /* Extend, Format */
+ break;
+
+ case G_UNICODE_MODIFIER_LETTER:
+ if (wc >= 0xFF9E && wc <= 0xFF9F)
+ SB_type = SB_ExtendFormat; /* Other_Grapheme_Extend */
+ break;
+
+ case G_UNICODE_TITLECASE_LETTER:
+ SB_type = SB_Upper;
+ break;
+
+ case G_UNICODE_DASH_PUNCTUATION:
+ if (wc == 0x002D ||
+ (wc >= 0x2013 && wc <= 0x2014) ||
+ (wc >= 0xFE31 && wc <= 0xFE32) ||
+ wc == 0xFE58 ||
+ wc == 0xFE63 ||
+ wc == 0xFF0D)
+ SB_type = SB_SContinue;
+ break;
+
+ case G_UNICODE_OTHER_PUNCTUATION:
+ if (wc == 0x05F3)
+ SB_type = SB_OLetter;
+ else if (wc == 0x002E || wc == 0x2024 ||
+ wc == 0xFE52 || wc == 0xFF0E)
+ SB_type = SB_ATerm;
+
+ if (wc == 0x002C ||
+ wc == 0x003A ||
+ wc == 0x055D ||
+ (wc >= 0x060C && wc <= 0x060D) ||
+ wc == 0x07F8 ||
+ wc == 0x1802 ||
+ wc == 0x1808 ||
+ wc == 0x3001 ||
+ (wc >= 0xFE10 && wc <= 0xFE11) ||
+ wc == 0xFE13 ||
+ (wc >= 0xFE50 && wc <= 0xFE51) ||
+ wc == 0xFE55 ||
+ wc == 0xFF0C ||
+ wc == 0xFF1A ||
+ wc == 0xFF64)
+ SB_type = SB_SContinue;
+
+ if (wc == 0x0021 ||
+ wc == 0x003F ||
+ wc == 0x0589 ||
+ wc == 0x061F ||
+ wc == 0x06D4 ||
+ (wc >= 0x0700 && wc <= 0x0702) ||
+ wc == 0x07F9 ||
+ (wc >= 0x0964 && wc <= 0x0965) ||
+ (wc >= 0x104A && wc <= 0x104B) ||
+ wc == 0x1362 ||
+ (wc >= 0x1367 && wc <= 0x1368) ||
+ wc == 0x166E ||
+ (wc >= 0x1735 && wc <= 0x1736) ||
+ wc == 0x1803 ||
+ wc == 0x1809 ||
+ (wc >= 0x1944 && wc <= 0x1945) ||
+ (wc >= 0x1AA8 && wc <= 0x1AAB) ||
+ (wc >= 0x1B5A && wc <= 0x1B5B) ||
+ (wc >= 0x1B5E && wc <= 0x1B5F) ||
+ (wc >= 0x1C3B && wc <= 0x1C3C) ||
+ (wc >= 0x1C7E && wc <= 0x1C7F) ||
+ (wc >= 0x203C && wc <= 0x203D) ||
+ (wc >= 0x2047 && wc <= 0x2049) ||
+ wc == 0x2E2E ||
+ wc == 0x2E3C ||
+ wc == 0x3002 ||
+ wc == 0xA4FF ||
+ (wc >= 0xA60E && wc <= 0xA60F) ||
+ wc == 0xA6F3 ||
+ wc == 0xA6F7 ||
+ (wc >= 0xA876 && wc <= 0xA877) ||
+ (wc >= 0xA8CE && wc <= 0xA8CF) ||
+ wc == 0xA92F ||
+ (wc >= 0xA9C8 && wc <= 0xA9C9) ||
+ (wc >= 0xAA5D && wc <= 0xAA5F) ||
+ (wc >= 0xAAF0 && wc <= 0xAAF1) ||
+ wc == 0xABEB ||
+ (wc >= 0xFE56 && wc <= 0xFE57) ||
+ wc == 0xFF01 ||
+ wc == 0xFF1F ||
+ wc == 0xFF61 ||
+ (wc >= 0x10A56 && wc <= 0x10A57) ||
+ (wc >= 0x11047 && wc <= 0x11048) ||
+ (wc >= 0x110BE && wc <= 0x110C1) ||
+ (wc >= 0x11141 && wc <= 0x11143) ||
+ (wc >= 0x111C5 && wc <= 0x111C6) ||
+ wc == 0x111CD ||
+ (wc >= 0x111DE && wc <= 0x111DF) ||
+ (wc >= 0x11238 && wc <= 0x11239) ||
+ (wc >= 0x1123B && wc <= 0x1123C) ||
+ wc == 0x112A9 ||
+ (wc >= 0x1144B && wc <= 0x1144C) ||
+ (wc >= 0x115C2 && wc <= 0x115C3) ||
+ (wc >= 0x115C9 && wc <= 0x115D7) ||
+ (wc >= 0x11641 && wc <= 0x11642) ||
+ (wc >= 0x1173C && wc <= 0x1173E) ||
+ (wc >= 0x11C41 && wc <= 0x11C42) ||
+ (wc >= 0x16A6E && wc <= 0x16A6F) ||
+ wc == 0x16AF5 ||
+ (wc >= 0x16B37 && wc <= 0x16B38) ||
+ wc == 0x16B44 ||
+ wc == 0x1BC9F ||
+ wc == 0x1DA88)
+ SB_type = SB_STerm;
+
+ break;
+ }
+
+ if (SB_type == SB_Other)
+ {
+ if (g_unichar_islower(wc))
+ SB_type = SB_Lower;
+ else if (g_unichar_isupper(wc))
+ SB_type = SB_Upper;
+ else if (g_unichar_isalpha(wc))
+ SB_type = SB_OLetter;
+
+ if (type == G_UNICODE_OPEN_PUNCTUATION ||
+ type == G_UNICODE_CLOSE_PUNCTUATION ||
+ break_type == G_UNICODE_BREAK_QUOTATION)
+ SB_type = SB_Close;
+ }
+
+ /* Sentence Boundary Rules */
+
+ /* We apply Rules SB1 and SB2 at the end of the function */
+
+#define IS_OTHER_TERM(SB_type) \
+ /* not in (OLetter | Upper | Lower | ParaSep | SATerm) */ \
+ !(SB_type == SB_OLetter || \
+ SB_type == SB_Upper || SB_type == SB_Lower || \
+ SB_type == SB_ParaSep || \
+ SB_type == SB_ATerm || SB_type == SB_STerm || \
+ SB_type == SB_ATerm_Close_Sp || \
+ SB_type == SB_STerm_Close_Sp)
+
+
+ if (wc == '\n' && prev_wc == '\r')
+ is_sentence_boundary = FALSE; /* Rule SB3 */
+ else if (prev_SB_type == SB_ParaSep && prev_SB_i + 1 == i)
+ {
+ /* The extra check for prev_SB_i is to correctly handle sequences like
+ * ParaSep ÷ Extend × Extend
+ * since we have not skipped ExtendFormat yet.
+ */
+
+ is_sentence_boundary = TRUE; /* Rule SB4 */
+ }
+ else if (SB_type == SB_ExtendFormat)
+ is_sentence_boundary = FALSE; /* Rule SB5? */
+ else if (prev_SB_type == SB_ATerm && SB_type == SB_Numeric)
+ is_sentence_boundary = FALSE; /* Rule SB6 */
+ else if ((prev_prev_SB_type == SB_Upper ||
+ prev_prev_SB_type == SB_Lower) &&
+ prev_SB_type == SB_ATerm &&
+ SB_type == SB_Upper)
+ is_sentence_boundary = FALSE; /* Rule SB7 */
+ else if (prev_SB_type == SB_ATerm && SB_type == SB_Close)
+ SB_type = SB_ATerm;
+ else if (prev_SB_type == SB_STerm && SB_type == SB_Close)
+ SB_type = SB_STerm;
+ else if (prev_SB_type == SB_ATerm && SB_type == SB_Sp)
+ SB_type = SB_ATerm_Close_Sp;
+ else if (prev_SB_type == SB_STerm && SB_type == SB_Sp)
+ SB_type = SB_STerm_Close_Sp;
+ /* Rule SB8 */
+ else if ((prev_SB_type == SB_ATerm ||
+ prev_SB_type == SB_ATerm_Close_Sp) &&
+ SB_type == SB_Lower)
+ is_sentence_boundary = FALSE;
+ else if ((prev_prev_SB_type == SB_ATerm ||
+ prev_prev_SB_type == SB_ATerm_Close_Sp) &&
+ IS_OTHER_TERM(prev_SB_type) &&
+ SB_type == SB_Lower)
+ attrs[prev_SB_i].is_sentence_boundary = FALSE;
+ else if ((prev_SB_type == SB_ATerm ||
+ prev_SB_type == SB_ATerm_Close_Sp ||
+ prev_SB_type == SB_STerm ||
+ prev_SB_type == SB_STerm_Close_Sp) &&
+ (SB_type == SB_SContinue ||
+ SB_type == SB_ATerm || SB_type == SB_STerm))
+ is_sentence_boundary = FALSE; /* Rule SB8a */
+ else if ((prev_SB_type == SB_ATerm ||
+ prev_SB_type == SB_STerm) &&
+ (SB_type == SB_Close || SB_type == SB_Sp ||
+ SB_type == SB_ParaSep))
+ is_sentence_boundary = FALSE; /* Rule SB9 */
+ else if ((prev_SB_type == SB_ATerm ||
+ prev_SB_type == SB_ATerm_Close_Sp ||
+ prev_SB_type == SB_STerm ||
+ prev_SB_type == SB_STerm_Close_Sp) &&
+ (SB_type == SB_Sp || SB_type == SB_ParaSep))
+ is_sentence_boundary = FALSE; /* Rule SB10 */
+ else if ((prev_SB_type == SB_ATerm ||
+ prev_SB_type == SB_ATerm_Close_Sp ||
+ prev_SB_type == SB_STerm ||
+ prev_SB_type == SB_STerm_Close_Sp) &&
+ SB_type != SB_ParaSep)
+ is_sentence_boundary = TRUE; /* Rule SB11 */
+ else
+ is_sentence_boundary = FALSE; /* Rule SB998 */
+
+ if (SB_type != SB_ExtendFormat &&
+ !((prev_prev_SB_type == SB_ATerm ||
+ prev_prev_SB_type == SB_ATerm_Close_Sp) &&
+ IS_OTHER_TERM(prev_SB_type) &&
+ IS_OTHER_TERM(SB_type)))
+ {
+ prev_prev_SB_type = prev_SB_type;
+ prev_SB_type = SB_type;
+ prev_SB_i = i;
+ }
+
+#undef IS_OTHER_TERM
+
+ }
+
+ if (i == 0 || done)
+ is_sentence_boundary = TRUE; /* Rules SB1 and SB2 */
+
+ attrs[i].is_sentence_boundary = is_sentence_boundary;
+ }
/* ---- Line breaking ---- */
@@ -1371,424 +1636,20 @@ pango_default_break (const gchar *text,
/* ---- Sentence breaks ---- */
- /* The Unicode spec specifies sentence breakpoints, so that a piece of
- * text would be partitioned into sentences, and all characters would
- * be inside some sentence. This code implements that for is_sentence_boundary,
- * but tries to keep leading/trailing whitespace out of sentences for
- * the start/end flags
- */
-
- /* The Unicode spec seems to say that one trailing line/para
- * separator can be tacked on to a sentence ending in ! or ?,
- * but not a sentence ending in period; I think they're on crack
- * so am allowing one to be tacked onto a sentence ending in period.
- */
-
-#define MAYBE_START_NEW_SENTENCE \
- switch ((int) type) \
- { \
- case G_UNICODE_LINE_SEPARATOR: \
- case G_UNICODE_PARAGRAPH_SEPARATOR: \
- case G_UNICODE_CONTROL: \
- case G_UNICODE_FORMAT: \
- case G_UNICODE_SPACE_SEPARATOR: \
- sentence_state = STATE_SENTENCE_OUTSIDE; \
- break; \
- \
- default: \
- sentence_state = STATE_SENTENCE_BODY; \
- attrs[i].is_sentence_start = TRUE; \
- break; \
- }
-
- /* No sentence break at the start of the text */
-
- /* default to not a sentence breakpoint */
- attrs[i].is_sentence_boundary = FALSE;
+ /* default to not a sentence start/end */
attrs[i].is_sentence_start = FALSE;
attrs[i].is_sentence_end = FALSE;
- /* FIXME the Unicode spec lumps control/format chars with
- * line/para separators in descriptive text, but not in the
- * character class specs, in table 5-6, so who knows whether you
- * are actually supposed to break on control/format
- * characters. Seems semi-broken to break on tabs...
- */
-
- /* Break after line/para separators except carriage return
- * followed by newline
- */
- switch ((int) prev_type)
- {
- case G_UNICODE_LINE_SEPARATOR:
- case G_UNICODE_PARAGRAPH_SEPARATOR:
- case G_UNICODE_CONTROL:
- case G_UNICODE_FORMAT:
- if (wc == '\r')
- {
- if (next_wc != '\n')
- attrs[i].is_sentence_boundary = TRUE;
- }
- else
- attrs[i].is_sentence_boundary = TRUE;
- break;
-
- default:
- break;
- }
-
- /* break before para/line separators except newline following
- * carriage return
- */
- switch ((int) type)
- {
- case G_UNICODE_LINE_SEPARATOR:
- case G_UNICODE_PARAGRAPH_SEPARATOR:
- case G_UNICODE_CONTROL:
- case G_UNICODE_FORMAT:
- if (wc == '\n')
- {
- if (prev_wc != '\r')
- attrs[i].is_sentence_boundary = TRUE;
- }
- else
- attrs[i].is_sentence_boundary = TRUE;
- break;
-
- default:
- break;
- }
-
- switch (sentence_state)
- {
- case STATE_SENTENCE_OUTSIDE:
- /* Start sentence if we have non-whitespace/format/control */
- switch ((int) type)
- {
- case G_UNICODE_LINE_SEPARATOR:
- case G_UNICODE_PARAGRAPH_SEPARATOR:
- case G_UNICODE_CONTROL:
- case G_UNICODE_FORMAT:
- case G_UNICODE_SPACE_SEPARATOR:
- break;
-
- default:
- attrs[i].is_sentence_start = TRUE;
- sentence_state = STATE_SENTENCE_BODY;
- break;
- }
- break;
-
- case STATE_SENTENCE_BODY:
- /* If we already broke here due to separators, end the sentence. */
- if (attrs[i].is_sentence_boundary)
- {
- attrs[i].is_sentence_end = TRUE;
-
- MAYBE_START_NEW_SENTENCE;
- }
- else
- {
- if (wc == '.')
- sentence_state = STATE_SENTENCE_DOT;
- else if (wc == '?' || wc == '!')
- sentence_state = STATE_SENTENCE_TERM;
- }
- break;
-
- case STATE_SENTENCE_TERM:
- /* End sentence on anything but close punctuation and some
- * loosely-specified OTHER_PUNCTUATION such as period,
- * comma, etc.; follow Unicode rules for breaks
- */
- switch ((int) type)
- {
- case G_UNICODE_OTHER_PUNCTUATION:
- case G_UNICODE_CLOSE_PUNCTUATION:
- if (type == G_UNICODE_CLOSE_PUNCTUATION ||
- wc == '.' ||
- wc == ',' ||
- wc == '?' ||
- wc == '!')
- sentence_state = STATE_SENTENCE_POST_TERM_CLOSE;
- else
- {
- attrs[i].is_sentence_end = TRUE;
- attrs[i].is_sentence_boundary = TRUE;
-
- MAYBE_START_NEW_SENTENCE;
- }
- break;
-
- case G_UNICODE_SPACE_SEPARATOR:
- attrs[i].is_sentence_end = TRUE;
- sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
- break;
-
- case G_UNICODE_LINE_SEPARATOR:
- case G_UNICODE_PARAGRAPH_SEPARATOR:
- attrs[i].is_sentence_end = TRUE;
- sentence_state = STATE_SENTENCE_POST_TERM_SEP;
- break;
-
- default:
- attrs[i].is_sentence_end = TRUE;
- attrs[i].is_sentence_boundary = TRUE;
-
- MAYBE_START_NEW_SENTENCE;
-
- break;
- }
- break;
-
- case STATE_SENTENCE_POST_TERM_CLOSE:
- /* End sentence on anything besides more punctuation; follow
- * rules for breaks
- */
- switch ((int) type)
- {
- case G_UNICODE_OTHER_PUNCTUATION:
- case G_UNICODE_CLOSE_PUNCTUATION:
- if (type == G_UNICODE_CLOSE_PUNCTUATION ||
- wc == '.' ||
- wc == ',' ||
- wc == '?' ||
- wc == '!')
- /* continue in this state */
- ;
- else
- {
- attrs[i].is_sentence_end = TRUE;
- attrs[i].is_sentence_boundary = TRUE;
-
- MAYBE_START_NEW_SENTENCE;
- }
- break;
-
- case G_UNICODE_SPACE_SEPARATOR:
- attrs[i].is_sentence_end = TRUE;
- sentence_state = STATE_SENTENCE_POST_TERM_SPACE;
- break;
-
- case G_UNICODE_LINE_SEPARATOR:
- case G_UNICODE_PARAGRAPH_SEPARATOR:
- attrs[i].is_sentence_end = TRUE;
- /* undo the unconditional break-at-all-line/para-separators
- * from above; I'm not sure this is what the Unicode spec
- * intends, but it seems right - we get to include
- * a single line/para separator in the sentence according
- * to their rules
- */
- attrs[i].is_sentence_boundary = FALSE;
- sentence_state = STATE_SENTENCE_POST_TERM_SEP;
- break;
-
- default:
- attrs[i].is_sentence_end = TRUE;
- attrs[i].is_sentence_boundary = TRUE;
-
- MAYBE_START_NEW_SENTENCE;
-
- break;
- }
- break;
-
- case STATE_SENTENCE_POST_TERM_SPACE:
-
- /* Sentence is definitely already ended; to enter this state
- * we had to see a space, which ends the sentence.
- */
-
- switch ((int) type)
- {
- case G_UNICODE_SPACE_SEPARATOR:
- /* continue in this state */
- break;
-
- case G_UNICODE_LINE_SEPARATOR:
- case G_UNICODE_PARAGRAPH_SEPARATOR:
- /* undo the unconditional break-at-all-line/para-separators
- * from above; I'm not sure this is what the Unicode spec
- * intends, but it seems right
- */
- attrs[i].is_sentence_boundary = FALSE;
- sentence_state = STATE_SENTENCE_POST_TERM_SEP;
- break;
-
- default:
- attrs[i].is_sentence_boundary = TRUE;
-
- MAYBE_START_NEW_SENTENCE;
-
- break;
- }
- break;
-
- case STATE_SENTENCE_POST_TERM_SEP:
- /* Break is forced at this point, unless we're a newline
- * after a CR, then we will break after the newline on the
- * next iteration. Only a single Sep can be in the
- * sentence.
- */
- if (!(prev_wc == '\r' && wc == '\n'))
- attrs[i].is_sentence_boundary = TRUE;
-
- MAYBE_START_NEW_SENTENCE;
-
- break;
-
- case STATE_SENTENCE_DOT:
- switch ((int) type)
- {
- case G_UNICODE_CLOSE_PUNCTUATION:
- sentence_state = STATE_SENTENCE_POST_DOT_CLOSE;
- break;
-
- case G_UNICODE_SPACE_SEPARATOR:
- possible_sentence_end = i;
- sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
- break;
-
- default:
- /* If we broke on a control/format char, end the
- * sentence; else this was not a sentence end, since
- * we didn't enter the POST_DOT_SPACE state.
- */
- if (attrs[i].is_sentence_boundary)
- {
- attrs[i].is_sentence_end = TRUE;
-
- MAYBE_START_NEW_SENTENCE;
- }
- else
- sentence_state = STATE_SENTENCE_BODY;
- break;
- }
- break;
-
- case STATE_SENTENCE_POST_DOT_CLOSE:
- switch ((int) type)
- {
- case G_UNICODE_SPACE_SEPARATOR:
- possible_sentence_end = i;
- sentence_state = STATE_SENTENCE_POST_DOT_SPACE;
- break;
-
- default:
- /* If we broke on a control/format char, end the
- * sentence; else this was not a sentence end, since
- * we didn't enter the POST_DOT_SPACE state.
- */
- if (attrs[i].is_sentence_boundary)
- {
- attrs[i].is_sentence_end = TRUE;
-
- MAYBE_START_NEW_SENTENCE;
- }
- else
- sentence_state = STATE_SENTENCE_BODY;
- break;
- }
- break;
-
- case STATE_SENTENCE_POST_DOT_SPACE:
-
- possible_sentence_boundary = i;
-
- switch ((int) type)
- {
- case G_UNICODE_SPACE_SEPARATOR:
- /* remain in current state */
- break;
-
- case G_UNICODE_OPEN_PUNCTUATION:
- sentence_state = STATE_SENTENCE_POST_DOT_OPEN;
- break;
-
- case G_UNICODE_LOWERCASE_LETTER:
- /* wasn't a sentence-ending period; so re-enter the sentence
- * body
- */
- sentence_state = STATE_SENTENCE_BODY;
- break;
-
- default:
- /* End the sentence, break, maybe start a new one */
-
- g_assert (possible_sentence_end >= 0);
- g_assert (possible_sentence_boundary >= 0);
-
- attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
- attrs[possible_sentence_end].is_sentence_end = TRUE;
-
- possible_sentence_end = -1;
- possible_sentence_boundary = -1;
-
- MAYBE_START_NEW_SENTENCE;
-
- break;
- }
- break;
-
- case STATE_SENTENCE_POST_DOT_OPEN:
- switch ((int) type)
- {
- case G_UNICODE_OPEN_PUNCTUATION:
- /* continue in current state */
- break;
-
- case G_UNICODE_LOWERCASE_LETTER:
- /* wasn't a sentence-ending period; so re-enter the sentence
- * body
- */
- sentence_state = STATE_SENTENCE_BODY;
- break;
-
- default:
- /* End the sentence, break, maybe start a new one */
-
- g_assert (possible_sentence_end >= 0);
- g_assert (possible_sentence_boundary >= 0);
-
- attrs[possible_sentence_boundary].is_sentence_boundary = TRUE;
- attrs[possible_sentence_end].is_sentence_end = TRUE;
-
- possible_sentence_end = -1;
- possible_sentence_boundary = -1;
-
- MAYBE_START_NEW_SENTENCE;
-
- break;
- }
- break;
-
- case STATE_SENTENCE_POST_DOT_SEP:
- /* Break is forced at this point, unless we're a newline
- * after a CR, then we will break after the newline on the
- * next iteration. Only a single Sep can be in the
- * sentence.
- */
- if (!(prev_wc == '\r' && wc == '\n'))
- attrs[i].is_sentence_boundary = TRUE;
-
- g_assert (possible_sentence_end >= 0);
- g_assert (possible_sentence_boundary >= 0);
-
- attrs[possible_sentence_end].is_sentence_end = TRUE;
-
- possible_sentence_end = -1;
- possible_sentence_boundary = -1;
-
- MAYBE_START_NEW_SENTENCE;
-
- break;
+ if (last_sentence_start == -1 && !is_sentence_boundary) {
+ last_sentence_start = i - 1;
+ attrs[i - 1].is_sentence_start = TRUE;
+ }
- default:
- g_assert_not_reached ();
- break;
- }
+ if (last_sentence_start != -1 && is_sentence_boundary) {
+ last_sentence_start = -1;
+ attrs[i].is_sentence_end = TRUE;
+ }
- prev_type = type;
prev_wc = wc;
/* wc might not be a valid Unicode base character, but really all we
@@ -1798,6 +1659,7 @@ pango_default_break (const gchar *text,
type != G_UNICODE_NON_SPACING_MARK)
base_character = wc;
}
+
i--;
attrs[i].is_cursor_position = TRUE; /* Rule GB2 */
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]