[gtksourceview] html.lang: Update tag and attribute name regular expressions



commit 5385eb27720875bc144d54ccbd00d1613c41f7e0
Author: Jeffery To <jeffery to gmail com>
Date:   Mon Nov 4 04:28:45 2019 +0800

    html.lang: Update tag and attribute name regular expressions
    
    This updates the regular expressions for tag and attribute names,
    following the 12.2 Parsing HTML documents[1] section of the HTML Living
    Standard, specifically:
    
    * 12.2.3.5 Preprocessing the input stream
    * 12.2.5.6 Tag open state
    * 12.2.5.8 Tag name state
    * 12.2.5.32 Before attribute name state
    * 12.2.5.33 Attribute name state
    
    These characters are flagged as parse errors during preprocessing:
    
    * Surrogates[2]
    * Noncharacters[3]
    * Controls[4]
    
    (Including surrogate characters in GRegex regular expressions leads to
    compilation errors "disallowed Unicode code point (>= 0xd800 && <=
    0xdfff)", so they are not included/checked.)
    
    Null characters are also flagged as errors later in the parsing process.
    Other characters (whitespace, "/", ">", etc.) trigger state changes and
    so cannot be part of the tag / attribute name.
    
    Fixes #87.
    
    [1]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
    [2]: https://infra.spec.whatwg.org/#surrogate
    [3]: https://infra.spec.whatwg.org/#noncharacter
    [4]: https://infra.spec.whatwg.org/#control

 data/language-specs/html.lang       | 4 ++--
 tests/syntax-highlighting/file.html | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)
---
diff --git a/data/language-specs/html.lang b/data/language-specs/html.lang
index 5fa31275..5666d714 100644
--- a/data/language-specs/html.lang
+++ b/data/language-specs/html.lang
@@ -248,7 +248,7 @@
       </include>
     </context>
 
-    <define-regex id="attribute-name">[a-z0-9:_-]+</define-regex>
+    <define-regex id="attribute-name">[^\x00\t\n\f\r "'/&lt;=&gt;\x{007F}-\x{009F}\x{FDD0}-\x{FDEF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}\x{10FFFE}\x{10FFFF}]+</define-regex>
 
     <context id="generic-tag">
       <include>
@@ -350,7 +350,7 @@
     <replace id="js:embedded-lang-hooks" ref="js-embedded-lang-hooks"/>
 
     <context id="tag" class="no-spell-check">
-      <start>&lt;/?[a-z0-9_-]+</start>
+      <start>&lt;/?[a-z][^\x00\t\n\f\r /&gt;\x{007F}-\x{009F}\x{FDD0}-\x{FDEF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}\x{10FFFE}\x{10FFFF}]*</start>
       <end>/?&gt;</end>
       <include>
         <context sub-pattern="0" where="start" style-ref="tag"/>
diff --git a/tests/syntax-highlighting/file.html b/tests/syntax-highlighting/file.html
index 9cfcedd5..2f499b6d 100644
--- a/tests/syntax-highlighting/file.html
+++ b/tests/syntax-highlighting/file.html
@@ -22,5 +22,6 @@
        <span>Hi there!</span>
     </h1>
     Hi there!
+    <ex-元素 data-屬性="test">...</ex-元素>
   </body>
 </html>


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]