[pan2: 3/23] another url update.



commit 4512976efbca96a6127b8c88991dd39b5f10f1f7
Author: K. Haley <haleykd users sf net>
Date:   Sat Feb 12 01:15:52 2011 -0700

    another url update.

 pan/usenet-utils/url-find-test.cc |    3 +++
 pan/usenet-utils/url-find.cc      |   23 ++++++++++++++++-------
 2 files changed, 19 insertions(+), 7 deletions(-)
---
diff --git a/pan/usenet-utils/url-find-test.cc b/pan/usenet-utils/url-find-test.cc
index 076213a..62ac3bf 100644
--- a/pan/usenet-utils/url-find-test.cc
+++ b/pan/usenet-utils/url-find-test.cc
@@ -74,6 +74,9 @@ main (void)
   check (url_find (in, out))
   check (out == "lost foo_bar rebelbase com")
 
+  in = "A URl 'http://www.www.com'?";
+  check (url_find (in, out))
+  check (out == "http://www.www.com")
 
   // success
   return 0;
diff --git a/pan/usenet-utils/url-find.cc b/pan/usenet-utils/url-find.cc
index 2dcabde..8034839 100644
--- a/pan/usenet-utils/url-find.cc
+++ b/pan/usenet-utils/url-find.cc
@@ -28,12 +28,19 @@ namespace {
     public:
       fooregex(): regex(NULL)
       {
-        regex = g_regex_new("(?:https?://|"
-          "ftps?(?:://|\\.)|" //ftp:// ftp.
-          "news:|"
-          "www\\.|"
-          "[[:alnum:]][[:alnum:]_\\.]*@)" //email
-          "[[:alnum:]\\.:/?#\\[\\] !$&'()*+,;=\\-_%~]+" /* uri */,
+        // RFC1738
+        // unsafe in URL (always encoded):  {}|\^~[]`<>"
+        // reserved for schemas: ;/?:@=&
+        // % (hex encoding) # (fragment)
+        // allowed: a-z A-Z 0-9 $-_.+!*'(),
+        regex = g_regex_new("(?:"
+            "https?://|"
+            "ftps?(?:://|\\.)|" //ftp:// ftp.
+            "news:|nntp:|"
+            "www\\.|"
+            "[[:alnum:]][[:alnum:]_\\.]*@" //email
+          ")"
+          "[" "[:alnum:]$_\\-\\.!+*()',%#" ";:/?&=@" "]+" /* uri */,
           G_REGEX_OPTIMIZE, (GRegexMatchFlags)0, NULL);
       }
       ~fooregex()
@@ -68,7 +75,9 @@ pan :: url_find (const StringView& text, StringView& setme_url)
   // for urls at the end of a sentence.
   if (!setme_url.empty() && strchr("?!.,", setme_url.back()))
     --setme_url.len;
-
+  const char c = text.str[ start - 1 ];
+  if (c == '\'' && c == setme_url.back() )
+    --setme_url.len;
   return true;
 }
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]