[pan2: 1/23] change URI detection



commit 4bf51cfd08140377868bbcbb0ef8ae6d61f6cfc7
Author: K. Haley <haleykd users sf net>
Date:   Wed Feb 9 00:04:41 2011 -0700

    change URI detection

 pan/usenet-utils/url-find-test.cc |   16 ++++++----
 pan/usenet-utils/url-find.cc      |   56 +++++++++++++++++++++++++++++++++++-
 2 files changed, 64 insertions(+), 8 deletions(-)
---
diff --git a/pan/usenet-utils/url-find-test.cc b/pan/usenet-utils/url-find-test.cc
index a8db50c..076213a 100644
--- a/pan/usenet-utils/url-find-test.cc
+++ b/pan/usenet-utils/url-find-test.cc
@@ -23,9 +23,9 @@ main (void)
   check (url_find (in, out))
   check (out == "http://www.gtk.org/")
 
-  in = "Have you ever visited http://www.google.com/?";
+  in = "Have you ever visited https://www.google.com/?";
   check (url_find (in, out))
-  check (out == "http://www.google.com/")
+  check (out == "https://www.google.com/")
 
   in = "la la lawww.google.com sfadf";
   check (url_find (in, out))
@@ -35,9 +35,9 @@ main (void)
   check (url_find (in, out))
   check (out == "charles@rebelbase.com")
 
-  in = "Go visit ftp://ftp.gnome.org and get cool software!";
+  in = "Go visit ftps://ftp.gnome.org and get cool software!";
   check (url_find (in, out))
-  check (out == "ftp://ftp.gnome.org")
+  check (out == "ftps://ftp.gnome.org")
 
   in = "Go visit ftp.gnome.org and get cool software!";
   check (url_find (in, out))
@@ -66,9 +66,13 @@ main (void)
   check (url_find (in, out))
   check (out == "charles@rebelbase.com")
 
-  in = "blah blah <http://www.dobreprogramy.pl/pirat/dobreprogramy_pl(piratXXX).jpg>, only the";
+  in = "blah blah <http://www.dobreprogramy.pl/pirat/dobreprogramy_pl(piratXXX).jpg#frag>, only the";
   check (url_find (in, out))
-  check (out == "http://www.dobreprogramy.pl/pirat/dobreprogramy_pl(piratXXX).jpg")
+  check (out == "http://www.dobreprogramy.pl/pirat/dobreprogramy_pl(piratXXX).jpg#frag")
+
+  in = "Here is my email address: lost.foo_bar@rebelbase.com  Did you get it?";
+  check (url_find (in, out))
+  check (out == "lost.foo_bar@rebelbase.com")
 
 
   // success
diff --git a/pan/usenet-utils/url-find.cc b/pan/usenet-utils/url-find.cc
index b33d6c7..2dcabde 100644
--- a/pan/usenet-utils/url-find.cc
+++ b/pan/usenet-utils/url-find.cc
@@ -23,11 +23,63 @@
 #include "url-find.h"
 
 using namespace pan;
+namespace {
+  // Owns the compiled URI/e-mail pattern behind the file-scope `regex`.
+  class fooregex {
+    public:
+      fooregex(): regex(g_regex_new("(?:https?://|"
+          "ftps?(?:://|\\.)|" //ftp:// ftp.
+          "news:|"
+          "www\\.|"
+          "[[:alnum:]][[:alnum:]_\\.]*@)" //email
+          // RFC 3986 gen-delims + sub-delims, plus unreserved marks and '%'
+          "[[:alnum:]\\.:/?#\\[\\]@!$&'()*+,;=\\-_%~]+" /* uri */,
+          G_REGEX_OPTIMIZE, (GRegexMatchFlags)0, NULL)) {}
+      // g_regex_unref() rejects NULL, which is what a failed compile leaves.
+      ~fooregex() { if (regex) g_regex_unref(regex); }
+      operator GRegex*() {return regex;}
+
+      GRegex *regex;
+    private:
+      fooregex(const fooregex&);            // non-copyable: a copy would
+      fooregex& operator=(const fooregex&); // unref the GRegex twice
+  };
+
+  fooregex regex;
+}
+
+// Finds the first URI or e-mail address in text; on success points setme_url
+// at it (a view into text, trailing sentence punctuation dropped).
+bool
+pan :: url_find (const StringView& text, StringView& setme_url)
+{
+  if (text.empty())
+    return false;
+
+  // text carries an explicit length; don't rely on a trailing NUL.
+  GMatchInfo *match = NULL;
+  const bool found = g_regex_match_full(regex, text.str, text.len, 0,
+                                        (GRegexMatchFlags)0, &match, NULL);
+  int start = 0, end = 0;
+  if (found)
+    g_match_info_fetch_pos(match, 0, &start, &end);
+  g_match_info_free(match); // allocated even when nothing matched
+  if (!found)
+    return false;
+  setme_url.assign(text.str+start, end - start);
+  if (!setme_url.empty() && strchr("?!.,", setme_url.back()))
+    --setme_url.len; // for urls at the end of a sentence.
+  return true;
+}
 
 // This is a cheap little hack that should eventually be replaced
 // with something more robust.
+namespace pan {
+bool url_findx (const StringView& text, StringView& setme_url);
+}
+
 bool
-pan :: url_find (const StringView& text, StringView& setme_url)
+pan :: url_findx (const StringView& text, StringView& setme_url)
 {
   if (text.empty())
     return false;
@@ -80,7 +132,7 @@ pan :: url_find (const StringView& text, StringView& setme_url)
     char ch (start[-1]);
     if (ch == '[') bracket = ']';
     else if (ch == '<') bracket = '>';
-  } 
+  }
 
   const char * pch;
   for (pch=start; pch!=text.end(); ++pch) {



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]