textchain backend



Here's a patch to try for backend-textchain.cs:

* New phone regexp: captures most international numbers as well as US
numbers.  The downside is that it sometimes lets other things through by
accident (e.g. the date 2003-07-21 would match).  Backends really need
to normalise phone numbers before comparing them as well (my idea of
this: strip out everything except digits and the "+" international prefix).

* URL normalisation: puts the 'http://' or 'ftp://' scheme back on the
front of URLs inferred from a string beginning with 'www.' or 'ftp.'.

-- Edd

(BTW, I'm happy to check in these patches myself; I just figured it might
be wise to send them to the list for review before doing so.)
Index: backends/backend-textchain.cs
===================================================================
RCS file: /cvs/gnome/dashboard/backends/backend-textchain.cs,v
retrieving revision 1.2
diff -u -r1.2 backend-textchain.cs
--- backends/backend-textchain.cs	19 Jul 2003 13:33:45 -0000	1.2
+++ backends/backend-textchain.cs	21 Jul 2003 11:16:17 -0000
@@ -19,10 +19,17 @@
 
 		private const string REGEX_URL
 			= "((((http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+(:[0-9]*)?(/)?)";
+
+		// this regex found at http://www.dotnetforums.net/t49640.html
 		private const string REGEX_EMAIL
-			= "(\\S+@\\S+)";
+			= "([\\w\\.\\-]+@[a-zA-Z0-9\\-]+(\\.[a-zA-Z0-9\\-]{1,})*(\\.[a-zA-Z]{2,3}){1,2})";
+
 		private const string REGEX_PHONE_US
 			= "(\\(?[2-9][0-9]{2}\\)?[-. ][1-9][0-9]{2}[-. ][0-9]{4})";
+
+		private const string REGEX_PHONE
+			= "([\\+\\(]?\\d(\\d|\\)|[\\-\\(\\. ]\\d){6,})[\\s$]";
+
 		private const string REGEX_BUGZILLA
 			= "#([1-9][0-9]*)";
 		
@@ -38,15 +45,43 @@
 		public override BackendResult ProcessCluePacket (CluePacket cp)
 		{
 			ArrayList newclues = new ArrayList ();
+			ArrayList urlclues = new ArrayList ();
 
 			foreach (Clue clue in cp.Clues) {
 
 				if (clue.Type != "textblock")
 					continue;
 
-				newclues.AddRange (this.GetPatternMatches (clue, "url",      REGEX_URL));
+
+				// we normalize url clues to have their
+				// URI schemes on the front
+				
+				urlclues.AddRange 
+					(this.GetPatternMatches (clue, "url", REGEX_URL));
+
+				foreach (Clue c in urlclues) {
+					Clue r=null;
+
+					if (String.Compare 
+							(c.Text, 0, "www.", 0, 4, true) == 0) {
+						r=new Clue (c.Type, 
+								"http://" + c.Text,
+								c.Relevance);
+					} else if (String.Compare 
+							(c.Text, 0, "ftp.", 0, 4, true) == 0) {
+						r=new Clue (c.Type,
+								"ftp://" + c.Text,
+								c.Relevance);
+					}
+					if (r == null) {
+						newclues.Add (c);
+					} else {
+						newclues.Add (r);
+					}
+				}
+
 				newclues.AddRange (this.GetPatternMatches (clue, "email",    REGEX_EMAIL));
-				newclues.AddRange (this.GetPatternMatches (clue, "phone",    REGEX_PHONE_US));
+				newclues.AddRange (this.GetPatternMatches (clue, "phone",    REGEX_PHONE));
 				newclues.AddRange (this.GetPatternMatches (clue, "bugzilla", REGEX_BUGZILLA));
 			}
 				


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]