add date parser to TextChainerBackend.cs



Attached is a small patch to parse dates from text. It tries to deal
sensibly with ambiguous dates, and copes reasonably well with a variety
of formats.

cheers,

dave


--- dashboard/backends/TextChainerBackend.cs	2004-03-06 00:00:32.572809844 +0000
+++ TextChainerBackend-date.cs	2004-03-05 23:59:59.875394530 +0000
@@ -38,6 +38,18 @@
 
 		private const string REGEX_BUGZILLA
 			= "#([1-9][0-9]*)";
+
+		// regexes to match (mostly english) dates: DATE1 is numeric d/m/y or y/m/d, DATE2 is
+		// "5th March 2004", DATE3 is "March 5, [hh:mm:ss ...] 2004". "year" must hold the whole year.
+		private const string MONTHS
+			= "(?<month>(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\\w*)";
+		private const string REGEX_DATE1
+			= "((?<day>\\d{1,4})(\\-|\\/)(?<month>\\d{1,2})(\\-|\\/)(?<year>\\d{1,4}))";
+		private const string REGEX_DATE2
+			= "((?<day>\\d{1,2})\\w*\\s"+MONTHS+"\\s(?<year>\\d{4}|\\d{2}))";
+		private const string REGEX_DATE3
+			= "("+MONTHS+"\\s(?<day>\\d{1,2})[\\s,]+(\\d\\d:{0,1}){0,3}\\D*(?<year>\\d{4}))";
+
 		
 		public override bool Startup ()
 		{
@@ -131,6 +143,9 @@
 				newclues.AddRange (GetPatternMatches (clue, "email",    REGEX_EMAIL));
 				newclues.AddRange (GetPatternMatches (clue, "phone",    REGEX_PHONE));
 				newclues.AddRange (GetPatternMatches (clue, "bugzilla", REGEX_BUGZILLA));
+				newclues.AddRange (GetPatternMatches (clue, "date", 	REGEX_DATE1));
+				newclues.AddRange (GetPatternMatches (clue, "date", 	REGEX_DATE2));
+				newclues.AddRange (GetPatternMatches (clue, "date", 	REGEX_DATE3));
 
 				// convert htmlblocks to textblocks
 				ConvertHtmlToText (cp, newclues);
@@ -142,6 +157,62 @@
 			return result;
 		}
 
+		// convert a matched date to YYYY-MM-DD format. returns a two element array: [0] is the
+		// normalized date, [1] is the day/month-swapped alternative when a numeric (REGEX_DATE1)
+		// date is ambiguous, otherwise null. month names are mapped to numbers case-insensitively.
+		private String [] NormalizeDateString (Regex rx, System.Text.RegularExpressions.Match m) {
+			String day   = m.Groups["day"].Value;
+			String month = m.Groups["month"].Value;
+			String year  = m.Groups["year"].Value;
+			String [] results = new String[2];
+
+			// swap day & year if they're (unambiguously) the wrong way round
+			// useful for dealing with e.g. 02/05/78 and 78/05/02
+			if (((day.Length == 4) && (year.Length <= 2)) ||
+				((System.Convert.ToInt32 (day) > 31) && (System.Convert.ToInt32 (year) <= 31)) ||
+				((year.Length == 1) && (day.Length == 2)) ) {
+				day  = year;
+				year = m.Groups["day"].Value;
+			}
+
+			// convert month to a number. lower-case before comparing so that a bare
+			// three letter name like "Mar" is mapped as well as "March" or "MARCH"
+			month = month.ToLower ();
+			if (month.Length > 3)
+				month = month.Substring (0, 3);
+			if (month == "jan") month = "01";
+			if (month == "feb") month = "02";
+			if (month == "mar") month = "03";
+			if (month == "apr") month = "04";
+			if (month == "may") month = "05";
+			if (month == "jun") month = "06";
+			if (month == "jul") month = "07";
+			if (month == "aug") month = "08";
+			if (month == "sep") month = "09";
+			if (month == "oct") month = "10";
+			if (month == "nov") month = "11";
+			if (month == "dec") month = "12";
+
+			if (year.Length == 2 && System.Convert.ToInt32 (year) <  50)  year  = "20" + year; // totally arbitrary.
+			if (year.Length == 2 && System.Convert.ToInt32 (year) >= 50)  year  = "19" + year; // oh well.
+			if (month.Length == 1) month = "0" + month;
+			if (day.Length == 1)   day   = "0" + day;
+
+			results[0] = year + "-" + month + "-" + day;
+			// handle dates like 05/02/1994 - could be 5th Feb or 2nd May 1994
+			// if ambiguous generate a second date clue (results[1] is null otherwise)
+			if (rx.ToString () == REGEX_DATE1)
+				if ((System.Convert.ToInt32 (day) <= 12) && (System.Convert.ToInt32 (month) <= 12))
+					results[1] = year + "-" + day + "-" + month;
+			Console.WriteLine("--------------------------");
+			Console.WriteLine("{0} -> {1}", m.Groups[1].ToString (), results[0]);
+			if (results[1] != null)
+				Console.WriteLine("{0} -> {1}", m.Groups[1].ToString (), results[1]);
+			Console.WriteLine("--------------------------");
+			return results;
+		}
+
+
 		private ArrayList GetPatternMatches (Clue clue, string type, string regex)
 		{
 			Regex r;
@@ -152,7 +223,13 @@
 			ArrayList newclues = new ArrayList ();
 			for (m = r.Match (clue.Text); m.Success; m = m.NextMatch ()) {
 				Console.WriteLine ("TextChainer chaining to type " + type + ": " + m.Groups[1].ToString ());
-				newclues.Add (new Clue (type, m.Groups[1].ToString (), 10, clue));
+				if (type == "date")
+					foreach (String d in NormalizeDateString (r, m)) {
+						if (d != null)
+							newclues.Add (new Clue (type, d, 10, clue));
+					}
+				else
+					newclues.Add (new Clue (type, m.Groups[1].ToString (), 10, clue));
 			}
 
 			return newclues;


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]