[pan2: 49/68] Minor updates to the regexes.



commit 592421b9f12c2053f5cbf48abf49ff420c0838f4
Author: K. Haley <haleykd users sf net>
Date:   Sun Aug 8 14:13:42 2010 -0600

    Minor updates to the regexes.

 pan/usenet-utils/text-massager-test.cc |   13 ++++++++-----
 pan/usenet-utils/text-massager.cc      |   26 +++++++++++++-------------
 2 files changed, 21 insertions(+), 18 deletions(-)
---
diff --git a/pan/usenet-utils/text-massager-test.cc b/pan/usenet-utils/text-massager-test.cc
index 62dbffd..796ecf9 100644
--- a/pan/usenet-utils/text-massager-test.cc
+++ b/pan/usenet-utils/text-massager-test.cc
@@ -32,7 +32,6 @@ int main (void)
    in = "> a\n> b\n> c";
    out = tm.fill (in);
    expected_out = "> a\n> b\n> c";
-   std::cout<<out<<"\n---\n"<<expected_out<<std::endl;
    check (out == expected_out);
 
    /* wrap real-world 1 */
@@ -214,7 +213,6 @@ int main (void)
    check (out == expected_out);
 
    // mute quoted test: realworld 2
-   check (out == expected_out);
    in =
 "In article <bl0D6 3171$Uo2 75315 zwoll1 home nl>, \"Marcel Pol\"\n"
 "<mpol nospam gmx net> wrote:\n"
@@ -264,7 +262,7 @@ int main (void)
 
    const char *in2, *sep="_";
    in2 = "prefix - one ...__   - two - three";
-   expected_out = "prefix_-_one_-_two_-_three";
+   expected_out = "prefix_one_two_three";
    out = pan::subject_to_path(in2, sep);
    //std::cout<<"input: '"<<in2<<"'\noutput: '"<<out<<"'\n"<<std::endl;
    check(out == expected_out);
@@ -309,7 +307,7 @@ int main (void)
    //std::cout<<"input: '"<<in2<<"'\noutput: '"<<out<<"'\n"<<std::endl;
    check(out == expected_out);
    in2 = "[ASDF-FDSE]  Name1 & Name2 - Spettertje - 01 title here  (thx AntA)  Post 6_6 - File 9_9 - aaspettertje01.sfv (1/1)";
-   expected_out = "[ASDF-FDSE]_Name1_&_Name2_-_Spettertje_-_01_title_here_(thx_AntA)";
+   expected_out = "[ASDF-FDSE]_Name1_&_Name2_Spettertje_01_title_here_(thx_AntA)";
    out = pan::subject_to_path(in2, sep);
    //std::cout<<"input: '"<<in2<<"'\noutput: '"<<out<<"'\n"<<std::endl;
    check(out == expected_out);
@@ -329,7 +327,12 @@ int main (void)
    //std::cout<<"input: '"<<in2<<"'\noutput: '"<<out<<"'\n"<<std::endl;
    check(out == expected_out);
    in2 = "one - two three [1/2] - \"00 - title spaces.foo\" yEnc (1/5)";
-   expected_out = "one_-_two_three";
+   expected_out = "one_two_three";
+   out = pan::subject_to_path(in2, sep);
+   //std::cout<<"input: '"<<in2<<"'\noutput: '"<<out<<"'\n"<<std::endl;
+   check(out == expected_out);
+   in2 = "one - two three [1/2] - \"00 - title spaces.foo\" (/5)";
+   expected_out = "one_two_three";
    out = pan::subject_to_path(in2, sep);
    //std::cout<<"input: '"<<in2<<"'\noutput: '"<<out<<"'\n"<<std::endl;
    check(out == expected_out);
diff --git a/pan/usenet-utils/text-massager.cc b/pan/usenet-utils/text-massager.cc
index 30203a7..2f1997c 100644
--- a/pan/usenet-utils/text-massager.cc
+++ b/pan/usenet-utils/text-massager.cc
@@ -381,45 +381,45 @@ pan :: subject_to_path (const char * subjectline, const std::string &seperator)
   }
 
   // strip out newspost/Xnews-style multi-part strings
-  GRegex *mp1 =g_regex_new("\\s*(?:[Ff]ile|[Pp]ost) [0-9]+ *(?:of|_) *[0-9]+[: ]?", cf0, mf0, NULL);
+  GRegex *mp1 =g_regex_new("\\s*(?:[Ff]ile|[Pp]ost)\\s[0-9]+\\s*(?:of|_)\\s*[0-9]+[:\\s]?", cf0, mf0, NULL);
   str1 = g_regex_replace_literal(mp1, val.c_str(), -1, 0, " ", mf0, NULL);
   g_regex_unref(mp1);
 
-  // and the rest
-  GRegex *mp2 =g_regex_new("\\s*[\[(]?[0-9]+\\s*(?:of|/)\\s*[0-9]+.", cf0, mf0, NULL);
+  // and the rest.  The last alternative matches Pan's collapsed part count, e.g. "(/5)"
+  GRegex *mp2 =g_regex_new("\\s*([\[(]?'?[0-9]+'?\\s*(?:of|/)\\s*'?[0-9]+'?.)|\\(/[0-9]+\\)", cf0, mf0, NULL);
   str2 = g_regex_replace_literal(mp2, str1, -1, 0, "", mf0, NULL);
   g_free(str1);
   g_regex_unref(mp2);
 
   // try to strip out the filename (may fail if it contains spaces)
-  GRegex *fn =g_regex_new("\"[^\"]+?\" yEnc.*" "|"
-                          "\\S++\\s++yEnc.*" "|"
+  GRegex *fn =g_regex_new("\"[^\"]+?\" yEnc.*"    "|"
+                          "\\S++\\s++yEnc.*"      "|"
                           "\"[^\"]+?\\.\\w{2,}\"" "|"
-                          "\\S+\\.\\w{2,}", cf0, mf0, NULL);
+                          "\\S+\\.\\w{3,4}", cf0, mf0, NULL);
   str1 = g_regex_replace_literal(fn, str2, -1, 0, "", mf0, NULL);
   g_free(str2);
   g_regex_unref(fn);
 
   // try to strip out any byte counts
-  GRegex *cnt =g_regex_new("\\[?[0-9]+ *(?:[Bb]ytes|[Kk][Bb]?)\\]?", cf0, mf0, NULL);
+  GRegex *cnt =g_regex_new("\\[?[0-9]+\\s*(?:[Bb](ytes)?|[Kk][Bb]?)\\]?", cf0, mf0, NULL);
   str2 = g_regex_replace_literal(cnt, str1, -1, 0, "", mf0, NULL);
   g_free(str1);
   g_regex_unref(cnt);
 
   // remove any illegal / annoying characters
-  GRegex *badc =g_regex_new("[\\\\/<>|*?'\"]+", cf0, mf0, NULL);
-  str1 = g_regex_replace_literal(badc, str2, -1, 0, "_", mf0, NULL);
+  GRegex *badc =g_regex_new("[\\\\/<>|*?'\"\\.\\s]+", cf0, mf0, NULL);
+  str1 = g_regex_replace_literal(badc, str2, -1, 0, sep, mf0, NULL);
   g_free(str2);
   g_regex_unref(badc);
 
-  // remove any extraneous whitespace, '_', and '.'
-  GRegex *ext =g_regex_new("[\\s_\\.]+", cf0, mf0, NULL);
+  // collapse runs of two or more whitespace, '_', or '-' characters into the separator
+  GRegex *ext =g_regex_new("[\\s_-]{2,}", cf0, mf0, NULL);
   str2 = g_regex_replace_literal(ext, str1, -1, 0, sep, mf0, NULL);
   g_free(str1);
   g_regex_unref(ext);
 
-  // remove trailing junk
-  ext =g_regex_new("[_-]+$", cf0, mf0, NULL);
+  // remove leading & trailing junk
+  ext =g_regex_new("(^[\\s_-]+)|([\\s_-]+$)", cf0, mf0, NULL);
   str1 = g_regex_replace_literal(ext, str2, -1, 0, "", mf0, NULL);
   g_free(str2);
   g_regex_unref(ext);



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]