[pan2: 49/68] Minor updates to the regexes.
- From: Petr Kovář <pmkovar src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [pan2: 49/68] Minor updates to the regexes.
- Date: Tue, 8 Feb 2011 23:02:08 +0000 (UTC)
commit 592421b9f12c2053f5cbf48abf49ff420c0838f4
Author: K. Haley <haleykd users sf net>
Date: Sun Aug 8 14:13:42 2010 -0600
Minor updates to the regexes.
pan/usenet-utils/text-massager-test.cc | 13 ++++++++-----
pan/usenet-utils/text-massager.cc | 26 +++++++++++++-------------
2 files changed, 21 insertions(+), 18 deletions(-)
---
diff --git a/pan/usenet-utils/text-massager-test.cc b/pan/usenet-utils/text-massager-test.cc
index 62dbffd..796ecf9 100644
--- a/pan/usenet-utils/text-massager-test.cc
+++ b/pan/usenet-utils/text-massager-test.cc
@@ -32,7 +32,6 @@ int main (void)
in = "> a\n> b\n> c";
out = tm.fill (in);
expected_out = "> a\n> b\n> c";
- std::cout<<out<<"\n---\n"<<expected_out<<std::endl;
check (out == expected_out);
/* wrap real-world 1 */
@@ -214,7 +213,6 @@ int main (void)
check (out == expected_out);
// mute quoted test: realworld 2
- check (out == expected_out);
in =
"In article <bl0D6 3171$Uo2 75315 zwoll1 home nl>, \"Marcel Pol\"\n"
"<mpol nospam gmx net> wrote:\n"
@@ -264,7 +262,7 @@ int main (void)
const char *in2, *sep="_";
in2 = "prefix - one ...__ - two - three";
- expected_out = "prefix_-_one_-_two_-_three";
+ expected_out = "prefix_one_two_three";
out = pan::subject_to_path(in2, sep);
//std::cout<<"input: '"<<in2<<"'\noutput: '"<<out<<"'\n"<<std::endl;
check(out == expected_out);
@@ -309,7 +307,7 @@ int main (void)
//std::cout<<"input: '"<<in2<<"'\noutput: '"<<out<<"'\n"<<std::endl;
check(out == expected_out);
in2 = "[ASDF-FDSE] Name1 & Name2 - Spettertje - 01 title here (thx AntA) Post 6_6 - File 9_9 - aaspettertje01.sfv (1/1)";
- expected_out = "[ASDF-FDSE]_Name1_&_Name2_-_Spettertje_-_01_title_here_(thx_AntA)";
+ expected_out = "[ASDF-FDSE]_Name1_&_Name2_Spettertje_01_title_here_(thx_AntA)";
out = pan::subject_to_path(in2, sep);
//std::cout<<"input: '"<<in2<<"'\noutput: '"<<out<<"'\n"<<std::endl;
check(out == expected_out);
@@ -329,7 +327,12 @@ int main (void)
//std::cout<<"input: '"<<in2<<"'\noutput: '"<<out<<"'\n"<<std::endl;
check(out == expected_out);
in2 = "one - two three [1/2] - \"00 - title spaces.foo\" yEnc (1/5)";
- expected_out = "one_-_two_three";
+ expected_out = "one_two_three";
+ out = pan::subject_to_path(in2, sep);
+ //std::cout<<"input: '"<<in2<<"'\noutput: '"<<out<<"'\n"<<std::endl;
+ check(out == expected_out);
+ in2 = "one - two three [1/2] - \"00 - title spaces.foo\" (/5)";
+ expected_out = "one_two_three";
out = pan::subject_to_path(in2, sep);
//std::cout<<"input: '"<<in2<<"'\noutput: '"<<out<<"'\n"<<std::endl;
check(out == expected_out);
diff --git a/pan/usenet-utils/text-massager.cc b/pan/usenet-utils/text-massager.cc
index 30203a7..2f1997c 100644
--- a/pan/usenet-utils/text-massager.cc
+++ b/pan/usenet-utils/text-massager.cc
@@ -381,45 +381,45 @@ pan :: subject_to_path (const char * subjectline, const std::string &seperator)
}
// strip out newspost/Xnews-style multi-part strings
- GRegex *mp1 =g_regex_new("\\s*(?:[Ff]ile|[Pp]ost) [0-9]+ *(?:of|_) *[0-9]+[: ]?", cf0, mf0, NULL);
+ GRegex *mp1 =g_regex_new("\\s*(?:[Ff]ile|[Pp]ost)\\s[0-9]+\\s*(?:of|_)\\s*[0-9]+[:\\s]?", cf0, mf0, NULL);
str1 = g_regex_replace_literal(mp1, val.c_str(), -1, 0, " ", mf0, NULL);
g_regex_unref(mp1);
- // and the rest
- GRegex *mp2 =g_regex_new("\\s*[\[(]?[0-9]+\\s*(?:of|/)\\s*[0-9]+.", cf0, mf0, NULL);
+ // and the rest. the last check is for pans collapsed part count
+ GRegex *mp2 =g_regex_new("\\s*([\[(]?'?[0-9]+'?\\s*(?:of|/)\\s*'?[0-9]+'?.)|\\(/[0-9]+\\)", cf0, mf0, NULL);
str2 = g_regex_replace_literal(mp2, str1, -1, 0, "", mf0, NULL);
g_free(str1);
g_regex_unref(mp2);
// try to strip out the filename (may fail if it contains spaces)
- GRegex *fn =g_regex_new("\"[^\"]+?\" yEnc.*" "|"
- "\\S++\\s++yEnc.*" "|"
+ GRegex *fn =g_regex_new("\"[^\"]+?\" yEnc.*" "|"
+ "\\S++\\s++yEnc.*" "|"
"\"[^\"]+?\\.\\w{2,}\"" "|"
- "\\S+\\.\\w{2,}", cf0, mf0, NULL);
+ "\\S+\\.\\w{3,4}", cf0, mf0, NULL);
str1 = g_regex_replace_literal(fn, str2, -1, 0, "", mf0, NULL);
g_free(str2);
g_regex_unref(fn);
// try to strip out any byte counts
- GRegex *cnt =g_regex_new("\\[?[0-9]+ *(?:[Bb]ytes|[Kk][Bb]?)\\]?", cf0, mf0, NULL);
+ GRegex *cnt =g_regex_new("\\[?[0-9]+\\s*(?:[Bb](ytes)?|[Kk][Bb]?)\\]?", cf0, mf0, NULL);
str2 = g_regex_replace_literal(cnt, str1, -1, 0, "", mf0, NULL);
g_free(str1);
g_regex_unref(cnt);
// remove any illegal / annoying characters
- GRegex *badc =g_regex_new("[\\\\/<>|*?'\"]+", cf0, mf0, NULL);
- str1 = g_regex_replace_literal(badc, str2, -1, 0, "_", mf0, NULL);
+ GRegex *badc =g_regex_new("[\\\\/<>|*?'\"\\.\\s]+", cf0, mf0, NULL);
+ str1 = g_regex_replace_literal(badc, str2, -1, 0, sep, mf0, NULL);
g_free(str2);
g_regex_unref(badc);
- // remove any extraneous whitespace, '_', and '.'
- GRegex *ext =g_regex_new("[\\s_\\.]+", cf0, mf0, NULL);
+ // remove any extraneous whitespace, '_', & '-'
+ GRegex *ext =g_regex_new("[\\s_-]{2,}", cf0, mf0, NULL);
str2 = g_regex_replace_literal(ext, str1, -1, 0, sep, mf0, NULL);
g_free(str1);
g_regex_unref(ext);
- // remove trailing junk
- ext =g_regex_new("[_-]+$", cf0, mf0, NULL);
+ // remove leading & trailing junk
+ ext =g_regex_new("(^[\\s_-]+)|([\\s_-]+$)", cf0, mf0, NULL);
str1 = g_regex_replace_literal(ext, str2, -1, 0, "", mf0, NULL);
g_free(str2);
g_regex_unref(ext);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]