[tracker-miners] extract/msoffice-xml: Treat zero-length strings as unset properties
- From: Sam Thursfield <sthursfield src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker-miners] extract/msoffice-xml: Treat zero-length strings as unset properties
- Date: Wed, 4 Oct 2017 17:11:16 +0000 (UTC)
commit a1e766cd12610b10617a334489e8d117be337019
Author: Sam Thursfield <sam thursfield codethink co uk>
Date: Thu Sep 28 18:40:32 2017 +0100
extract/msoffice-xml: Treat zero-length strings as unset properties
The MS Office extractor has been producing stuff like this:
<file:///home/sam/Downloads/spreadsheet.xls> nie:comment "" ;
nie:contentLastModified "2016-06-13T14:19:50Z" ;
nie:contentCreated "2016-05-14T10:17:05Z" ;
nie:plainTextContent "..." ;
nie:subject "" ;
a nfo:PaginatedTextDocument ;
nie:title "" .
This breaks queries which use COALESCE to do things like this:
SELECT COALESCE(?nie_title, ?filename) as ?title
If ?nie_title is unset then ?title will be set to the contents of
?filename; but if ?nie_title is present and set to an empty string then
?title will be set to that empty string, which is not at all useful.
The extractor will now ignore zero-length strings. Rather than
using strlen() (which has to search to the end of the string)
we just check if the first byte is 0.
https://bugzilla.gnome.org/show_bug.cgi?id=788298
src/tracker-extract/tracker-extract-msoffice-xml.c | 22 ++++++++++----------
1 files changed, 11 insertions(+), 11 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-msoffice-xml.c b/src/tracker-extract/tracker-extract-msoffice-xml.c
index 1c6516c..b45667e 100644
--- a/src/tracker-extract/tracker-extract-msoffice-xml.c
+++ b/src/tracker-extract/tracker-extract-msoffice-xml.c
@@ -434,7 +434,7 @@ msoffice_xml_metadata_parse (GMarkupParseContext *context,
if (info->has_title) {
g_warning ("Avoiding additional title (%s) in MsOffice XML document '%s'",
text, info->uri);
- } else {
+ } else if (text[0] != '\0') {
info->has_title = TRUE;
tracker_resource_set_string (info->metadata, "nie:title", text);
}
@@ -444,7 +444,7 @@ msoffice_xml_metadata_parse (GMarkupParseContext *context,
if (info->has_subject) {
g_warning ("Avoiding additional subject (%s) in MsOffice XML document '%s'",
text, info->uri);
- } else {
+ } else if (text[0] != '\0') {
info->has_subject = TRUE;
tracker_resource_set_string (info->metadata, "nie:subject", text);
}
@@ -454,7 +454,7 @@ msoffice_xml_metadata_parse (GMarkupParseContext *context,
if (info->has_publisher) {
g_warning ("Avoiding additional publisher (%s) in MsOffice XML document '%s'",
text, info->uri);
- } else {
+ } else if (text[0] != '\0') {
TrackerResource *publisher = tracker_extract_new_contact (text);
info->has_publisher = TRUE;
@@ -468,7 +468,7 @@ msoffice_xml_metadata_parse (GMarkupParseContext *context,
if (info->has_comment) {
g_warning ("Avoiding additional comment (%s) in MsOffice XML document '%s'",
text, info->uri);
- } else {
+ } else if (text[0] != '\0') {
info->has_comment = TRUE;
tracker_resource_set_string (info->metadata, "nie:comment", text);
}
@@ -478,7 +478,7 @@ msoffice_xml_metadata_parse (GMarkupParseContext *context,
if (info->has_content_created) {
g_warning ("Avoiding additional creation time (%s) in MsOffice XML document '%s'",
text, info->uri);
- } else {
+ } else if (text[0] != '\0') {
gchar *date;
date = tracker_date_guess (text);
@@ -497,7 +497,7 @@ msoffice_xml_metadata_parse (GMarkupParseContext *context,
if (info->has_generator) {
g_warning ("Avoiding additional generator (%s) in MsOffice XML document '%s'",
text, info->uri);
- } else {
+ } else if (text[0] != '\0') {
info->has_generator = TRUE;
tracker_resource_set_string (info->metadata, "nie:generator", text);
}
@@ -514,7 +514,7 @@ msoffice_xml_metadata_parse (GMarkupParseContext *context,
if (info->has_content_last_modified) {
g_warning ("Avoiding additional last modification time (%s) in MsOffice XML document
'%s'",
text, info->uri);
- } else {
+ } else if (text[0] != '\0') {
gchar *date;
date = tracker_date_guess (text);
@@ -533,7 +533,7 @@ msoffice_xml_metadata_parse (GMarkupParseContext *context,
if (info->has_page_count) {
g_warning ("Avoiding additional page count (%s) in MsOffice XML document '%s'",
text, info->uri);
- } else {
+ } else if (text[0] != '\0') {
info->has_page_count = TRUE;
tracker_resource_set_string (info->metadata, "nfo:pageCount", text);
}
@@ -543,7 +543,7 @@ msoffice_xml_metadata_parse (GMarkupParseContext *context,
if (info->has_char_count) {
g_warning ("Avoiding additional character count (%s) in MsOffice XML document '%s'",
text, info->uri);
- } else {
+ } else if (text[0] != '\0') {
info->has_char_count = TRUE;
tracker_resource_set_string (info->metadata, "nfo:characterCount", text);
}
@@ -553,7 +553,7 @@ msoffice_xml_metadata_parse (GMarkupParseContext *context,
if (info->has_word_count) {
g_warning ("Avoiding additional word count (%s) in MsOffice XML document '%s'",
text, info->uri);
- } else {
+ } else if (text[0] != '\0') {
info->has_word_count = TRUE;
tracker_resource_set_string (info->metadata, "nfo:wordCount", text);
}
@@ -563,7 +563,7 @@ msoffice_xml_metadata_parse (GMarkupParseContext *context,
if (info->has_line_count) {
g_warning ("Avoiding additional line count (%s) in MsOffice XML document '%s'",
text, info->uri);
- } else {
+ } else if (text[0] != '\0') {
info->has_line_count = TRUE;
tracker_resource_set_string (info->metadata, "nfo:lineCount", text);
}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]