[tracker/tracker-0.10] tracker-extract: make Abiword extractor use GMarkup
- From: Martyn James Russell <mr src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/tracker-0.10] tracker-extract: make Abiword extractor use GMarkup
- Date: Thu, 5 May 2011 12:59:16 +0000 (UTC)
commit d44f2be04b77a9cbb894a92481e4a4c8bf3689c3
Author: Carlos Garnacho <carlosg gnome org>
Date: Fri Apr 29 12:41:26 2011 +0200
tracker-extract: make Abiword extractor use GMarkup
This enables us to do content extraction properly. Fixes GB#633105,
reported by Aleksander Morgado.
src/tracker-extract/tracker-extract-abw.c | 212 ++++++++++++++++++++++------
1 files changed, 166 insertions(+), 46 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-abw.c b/src/tracker-extract/tracker-extract-abw.c
index f89d088..89f048f 100644
--- a/src/tracker-extract/tracker-extract-abw.c
+++ b/src/tracker-extract/tracker-extract-abw.c
@@ -46,71 +46,191 @@ static TrackerExtractData data[] = {
{ NULL, NULL }
};
+/* Identifies which <m key="..."> metadata entry was opened most recently.
+ * ABW_PARSER_TAG_UNHANDLED is pinned to 0 so that a zero-initialised
+ * AbwParserData starts out with no tag selected.
+ */
+typedef enum {
+	ABW_PARSER_TAG_UNHANDLED = 0,
+	ABW_PARSER_TAG_TITLE,
+	ABW_PARSER_TAG_SUBJECT,
+	ABW_PARSER_TAG_CREATOR,
+	ABW_PARSER_TAG_KEYWORDS,
+	ABW_PARSER_TAG_DESCRIPTION,
+	ABW_PARSER_TAG_GENERATOR
+} AbwParserTag;
+
+/* State handed to the GMarkup callbacks through their user_data pointer. */
+typedef struct AbwParserData {
+	/* Builder that receives the extracted metadata statements. */
+	TrackerSparqlBuilder *metadata;
+	/* Stored alongside metadata; not read by the parser callbacks. */
+	TrackerSparqlBuilder *preupdate;
+	/* Text gathered once in_text is set; created lazily, NULL until then. */
+	GString *content;
+
+	/* Holds an AbwParserTag value. */
+	guint cur_tag;
+	/* Set when a <section> element has been opened. */
+	guint in_text : 1;
+} AbwParserData;
+
+/* GMarkup start-element callback.
+ *
+ * For an <m> element, looks up its "key" attribute and records the matching
+ * AbwParserTag in the parser state so the text callback knows which metadata
+ * property the following text belongs to. For a <section> element, flags the
+ * parser state so that subsequent text is collected as document content.
+ *
+ * context, error: unused.
+ * element_name: name of the element being opened.
+ * attribute_names / attribute_values: parallel NULL-terminated arrays.
+ * user_data: the AbwParserData for this parse.
+ */
+static void
+abw_parser_start_elem (GMarkupParseContext *context,
+                       const gchar         *element_name,
+                       const gchar        **attribute_names,
+                       const gchar        **attribute_values,
+                       gpointer             user_data,
+                       GError             **error)
+{
+	static const struct {
+		const gchar *key;
+		AbwParserTag tag;
+	} known_keys[] = {
+		{ "dc.title",          ABW_PARSER_TAG_TITLE },
+		{ "dc.subject",        ABW_PARSER_TAG_SUBJECT },
+		{ "dc.creator",        ABW_PARSER_TAG_CREATOR },
+		{ "abiword.keywords",  ABW_PARSER_TAG_KEYWORDS },
+		{ "dc.description",    ABW_PARSER_TAG_DESCRIPTION },
+		{ "abiword.generator", ABW_PARSER_TAG_GENERATOR }
+	};
+	AbwParserData *data = user_data;
+	guint i, j;
+
+	/* A tag chosen by an earlier element that produced no text must not be
+	 * applied to the text of this element.
+	 */
+	data->cur_tag = ABW_PARSER_TAG_UNHANDLED;
+
+	if (g_strcmp0 (element_name, "m") == 0) {
+		/* The "key" attribute is not required to be the first one. */
+		for (i = 0; attribute_names[i] != NULL; i++) {
+			if (g_strcmp0 (attribute_names[i], "key") != 0) {
+				continue;
+			}
+
+			for (j = 0; j < G_N_ELEMENTS (known_keys); j++) {
+				if (g_strcmp0 (attribute_values[i], known_keys[j].key) == 0) {
+					data->cur_tag = known_keys[j].tag;
+					break;
+				}
+			}
+
+			break;
+		}
+	} else if (g_strcmp0 (element_name, "section") == 0) {
+		data->in_text = TRUE;
+	}
+}
+
+/* GMarkup text callback.
+ *
+ * When the parser state carries a metadata tag, writes the text to the
+ * metadata builder under the matching predicate (keywords are split on
+ * ',', ';' and ' ' and written one by one; the creator is written as a
+ * blank nco:Contact node). When the state is flagged as being inside
+ * document text, the raw text is also appended to the content buffer.
+ * The tag is cleared before returning.
+ *
+ * context, error: unused.
+ * text, text_len: the text run and its length in bytes.
+ * user_data: the AbwParserData for this parse.
+ */
+static void
+abw_parser_text (GMarkupParseContext *context,
+                 const gchar         *text,
+                 gsize                text_len,
+                 gpointer             user_data,
+                 GError             **error)
+{
+	AbwParserData *data = user_data;
+	const gchar *simple_predicate = NULL;
+	gchar *value;
+
+	/* NUL-terminated copy of exactly text_len bytes; strtok_r() below
+	 * modifies it in place.
+	 */
+	value = g_strndup (text, text_len);
+
+	if (data->cur_tag == ABW_PARSER_TAG_CREATOR) {
+		tracker_sparql_builder_predicate (data->metadata, "nco:creator");
+
+		tracker_sparql_builder_object_blank_open (data->metadata);
+		tracker_sparql_builder_predicate (data->metadata, "a");
+		tracker_sparql_builder_object (data->metadata, "nco:Contact");
+
+		tracker_sparql_builder_predicate (data->metadata, "nco:fullname");
+		tracker_sparql_builder_object_unvalidated (data->metadata, value);
+		tracker_sparql_builder_object_blank_close (data->metadata);
+	} else if (data->cur_tag == ABW_PARSER_TAG_KEYWORDS) {
+		char *save_ptr;
+		char *token = strtok_r (value, ",; ", &save_ptr);
+
+		while (token) {
+			tracker_sparql_builder_predicate (data->metadata, "nie:keyword");
+			tracker_sparql_builder_object_unvalidated (data->metadata, token);
+			token = strtok_r (NULL, ",; ", &save_ptr);
+		}
+	} else {
+		/* The remaining tags each map to a single literal property. */
+		switch (data->cur_tag) {
+		case ABW_PARSER_TAG_TITLE:
+			simple_predicate = "nie:title";
+			break;
+		case ABW_PARSER_TAG_SUBJECT:
+			simple_predicate = "nie:subject";
+			break;
+		case ABW_PARSER_TAG_DESCRIPTION:
+			simple_predicate = "nie:comment";
+			break;
+		case ABW_PARSER_TAG_GENERATOR:
+			simple_predicate = "nie:generator";
+			break;
+		default:
+			break;
+		}
+
+		if (simple_predicate) {
+			tracker_sparql_builder_predicate (data->metadata, simple_predicate);
+			tracker_sparql_builder_object_unvalidated (data->metadata, value);
+		}
+	}
+
+	g_free (value);
+
+	if (data->in_text) {
+		/* The content buffer is only created once there is text to hold. */
+		if (G_UNLIKELY (data->content == NULL)) {
+			data->content = g_string_new ("");
+		}
+
+		g_string_append_len (data->content, text, text_len);
+	}
+
+	data->cur_tag = ABW_PARSER_TAG_UNHANDLED;
+}
+
+/* Callback table passed to g_markup_parse_context_new(); only element
+ * starts and text runs are handled.
+ */
+static GMarkupParser parser = {
+	abw_parser_start_elem, /* start_element */
+	NULL,                  /* end_element */
+	abw_parser_text,       /* text */
+	NULL,                  /* passthrough */
+	NULL                   /* error */
+};
+/* Extracts metadata and plain-text content from the Abiword document at
+ * uri: maps the file into memory, runs the GMarkup parser over it and,
+ * if parsing succeeds and text was collected, stores that text as
+ * nie:plainTextContent. Failures are reported with g_warning().
+ */
static void
extract_abw (const gchar *uri,
TrackerSparqlBuilder *preupdate,
TrackerSparqlBuilder *metadata)
{
- FILE *f;
- gchar *filename;
+ GMappedFile *file;
+ gchar *filename, *contents;
+ GError *error = NULL;
+ gsize len;
+
+ filename = g_filename_from_uri (uri, NULL, &error);
+
+ if (error) {
+ g_warning ("Could not get filename: %s\n", error->message);
+ g_error_free (error);
+ return;
+ }
- filename = g_filename_from_uri (uri, NULL, NULL);
- f = tracker_file_open (filename, "r", TRUE);
+ file = g_mapped_file_new (filename, FALSE, &error);
g_free (filename);
- if (f) {
- gchar *line;
- gsize length;
- gssize read_char;
+ if (error) {
+ g_warning ("Could not mmap abw file: %s\n", error->message);
+ g_error_free (error);
+ return;
+ }
- line = NULL;
- length = 0;
+ contents = g_mapped_file_get_contents (file);
+ len = g_mapped_file_get_length (file);
+
+ if (contents) {
+ GMarkupParseContext *context;
+ AbwParserData data = { 0 };
+
+ data.metadata = metadata;
+ data.preupdate = preupdate;
tracker_sparql_builder_predicate (metadata, "a");
tracker_sparql_builder_object (metadata, "nfo:Document");
- while ((read_char = tracker_getline (&line, &length, f)) != -1) {
- if (g_str_has_suffix (line, "</m>\n")) {
- line[read_char - 5] = '\0';
- }
-
- if (g_str_has_prefix (line, "<m key=\"dc.title\">")) {
- tracker_sparql_builder_predicate (metadata, "nie:title");
- tracker_sparql_builder_object_unvalidated (metadata, line + 18);
- } else if (g_str_has_prefix (line, "<m key=\"dc.subject\">")) {
- tracker_sparql_builder_predicate (metadata, "nie:subject");
- tracker_sparql_builder_object_unvalidated (metadata, line + 20);
- } else if (g_str_has_prefix (line, "<m key=\"dc.creator\">")) {
- tracker_sparql_builder_predicate (metadata, "nco:creator");
- tracker_sparql_builder_object_unvalidated (metadata, line + 20);
- } else if (g_str_has_prefix (line, "<m key=\"abiword.keywords\">")) {
- gchar *keywords = g_strdup (line + 26);
- char *lasts, *keyw;
-
- for (keyw = strtok_r (keywords, ",; ", &lasts); keyw;
- keyw = strtok_r (NULL, ",; ", &lasts)) {
- tracker_sparql_builder_predicate (metadata, "nie:keyword");
- tracker_sparql_builder_object_unvalidated (metadata, keyw);
- }
-
- g_free (keywords);
- } else if (g_str_has_prefix (line, "<m key=\"dc.description\">")) {
- tracker_sparql_builder_predicate (metadata, "nie:comment");
- tracker_sparql_builder_object_unvalidated (metadata, line + 24);
+ context = g_markup_parse_context_new (&parser, 0, &data, NULL);
+ g_markup_parse_context_parse (context, contents, len, &error);
+
+ if (error) {
+ g_warning ("Could not parse abw file: %s\n", error->message);
+ g_error_free (error);
+ } else if (data.content) {
+ tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+ tracker_sparql_builder_object_unvalidated (metadata, data.content->str);
}
- g_free (line);
- line = NULL;
- length = 0;
+
+ /* Text may have been collected before a parse error was hit, so the
+ * buffer is released on both the success and the error path.
+ */
+ if (data.content) {
+ g_string_free (data.content, TRUE);
}
- if (line) {
- g_free (line);
- }
-
- tracker_file_close (f, FALSE);
+
+ g_markup_parse_context_free (context);
}
+
+ g_mapped_file_unref (file);
}
TrackerExtractData *
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]