Index: tracker-extract.c
===================================================================
--- tracker-extract.c	(revision 542)
+++ tracker-extract.c	(working copy)
@@ -52,6 +52,7 @@
 void tracker_extract_totem (gchar *, GHashTable *);
 void tracker_extract_oasis (gchar *, GHashTable *);
 void tracker_extract_ps (gchar *, GHashTable *);
+void tracker_extract_manpage (gchar *, GHashTable *);
 #ifdef HAVE_POPPLER
 void tracker_extract_pdf (gchar *, GHashTable *);
 #endif
@@ -84,6 +85,7 @@
 	/* Document extractors */
 	{ "application/vnd.oasis.opendocument.*", tracker_extract_oasis },
 	{ "application/postscript", tracker_extract_ps },
+	{ "text/troff", tracker_extract_manpage },
 #ifdef HAVE_POPPLER
 	{ "application/pdf", tracker_extract_pdf },
 #endif
Index: tracker-extract-manpage.c
===================================================================
--- tracker-extract-manpage.c	(revision 0)
+++ tracker-extract-manpage.c	(revision 0)
@@ -0,0 +1,204 @@
+/* Tracker Extract - extracts embedded metadata from manpages
+ * Copyright (C) 2007, Michael Frank (msfrank syntaxjockey com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+
+#include "config.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <glib.h>
+#include <glib/gstdio.h>
+
+/* Characters that separate the arguments of a roff macro line */
+#define ROFF_WHITESPACE " \t\v\r\f\n"
+
+/* Doc:Subject values for manual sections 1-9, indexed by section - 1.
+ * TODO: these strings need to be translated */
+static const gchar * const section_subjects[] = {
+	"Executable Programs And Shell Commands",
+	"System Calls",
+	"Library Calls",
+	"Special Files",
+	"File Formats And Conventions",
+	"Games",
+	"Miscellaneous",
+	"System Administration Commands",
+	"Kernel Routines"
+};
+
+/* Returns TRUE if line invokes the roff macro named macro (e.g. ".TH"):
+ * the name must be followed by whitespace or the end of the line, so
+ * that ".SH" does not match ".SHx". */
+static gboolean
+is_macro (const gchar *line, const gchar *macro)
+{
+	gsize len = strlen (macro);
+
+	if (strncmp (line, macro, len) != 0)
+		return FALSE;
+	return line[len] == '\0' || strchr (ROFF_WHITESPACE, line[len]) != NULL;
+}
+
+/* Splits the next macro argument off the text at *cursor.
+ *
+ * An argument is a run of non-whitespace characters or a double-quoted
+ * string, which may contain whitespace. The argument is NUL-terminated
+ * in place, without its quotes, and *cursor is moved past it. Returns
+ * NULL when the line holds no further argument. */
+static gchar *
+next_macro_arg (gchar **cursor)
+{
+	gchar *arg, *end;
+
+	arg = *cursor + strspn (*cursor, ROFF_WHITESPACE);
+	if (*arg == '\0')
+		return NULL;
+
+	if (*arg == '"') {
+		arg++;
+		end = strchr (arg, '"');
+		if (end == NULL) {
+			/* unterminated quote: the argument runs to the end of the line */
+			g_strchomp (arg);
+			end = arg + strlen (arg);
+		}
+	} else {
+		end = arg + strcspn (arg, ROFF_WHITESPACE);
+	}
+
+	if (*end != '\0')
+		*end++ = '\0';
+	*cursor = end;
+
+	return arg;
+}
+
+/* Stores the arguments of a ".TH title section [date ...]" line:
+ * Doc:Title, Man:Section and, for the numbered sections 1-9, Doc:Subject.
+ * args points just past the macro name and is modified in place.
+ *
+ * The date argument is deliberately ignored: its format is fuzzy and
+ * non-standard. */
+static void
+parse_title_line (gchar *args, GHashTable *metadata)
+{
+	gchar *title, *section, *end_ptr = NULL;
+	guint64 section_num;
+
+	title = next_macro_arg (&args);
+	if (title == NULL)
+		return;
+	if (*title != '\0')
+		g_hash_table_insert (metadata, g_strdup ("Doc:Title"), g_strdup (title));
+
+	section = next_macro_arg (&args);
+	if (section == NULL || *section == '\0')
+		return;
+	g_hash_table_insert (metadata, g_strdup ("Man:Section"), g_strdup (section));
+
+	/* sections such as "3pm" still map to the subject of section 3 */
+	section_num = g_ascii_strtoull (section, &end_ptr, 10);
+	if (end_ptr > section && section_num >= 1 &&
+	    section_num <= G_N_ELEMENTS (section_subjects))
+		g_hash_table_insert (metadata, g_strdup ("Doc:Subject"),
+		                     g_strdup (section_subjects[section_num - 1]));
+}
+
+/* Returns the text following the "\-" that separates the page names from
+ * the one-line description in a NAME section, or text itself when there
+ * is no separator. A "\-" preceded by whitespace is preferred, because
+ * generated pages also escape hyphens inside names ("git\-commit"). */
+static const gchar *
+find_description (const gchar *text)
+{
+	const gchar *sep;
+
+	for (sep = strstr (text, "\\-"); sep != NULL; sep = strstr (sep + 2, "\\-")) {
+		if (sep == text || strchr (ROFF_WHITESPACE, sep[-1]) != NULL)
+			return sep + 2;
+	}
+
+	sep = strstr (text, "\\-");
+	return sep != NULL ? sep + 2 : text;
+}
+
+/* Reads the body of the NAME section, up to the next ".SH" line or the
+ * end of the file, and stores its description part, stripped of the
+ * surrounding whitespace, as Doc:Description. buffer is scratch space of
+ * size bytes; the line that ends the section is consumed. */
+static void
+parse_name_section (FILE *fp, gchar *buffer, gint size, GHashTable *metadata)
+{
+	GString *text = g_string_new (NULL);
+	gchar *desc;
+
+	while (fgets (buffer, size, fp)) {
+		if (is_macro (buffer, ".SH"))
+			break;
+		g_string_append (text, buffer);
+	}
+
+	desc = g_strstrip (g_strdup (find_description (text->str)));
+	if (*desc != '\0')
+		g_hash_table_insert (metadata, g_strdup ("Doc:Description"), desc);
+	else
+		g_free (desc);
+	g_string_free (text, TRUE);
+}
+
+/* Extracts metadata from the troff source of a man page.
+ *
+ * filename: path of the man page to read
+ * metadata: table that receives newly allocated key/value strings:
+ *           Doc:Title and Man:Section (from ".TH"), Doc:Subject (for
+ *           sections 1-9) and Doc:Description (from the NAME section)
+ *
+ * Files that cannot be opened are skipped silently. */
+void
+tracker_extract_manpage (gchar *filename, GHashTable *metadata)
+{
+	FILE *fp;
+	gchar buffer[256];
+	gboolean parsed_title = FALSE, parsed_name_section = FALSE;
+
+	fp = g_fopen (filename, "r");
+	if (fp == NULL)
+		return;
+
+	/* stop reading as soon as both pieces have been found */
+	while (!(parsed_title && parsed_name_section) &&
+	       fgets (buffer, sizeof (buffer), fp)) {
+		if (!parsed_title && is_macro (buffer, ".TH")) {
+			parse_title_line (buffer + 3, metadata);
+			parsed_title = TRUE;
+		} else if (!parsed_name_section && is_macro (buffer, ".SH")) {
+			gchar *args = buffer + 3;
+			gchar *heading = next_macro_arg (&args);
+
+			if (heading != NULL && strcmp (heading, "NAME") == 0) {
+				parse_name_section (fp, buffer, sizeof (buffer), metadata);
+				parsed_name_section = TRUE;
+			}
+		}
+	}
+
+	if (ferror (fp))
+		g_debug ("error parsing manpage '%s'", filename);
+	fclose (fp);
+}
Index: Makefile.am
===================================================================
--- Makefile.am	(revision 542)
+++ Makefile.am	(working copy)
@@ -33,6 +33,7 @@
 	tracker-extract-imagemagick.c \
 	tracker-extract-mplayer.c \
 	tracker-extract-totem.c \
+	tracker-extract-manpage.c \
 	$(video_sources)
 
 tracker_extract_LDADD = $(GLIB2_LIBS) \