Yelp Searching without Beagle
- From: Don Scorgie <DonScorgie Blueyonder co uk>
- To: Doc Devel List <gnome-doc-devel-list gnome org>
- Subject: Yelp Searching without Beagle
- Date: Thu, 12 Jan 2006 00:48:38 +0000
Hi all,
For the last little while, I've been working on creating a fallback for
Yelp searching when Beagle is unavailable. This is now getting to the
point where I can share the first version of the code, attached.
It runs through all the documents registered with scrollkeeper
(excluding man and info pages).
In order to compile, try the configure options
./configure <everything else> --enable-search --enable-beagle=no
It takes around 1 - 1.5 seconds to run a full search on my box (YMMV).
A couple of points about it:
1. It is unbelievably stupid at the moment and will match any text
anywhere in the document, even if it's not normally visible to the user
(e.g. translator credits etc.). I intend to fix this as soon as I get a chance.
2. It's also stupid about search phrases. Just now, it only searches for
an exact phrase and won't match across elements. See below.
3. It should be separate from the beagle code *but* there is a small
section I changed (in the search2html stylesheet) that makes the
searched-for phrase bold.
4. All the results are (seemingly) in no order of importance. Again,
I'll try fixing up a scoring mechanism when I get the chance.
Now, the reason I'm posting to the list:
1. It's rapidly approaching feature freeze (this Sunday) and if this is
going to make 2.14, it's gotta be committed before that happens.
Otherwise, it'll have to wait for 2.16.
2. Any thoughts on the best way of doing the searching, out of the
following options?
- AND words together. This will only return a hit if a document contains
all the words entered by the user (not necessarily together).
- OR words together. This will return a result if one or more of the
words appear in the document.
3. Currently, it stops searching a document the first time its search
criteria are fulfilled (just now, the first time the search phrase is
found) and moves on to the next document. Is this the best course of
action, or would it be preferable to return more than one hit per document?
4. Please test the patch and report back. The more it's tested, the
better it will get.
Index: src/yelp-search-pager.c
RCS file: /cvs/gnome/yelp/src/yelp-search-pager.c,v
retrieving revision 1.6
diff -u -r1.6 yelp-search-pager.c
--- src/yelp-search-pager.c 6 Jan 2006 15:00:10 -0000 1.6
+++ src/yelp-search-pager.c 12 Jan 2006 00:25:16 -0000
@@ -24,6 +24,7 @@
#include <config.h>
+#include <sys/time.h>
#include <string.h>
#include <glib.h>
#include <glib/gi18n.h>
@@ -42,7 +43,9 @@
#include <libxslt/xsltInternals.h>
#include <libxslt/xsltutils.h>
#include <beagle/beagle.h>
+#endif /* ENABLE_BEAGLE */
#include "yelp-error.h"
#include "yelp-settings.h"
@@ -67,6 +70,8 @@
typedef struct _YelpListing YelpListing;
+typedef struct _SearchContainer SearchContainer;
struct _YelpSearchPagerPriv {
@@ -94,6 +99,54 @@
gboolean has_listings;
+struct _SearchContainer {
+ gboolean result_found;
+ gchar * current_subsection;
+ gchar * result_subsection;
+ gchar * doc_title;
+ gchar * base_path;
+ gchar * base_filename;
+ gchar * snippet;
+ GSList * components;
+ GHashTable *entities;
+ gchar * search_term;
+static gboolean
+do_timing (void)
+ static gboolean first_call = TRUE;
+ static struct timeval before;
+ static struct timeval after;
+ if (first_call) {
+ gettimeofday (&before, NULL);
+ first_call = FALSE;
+ } else {
+ gint sec=0, usec=0;
+ gettimeofday (&after, NULL);
+ sec = after.tv_sec - before.tv_sec;
+ if (after.tv_sec > before.tv_sec) {
+ if (after.tv_usec < before.tv_usec) {
+ usec = (1000000 - before.tv_usec) + after.tv_usec;
+ sec--;
+ } else usec = after.tv_usec - before.tv_usec;
+ } else usec = after.tv_usec - before.tv_usec;
+ g_print ("before : sec=%d, usec=%d\n", (int)before.tv_sec, (int)before.tv_usec);
+ g_print ("after : sec=%d, usec=%d\n", (int)after.tv_sec, (int)after.tv_usec);
+ g_print ("elapsed: sec=%d, usec=%d\n", (int)sec, (int)usec);
+ first_call = TRUE;
+ }
+ /* needed to indicate we are done processing */
+ return FALSE;
static void search_pager_class_init (YelpSearchPagerClass *klass);
static void search_pager_init (YelpSearchPager *pager);
static void search_pager_dispose (GObject *gobject);
@@ -118,9 +171,36 @@
static YelpPagerClass *parent_class;
+static void s_startElement (void *data,
+ const xmlChar *name,
+ const xmlChar **attrs);
+static void s_endElement (void *data,
+ const xmlChar *name);
+static void s_characters (void *data,
+ const xmlChar *ch,
+ int len);
+static void s_declEntity (void *data,
+ const xmlChar *name,
+ int type,
+ const xmlChar *pID,
+ const xmlChar *sID,
+ xmlChar *content);
+static xmlEntityPtr s_getEntity (void *data,
+ const xmlChar *name);
+static gboolean slow_search_setup (YelpSearchPager *pager);
+static gboolean slow_search_process (YelpSearchPager *pager);
+static void search_parse_result (YelpSearchPager *pager,
+ SearchContainer *c);
+static gchar * search_clean_snippet (gchar *snippet,
+ gchar *terms);
static BeagleClient *beagle_client;
+#endif /* ENABLE_BEAGLE */
static char const * const * langs;
+static GSList * pending_searches = NULL;
yelp_search_pager_get_type (void)
@@ -153,8 +233,10 @@
parent_class = g_type_class_peek_parent (klass);
beagle_client = beagle_client_new (NULL);
d(g_print ("client: %p\n", beagle_client);)
+#endif /* ENABLE_BEAGLE */
langs = g_get_language_names ();
@@ -305,6 +387,7 @@
d (g_print ("search_pager_process\n"));
if (beagle_client == NULL) {
GError *error = NULL;
g_set_error (&error, YELP_ERROR, YELP_ERROR_PROC,
@@ -313,6 +396,7 @@
yelp_pager_error (YELP_PAGER (pager), error);
return FALSE;
+#endif /* ENABLE_BEAGLE */
yelp_pager_set_state (pager, YELP_PAGER_STATE_PARSING);
g_signal_emit_by_name (pager, "parse");
@@ -343,7 +427,7 @@
static void
check_finished (YelpSearchPager *pager)
@@ -543,19 +627,23 @@
check_finished (pager);
+#endif /* ENABLE_BEAGLE */
static gboolean
search_pager_process_idle (YelpSearchPager *pager)
BeagleQuery *query;
- YelpSearchPagerPriv *priv = YELP_SEARCH_PAGER (pager)->priv;
GError *error = NULL;
+#endif /* ENABLE_BEAGLE */
+ YelpSearchPagerPriv *priv = YELP_SEARCH_PAGER (pager)->priv;
priv->search_doc = xmlNewDoc (BAD_CAST "1.0");
priv->root = xmlNewNode (NULL, BAD_CAST "search");
xmlSetProp (priv->root, BAD_CAST "title", BAD_CAST priv->search_terms);
xmlDocSetRootElement (priv->search_doc, priv->root);
query = beagle_query_new ();
beagle_query_set_max_hits (query, 10000);
@@ -584,6 +672,13 @@
g_clear_error (&error);
+#endif /* ENABLE_BEAGLE */
+ do_timing ();
+ gtk_idle_add ((GtkFunction) slow_search_setup,
+ pager);
return FALSE;
@@ -600,7 +695,7 @@
GtkIconInfo *info;
GtkIconTheme *theme = (GtkIconTheme *) yelp_settings_get_icon_theme ();
- d(xmlDocFormatDump(stdout, priv->search_doc, 1));
+ d (xmlDocFormatDump(stdout, priv->search_doc, 1));
priv->stylesheet = xsltParseStylesheetFile (BAD_CAST SEARCH_STYLESHEET);
if (!priv->stylesheet) {
@@ -674,6 +769,7 @@
g_signal_emit_by_name (pager, "finish");
+ do_timing ();
return FALSE;
@@ -795,4 +891,408 @@
xmlFreeDoc (new_doc);
if (style)
xsltFreeStylesheet (style);
+static gboolean sk_docomf = FALSE;
+static GSList *omf_pending = NULL;
+static void
+sk_startElement (void *empty, const xmlChar *name,
+ const xmlChar **attrs)
+ if (xmlStrEqual((const xmlChar*) name, BAD_CAST "docomf"))
+ sk_docomf = TRUE;
+static void
+sk_endElement (void *empty, const xmlChar *name)
+ if (xmlStrEqual((const xmlChar*) name, BAD_CAST "docomf"))
+ sk_docomf = FALSE;
+static void
+sk_characters (void *empty, const xmlChar *ch,
+ int len)
+ gchar *omf;
+ if (sk_docomf) {
+ omf = g_strndup ((gchar *) ch, len);
+ omf_pending = g_slist_prepend (omf_pending, omf);
+ }
+void s_startElement(void *data,
+ const xmlChar * name,
+ const xmlChar ** attrs)
+ SearchContainer *c = (SearchContainer *) data;
+ if (attrs) {
+ gint i=0;
+ while (attrs[i]) {
+ if (g_str_equal (attrs[i], "id")) {
+ g_free (c->current_subsection);
+ c->current_subsection = g_strdup ((gchar *) attrs[i+1]);
+ }
+ i+=2;
+ }
+ }
+ return;
+void s_endElement(void * data,
+ const xmlChar * name)
+ return;
+void s_characters(void * data,
+ const xmlChar * ch,
+ int len)
+ SearchContainer *c = (SearchContainer *) data;
+ if (!c->result_found) {
+ gchar *tmp = g_utf8_casefold ((gchar *) ch, len);
+ if (strstr (tmp, c->search_term)) {
+ c->result_found = TRUE;
+ c->snippet = g_strndup (g_utf8_casefold ((gchar *) ch, len),
+ len);
+ c->result_subsection = g_strdup (c->current_subsection);
+ }
+ g_free (tmp);
+ }
+ return;
+void s_declEntity (void *data, const xmlChar *name, int type,
+ const xmlChar *pID, const xmlChar *sID,
+ xmlChar *content)
+ SearchContainer *c = (SearchContainer *) data;
+ if (type == 2) {
+ g_hash_table_insert (c->entities,
+ g_strdup ((gchar *) name),
+ g_strdup ((gchar *) sID));
+ }
+ return;
+xmlEntityPtr s_getEntity (void *data, const xmlChar *name)
+ SearchContainer *c = (SearchContainer *) data;
+ xmlEntityPtr t = xmlGetPredefinedEntity(name);
+ if (!t) {
+ gchar * lookup = g_hash_table_lookup (c->entities, name);
+ if (lookup) {
+ c->components = g_slist_append (c->components,
+ g_strconcat (c->base_path,
+ "/",
+ lookup, NULL));
+ }
+ }
+ return t;
+static xmlSAXHandler handlers = {
+ s_getEntity,
+ s_declEntity, NULL,
+ s_startElement, s_endElement, NULL, s_characters,
+/* Parse the omfs and build the list of files to be searched */
+static gboolean
+slow_search_setup (YelpSearchPager *pager)
+ gchar *content_list;
+ gchar *stderr_str;
+ gchar *lang;
+ gchar *command;
+ static xmlSAXHandler sk_sax_handler = { 0, };
+ xmlParserCtxtPtr parser;
+ if (langs && langs[0])
+ lang = (gchar *) langs[0];
+ else
+ lang = "C";
+ command = g_strconcat("scrollkeeper-get-content-list ", lang, NULL);
+ if (g_spawn_command_line_sync (command, &content_list, &stderr_str, NULL, NULL)) {
+ if (!sk_sax_handler.startElement) {
+ sk_sax_handler.startElement = sk_startElement;
+ sk_sax_handler.endElement = sk_endElement;
+ sk_sax_handler.characters = sk_characters;
+ sk_sax_handler.initialized = TRUE;
+ }
+ content_list = g_strstrip (content_list);
+ xmlSAXUserParseFile (&sk_sax_handler, NULL, content_list);
+ }
+ parser = xmlNewParserCtxt ();
+ g_free (content_list);
+ g_free (stderr_str);
+ g_free (command);
+ while (omf_pending) {
+ GSList *first = NULL;
+ gchar *file = NULL;
+ xmlDocPtr omf_doc = NULL;
+ xmlXPathContextPtr omf_xpath = NULL;
+ xmlXPathObjectPtr omf_url = NULL;
+ xmlXPathObjectPtr omf_title = NULL;
+ SearchContainer *container;
+ gchar *ptr;
+ gchar *path;
+ gchar *fname;
+ gchar *realfname;
+ first = omf_pending;
+ omf_pending = g_slist_remove_link (omf_pending, first);
+ file = (gchar *) first->data;
+ omf_doc = xmlCtxtReadFile (parser, (const char *) file, NULL,
+ if (!omf_doc) {
+ g_warning (_("Could not load the OMF file '%s'."), file);
+ continue;
+ }
+ omf_xpath = xmlXPathNewContext (omf_doc);
+ omf_url =
+ xmlXPathEvalExpression (BAD_CAST
+ "string(/omf/resource/identifier/@url)",
+ omf_xpath);
+ omf_title =
+ xmlXPathEvalExpression (BAD_CAST
+ "string(/omf/resource/title)",
+ omf_xpath);
+ fname = g_strdup ((gchar *) omf_url->stringval);
+ if (g_str_has_prefix (fname, "file:")) {
+ realfname = &fname[5];
+ } else {
+ realfname = fname;
+ }
+ if (!g_file_test (realfname, G_FILE_TEST_EXISTS)) {
+ continue;
+ }
+ container = g_new0 (SearchContainer, 1);
+ container->result_found = FALSE;
+ container->base_filename = g_strdup (realfname);
+ container->entities = g_hash_table_new (g_str_hash, g_str_equal);
+ container->doc_title = g_strdup ((gchar *) omf_title->stringval);
+ ptr = g_strrstr (container->base_filename, "/");
+ path = g_strndup (container->base_filename,
+ ptr - container->base_filename);
+ container->base_path = g_strdup (path);
+ container->search_term = g_utf8_casefold (pager->priv->search_terms,
+ -1);
+ pending_searches = g_slist_prepend (pending_searches, container);
+ g_free (fname);
+ g_free (path);
+ if (omf_url)
+ xmlXPathFreeObject (omf_url);
+ if (omf_title)
+ xmlXPathFreeObject (omf_title);
+ if (omf_xpath)
+ xmlXPathFreeContext (omf_xpath);
+ if (omf_doc)
+ xmlFreeDoc (omf_doc);
+ }
+ gtk_idle_add ((GtkFunction) slow_search_process,
+ pager);
+ if (parser)
+ xmlFreeParserCtxt (parser);
+ return FALSE;
+static gboolean
+slow_search_process (YelpSearchPager *pager)
+ SearchContainer *c;
+ GSList *first = pending_searches;
+ pending_searches = g_slist_remove_link (pending_searches, first);
+ c = (SearchContainer *) first->data;
+ xmlSAXUserParseFile (&handlers, c, c->base_filename);
+ if (c->result_found) {
+ search_parse_result (pager, c);
+ } else while (c->components) {
+ GSList *next = c->components;
+ c->components = g_slist_remove_link (c->components, next);
+ xmlSAXUserParseFile (&handlers, c, (gchar *) next->data);
+ if (c->result_found) {
+ search_parse_result (pager, c);
+ break;
+ }
+ }
+ /* Cleanup the container and delete it */
+ g_free (c->current_subsection);
+ g_free (c->result_subsection);
+ g_free (c->doc_title);
+ g_free (c->base_path);
+ g_free (c->base_filename);
+ g_free (c->snippet);
+ g_hash_table_destroy (c->entities);
+ g_free (c->search_term);
+ g_free (c);
+ if (pending_searches)
+ return TRUE;
+ else {
+ gtk_idle_add_priority (G_PRIORITY_LOW,
+ (GtkFunction) process_xslt,
+ pager);
+ return FALSE;
+ }
+gchar *
+search_clean_snippet (gchar *snippet, gchar *terms)
+ /* This is probably what you want to change */
+ gint len_before_term = 47;
+ gint len_after_term = 37;
+ gchar *result = NULL;
+ gchar **before = NULL;
+ gchar *after = NULL;
+ gchar *tmp = NULL;
+ gchar * before_string;
+ gchar * after_string;
+ gchar *t;
+ gint blen, alen;
+ before = g_strsplit (snippet, terms, 2);
+ if (g_strv_length (before) !=2) {
+ /* The portion before or after the snippet is missing
+ * so we make it up as ''
+ */
+ gchar *tmp = g_strdup (before[0]);
+ g_strfreev (before);
+ before = g_new0 (gchar *, 2);
+ if (g_str_has_prefix (snippet, terms)) {
+ before[0] = g_strdup ("");
+ before[1] = g_strdup (tmp);
+ } else {
+ before[0] = g_strdup (tmp);
+ before[1] = g_strdup ("");
+ }
+ g_free (tmp);
+ }
+ /* Now, strip the shite from the begining and end */
+ after = before[0];
+ tmp = before[1]+strlen(before[1])-1;
+ while (!g_ascii_isalnum (*after) && after != before[0]+strlen (before[0]))
+ after++;
+ while (!g_ascii_isalnum (*tmp) && tmp !=before[1]) {
+ tmp--;
+ }
+ tmp++;
+ before_string = g_strdup (after);
+ if (tmp-before[1] > 0) {
+ after_string = g_strndup (before[1], tmp-before[1]);
+ } else {
+ after_string = g_strdup (before[1]);
+ }
+ /* Now, reduce it to a managable size */
+ blen = strlen (before_string);
+ alen = strlen (after_string);
+ if (blen > len_before_term) {
+ gchar *tmp = before_string + strlen(before_string) - len_before_term;
+ t = g_strndup (tmp, len_before_term);
+ g_free (before_string);
+ before_string = g_strconcat ("...", t, NULL);
+ g_free (t);
+ }
+ if (alen > len_after_term) {
+ t = g_strndup (after_string, len_after_term);
+ g_free (after_string);
+ after_string = g_strconcat (t, "...", NULL);
+ g_free (t);
+ }
+ /* Finally, piece all these together into a complete snippet
+ * with markup and everything
+ */
+ result = g_strconcat (before_string, "<b>", terms, "</b>",
+ after_string, NULL);
+ g_free (before_string);
+ g_free (after_string);
+ g_strfreev (before);
+ return result;
+search_parse_result (YelpSearchPager *pager, SearchContainer *c)
+ xmlNode *child;
+ gchar *new_uri;
+ xmlDoc *snippet_doc;
+ xmlNode *node;
+ char *xmldoc;
+ new_uri = g_strconcat (c->base_filename, "#", c->result_subsection,
+ NULL);
+ child = xmlNewTextChild (pager->priv->root, NULL,
+ BAD_CAST "result", NULL);
+ xmlSetProp (child, BAD_CAST "uri", BAD_CAST new_uri);
+ xmlSetProp (child, BAD_CAST "title", BAD_CAST g_strstrip (c->doc_title));
+ /* Fix up the snippet to show the terms in bold */
+ xmldoc = g_strdup_printf ("<snippet>%s</snippet>",
+ search_clean_snippet (c->snippet, c->search_term));
+ snippet_doc = xmlParseDoc (BAD_CAST xmldoc);
+ g_free (xmldoc);
+ if (!snippet_doc)
+ return;
+ node = xmlDocGetRootElement (snippet_doc);
+ xmlUnlinkNode (node);
+ xmlAddChild (child, node);
+ xmlFreeDoc (snippet_doc);
Index: stylesheets/search2html.xsl
RCS file: /cvs/gnome/yelp/stylesheets/search2html.xsl,v
retrieving revision 1.2
diff -u -r1.2 search2html.xsl
--- stylesheets/search2html.xsl 1 Nov 2005 20:05:58 -0000 1.2
+++ stylesheets/search2html.xsl 12 Jan 2006 00:25:16 -0000
@@ -158,6 +158,8 @@
<xsl:template match="b">
- <xsl:apply-templates/>
+ <strong>
+ <xsl:apply-templates/>
+ </strong>
RCS file: /cvs/gnome/yelp/,v
retrieving revision 1.199
diff -u -r1.199
--- 7 Jan 2006 01:23:52 -0000 1.199
+++ 12 Jan 2006 00:25:16 -0000
@@ -124,25 +124,38 @@
- [turn on beagle search support [default=auto]])],,
+ [turn on search support [default=auto]])],,
+ [AC_HELP_STRING([--enable-beagle],
+ [use Beagle to search [default=no]])],,
+ enable_beagle=auto)
-if test "x$enable_search" = "xyes"; then
+if test "x$enable_beagle" = "xyes"; then
-elif test "x$enable_search" = "xno"; then
+elif test "x$enable_beagle" = "xno"; then
- enable_search=yes,enable_search=no)
+ enable_beagle=yes,enable_beagle=no)
if test "x$enable_search" = "xyes"; then
- AC_DEFINE(ENABLE_SEARCH, 1, [turn on beagle search support])
+ AC_DEFINE(ENABLE_SEARCH, 1, [turn on search support])
+if test "x$enable_beagle" = "xyes"; then
+ AC_DEFINE(ENABLE_BEAGLE, 1, [Use beagle for searching])
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
Thread Index]
Date Index]
Author Index]