[Tracker] Patch for extracting HTML information




Two problems:

1. HTML tags are case-insensitive, so the code should use strcasecmp instead of strcmp.

2. It core dumps on HTML pages that contain tags without attributes, such as
"<a>abc</a>" or "<meta>".


Patch attached.


Index: src/tracker-extract/tracker-extract-html.c
===================================================================
--- src/tracker-extract/tracker-extract-html.c  (revision 724)
+++ src/tracker-extract/tracker-extract-html.c  (working copy)
@@ -37,11 +37,14 @@
 gboolean
 has_attribute( const xmlChar ** atts, const char *attr, const char*val )
 {
+        if (atts == NULL || attr == NULL || val == NULL)
+        return FALSE;
+
        int i;
        for ( i = 0; atts[i]; i+=2 )
        {
-               if ( strcmp((char*)atts[i],attr) == 0 ) {
-                       if ( !val || strcmp((char*)atts[i+1],val) == 0 ) {
+               if ( strcasecmp((char*)atts[i],attr) == 0 ) {
+                       if ( !val || strcasecmp((char*)atts[i+1],val) == 0 ) {
                                return TRUE;
                        }
                }
@@ -55,7 +58,7 @@
        int i;
        for ( i = 0; atts[i]; i+=2 )
        {
-               if ( strcmp((char*)atts[i],attr) == 0 ) {
+               if ( strcasecmp((char*)atts[i],attr) == 0 ) {
                        return atts[i+1];
                }
        }
@@ -67,7 +70,7 @@
 startElement (void * info, const xmlChar * name, const xmlChar ** atts)
 {
        /* Look for RDFa triple describing the license */
-       if ( strcmp((char*)name,"a") == 0 ) {
+       if ( strcasecmp((char*)name,"a") == 0 ) {
                /* This tag is a license.  Ignore, however, if it is referring to another document */
                if ( has_attribute(atts,"rel","license") && !has_attribute(atts,"about",NULL) ) {
                        const xmlChar *href = lookup_attribute(atts,"href");
@@ -76,9 +79,9 @@
                                                     g_strdup( (char*)href ));
                        }
                }
-       } else if ( strcmp((char*)name,"title") == 0 ) {
+       } else if ( strcasecmp((char*)name,"title") == 0 ) {
                ((HTMLParseInfo *)info)->current = READ_TITLE;
-       } else if ( strcmp((char*)name,"meta") == 0 ) {
+       } else if ( strcasecmp((char*)name,"meta") == 0 ) {
                if ( has_attribute(atts,"name","Author") ) {
                        const xmlChar *author = lookup_attribute(atts,"content");
                        if ( author ) {


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]