[bijiben] Rework serialization format to be XHTML

From: Giovanni Campagna <gcampagna src gnome org>
To: commits-list gnome org
Cc:
Subject: [bijiben] Rework serialization format to be XHTML
Date: Wed, 2 Jan 2013 23:40:53 +0000 (UTC)
commit 817a1853591e8f521bc8cf3796b6fa709559f366
Author: Giovanni Campagna <gcampagna src gnome org>
Date:   Sat Dec 29 17:56:24 2012 +0100

    Rework serialization format to be XHTML
    
    Trying to parse HTML using XML tools is hacky and prone to fail in
    spectacular ways. And when it fails, we risk losing data, which is very
    bad.
    Instead, we can make WebKit give us XML and store it directly, using XHTML.
    The downside of this is that we need to give WebKit the full document
    (including <html xmlns="..."><body>...). The upside is that, as we no
    longer parse the note content, anything that can be represented in XHTML
    (including images, forms, scripts, plugins...) can now be stored in a
    note.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=690860

 src/libbiji/biji-note-book.c                      |   14 ++-
 src/libbiji/deserializer/biji-lazy-deserializer.c |  210 ++++++---------------
 src/libbiji/editor/biji-webkit-editor.c           |   29 ++--
 src/libbiji/serializer/biji-lazy-serializer.c     |  172 +-----------------
 4 files changed, 81 insertions(+), 344 deletions(-)
---
diff --git a/src/libbiji/biji-note-book.c b/src/libbiji/biji-note-book.c
index 345cb2a..bbacdc2 100644
--- a/src/libbiji/biji-note-book.c
+++ b/src/libbiji/biji-note-book.c
@@ -487,19 +487,29 @@ biji_note_book_get_new_note_from_string (BijiNoteBook *book,
   return ret;
 }
 
+static char*
+wrap_note_content (char *content)
+{
+  return g_strdup_printf("<html xmlns=\"http://www.w3.org/1999/xhtml\"><body>%s</body></html>", content);
+}
+
+
 BijiNoteObj *
 biji_note_book_new_note_with_text (BijiNoteBook *book,
                                    gchar *plain_text)
 {
   BijiNoteObj *ret = get_note_skeleton (book);
   gchar *unique_title = biji_note_book_get_unique_title (book, DEFAULT_NOTE_TITLE);
+  gchar *html;
 
   /* Note will copy title, raw_text and html strings */
   biji_note_obj_set_title (ret, unique_title);
   g_free (unique_title);
 
-  biji_note_obj_set_raw_text (ret, g_strdup (plain_text));
-  biji_note_obj_set_html_content (ret, plain_text);
+  biji_note_obj_set_raw_text (ret, plain_text);
+  html = wrap_note_content (plain_text);
+  biji_note_obj_set_html_content (ret, html);
+  g_free (html);
 
   biji_note_obj_save_note (ret);
   biji_note_book_append_new_note (book, ret, TRUE);
diff --git a/src/libbiji/deserializer/biji-lazy-deserializer.c b/src/libbiji/deserializer/biji-lazy-deserializer.c
index f987c30..20ab2db 100644
--- a/src/libbiji/deserializer/biji-lazy-deserializer.c
+++ b/src/libbiji/deserializer/biji-lazy-deserializer.c
@@ -59,13 +59,14 @@ struct _BijiLazyDeserializerPrivate
   BijiXmlType type;
   xmlTextReaderPtr r;
 
-  /* Reader for internal content, either tomboy XML or Bijiben html */
+  /* Reader for internal content, either tomboy html or Bijiben xhtml */
   xmlTextReaderPtr inner;
   gchar *content;
 
   /* Result for both raw_text & html */
   GString *raw_text;
   GString *html;
+  gboolean seen_content;
 };
 
 static void
@@ -283,6 +284,8 @@ process_tomboy_xml_content (BijiLazyDeserializer *self)
   int ret;
   gchar *revamped_html;
 
+  g_string_append (priv->html, "<html xmlns=\"http://www.w3.org/1999/xhtml\"><body>");
+
   priv->inner = xmlReaderForMemory (priv->content,
                                     strlen(priv->content),
                                     "", "UTF-8", 0);
@@ -290,12 +293,14 @@ process_tomboy_xml_content (BijiLazyDeserializer *self)
   ret = xmlTextReaderRead (priv->inner);
 
   /* Make the GString grow as we read */
-  while (ret == 1) 
+  while (ret == 1)
   {
     process_tomboy_node (self);
     ret = xmlTextReaderRead (priv->inner);
   }
 
+  g_string_append (priv->html, "</body></html>");
+
   /* Now the inner content is known, we can
    * assign note values and let deserialization work on last elements*/
   biji_note_obj_set_raw_text (priv->note, priv->raw_text->str);
@@ -315,133 +320,35 @@ process_bijiben_start_elem (BijiLazyDeserializer *self)
 
   element_name = (const gchar *) xmlTextReaderConstName(priv->inner);
 
-  if (g_strcmp0 (element_name, "note-content")==0)
-    return;
-
-  if (g_strcmp0 (element_name, "div")==0)
-    priv->html = g_string_append (priv->html, "&#xA;");
-
-  if (g_strcmp0 (element_name, "br")==0)
-  {
-    priv->html = g_string_append (priv->html, "<br/>");
-    priv->raw_text = g_string_append (priv->raw_text, "\n");
+  /* Block level elements introduce a new line, except that blocks
+     at the very beginning of their parent don't, and <br/> at the
+     end of a block causes the next block to skip the new line.
+     list-item elements add also a bullet at the beginning.
+
+     These are the only block elements we produce and therefore
+     support. If you manage to introduce more (eg. by copy-pasting),
+     you accept that the result may not be faithful.
+
+     TODO: use webkit_web_view_get_snapshot() instead of showing
+     the raw text content in the main view.
+  */
+  if (g_strcmp0 (element_name, "br") == 0) {
+    g_string_append (priv->raw_text, "\n");
+    priv->seen_content = FALSE;
   }
 
-  if (g_strcmp0 (element_name, "b")==0)
-    priv->html = g_string_append (priv->html, "<b>");
-
-  if (g_strcmp0 (element_name, "i")==0)
-    priv->html = g_string_append (priv->html, "<i>");
-
-  if (g_strcmp0 (element_name, "strike")==0)
-    priv->html = g_string_append (priv->html, "<strike>");
-
-  /* Lists */
-
-  if (g_strcmp0 (element_name, "ul")==0)
-    priv->html = g_string_append (priv->html, "<ul>");
-
-  if (g_strcmp0 (element_name, "ol")==0)
-    priv->html = g_string_append (priv->html, "<ol>");
-
-  if (g_strcmp0 (element_name, "li")==0)
-  {
-    priv->html = g_string_append (priv->html, "<li>");
+  if (priv->seen_content &&
+      (g_strcmp0 (element_name, "div") == 0 ||
+       g_strcmp0 (element_name, "br") == 0 ||
+       g_strcmp0 (element_name, "ul") == 0 ||
+       g_strcmp0 (element_name, "ol") == 0 ||
+       g_strcmp0 (element_name, "li") == 0)) {
+    g_string_append (priv->raw_text, "\n");
+    priv->seen_content = FALSE;
   }
 
-  /* Links: Images */
-
-  if (g_strcmp0 (element_name, "img")==0)
-  {
-    priv->html = g_string_append (priv->html, "<img ");
-
-    xmlChar *attribute;
-
-    attribute = xmlTextReaderGetAttribute (priv->inner, BAD_CAST "id");
-    if (attribute)
-    {
-      priv->html = g_string_append (priv->html, "id=\"");
-      priv->html = g_string_append (priv->html, (gchar*) attribute);
-      priv->html = g_string_append (priv->html, "\"");
-      xmlFree (attribute);
-      attribute = NULL;
-    }
-
-    attribute = xmlTextReaderGetAttribute (priv->inner, BAD_CAST "width");
-    if (attribute)
-    {
-      priv->html = g_string_append (priv->html, "width=\"");
-      priv->html = g_string_append (priv->html, (gchar*) attribute);
-      priv->html = g_string_append (priv->html, "\"");
-      xmlFree (attribute);
-      attribute = NULL;
-    }
-
-    attribute = xmlTextReaderGetAttribute (priv->inner, BAD_CAST "height");
-    if (attribute)
-    {
-      priv->html = g_string_append (priv->html, "height=\"");
-      priv->html = g_string_append (priv->html, (gchar*) attribute);
-      priv->html = g_string_append (priv->html, "\"");
-      xmlFree (attribute);
-      attribute = NULL;
-    }
-
-    attribute = xmlTextReaderGetAttribute (priv->inner, BAD_CAST "src");
-    if (attribute)
-    {
-      priv->html = g_string_append (priv->html, "src=\"");
-      priv->html = g_string_append (priv->html, (gchar*) attribute);
-      priv->html = g_string_append (priv->html, "\"");
-      xmlFree (attribute);
-      attribute = NULL;
-    }
-
-    attribute = xmlTextReaderGetAttribute (priv->inner, BAD_CAST "alt");
-    if (attribute)
-    {
-      priv->html = g_string_append (priv->html, "alt=\"");
-      priv->html = g_string_append (priv->html, (gchar*) attribute);
-      priv->html = g_string_append (priv->html, "\"");
-      xmlFree (attribute);
-      attribute = NULL;
-    }
-
-    priv->html = g_string_append (priv->html, ">");
-  }
-}
-
-static void
-process_bijiben_end_elem (BijiLazyDeserializer *self)
-{
-  BijiLazyDeserializerPrivate *priv = self->priv;
-  const gchar *element_name;
-  
-  element_name = (const gchar *) xmlTextReaderConstName (priv->inner);
-
-  if (g_strcmp0 (element_name, "note-content")==0)
-    return;
-
-  if (g_strcmp0 (element_name, "b")==0)
-    priv->html = g_string_append (priv->html, "</b>");
-
-  if (g_strcmp0 (element_name, "i")==0)
-    priv->html = g_string_append (priv->html, "</i>");
-
-  if (g_strcmp0 (element_name, "strike")==0)
-    priv->html = g_string_append (priv->html, "</strike>");
-
-  if (g_strcmp0 (element_name, "ul")==0)
-    priv->html = g_string_append (priv->html, "</ul>");
-
-  if (g_strcmp0 (element_name, "ol")==0)
-    priv->html = g_string_append (priv->html, "</ol>");
-
-  if (g_strcmp0 (element_name, "li")==0)
-  {
-    priv->html = g_string_append (priv->html, "</li>");
-    priv->raw_text = g_string_append (priv->raw_text, "\n");
-  }
+  if (g_strcmp0 (element_name, "li") == 0)
+    g_string_append (priv->raw_text, "- ");
 }
 
 static void
@@ -454,74 +361,68 @@ process_bijiben_text_elem (BijiLazyDeserializer *self)
 
   if (text)
   {
-    /* Simply append the text to both raw & html */
-    priv->html = g_string_append (priv->html, text);
     priv->raw_text = g_string_append (priv->raw_text, text);
+    priv->seen_content = TRUE;
   }
 }
 
 static void
 process_bijiben_node (BijiLazyDeserializer *self)
 {
-  int            type;
+  int type;
   const xmlChar *name ;
   BijiLazyDeserializerPrivate *priv = self->priv;
 
   type  = xmlTextReaderNodeType (priv->inner);
   name  = xmlTextReaderConstName (priv->inner);
-  
+
   if (name == NULL)
     name = BAD_CAST "(NULL)";
 
   switch (type)
-  {
-    case XML_ELEMENT_NODE:
+    {
+    case XML_READER_TYPE_ELEMENT:
       process_bijiben_start_elem (self);
       break;
 
-    case XML_ELEMENT_DECL:
-      process_bijiben_end_elem (self);
-      break;
-
-    case XML_TEXT_NODE:
-    case XML_DTD_NODE:
+    case XML_READER_TYPE_TEXT:
       process_bijiben_text_elem (self);
       break;
+
+    default:
+      /* Ignore other node types (and ignore
+         gcc warnings */
+      ;
   }
 }
 
 static void
-process_bijiben_html_content (BijiLazyDeserializer *self)
+process_bijiben_html_content (BijiLazyDeserializer *self,
+                              xmlTextReaderPtr      reader)
 {
   BijiLazyDeserializerPrivate *priv = self->priv;
   int ret;
-  gchar *sane_text, *sane_html;
+  gchar *sane_html;
 
-  priv->inner = xmlReaderForMemory (priv->content,
-                                    strlen(priv->content),
+  sane_html = (gchar*) xmlTextReaderReadInnerXml (reader);
+
+  priv->inner = xmlReaderForMemory (sane_html,
+                                    strlen(sane_html),
                                     "", "UTF-8", 0);
-  
+
   ret = xmlTextReaderRead (priv->inner);
 
   /* Make the GString grow as we read */
-  while (ret == 1) 
+  while (ret == 1)
   {
     process_bijiben_node (self);
     ret = xmlTextReaderRead (priv->inner);
   }
 
-  /* Now the inner content is known, we can
-   * assign note values and let deserialization work on last elements*/
-  sane_html = biji_str_mass_replace (priv->html->str, "&#xA;" , "<br/>",
-                                                      "&amp;" , "&"      , NULL);
-  sane_text = biji_str_mass_replace (priv->raw_text->str, "&#xA;", "    ",
-                                                          "&amp;", "&"   , NULL);
-
-  biji_note_obj_set_raw_text (priv->note, sane_text);
+  biji_note_obj_set_raw_text (priv->note, priv->raw_text->str);
   biji_note_obj_set_html_content (priv->note, sane_html);
 
-  g_free (sane_text);
-  g_free (sane_html);
+  xmlFree (BAD_CAST sane_html);
 }
 
 /* Common XML format for both Bijiben / Tomboy */
@@ -545,8 +446,7 @@ processNode (BijiLazyDeserializer *self)
   {
     if (self->priv->type == BIJIBEN_1)
     {
-      self->priv->content = (gchar*) xmlTextReaderReadInnerXml (r);
-      process_bijiben_html_content (self);
+      process_bijiben_html_content (self, r);
     }
 
     else if (self->priv->type == TOMBOY_1 ||
diff --git a/src/libbiji/editor/biji-webkit-editor.c b/src/libbiji/editor/biji-webkit-editor.c
index 789c231..584fe49 100644
--- a/src/libbiji/editor/biji-webkit-editor.c
+++ b/src/libbiji/editor/biji-webkit-editor.c
@@ -248,8 +248,8 @@ on_content_changed (WebKitWebView *view)
 
   /* First html serializing */
   dom = webkit_web_view_get_dom_document (view);
-  elem = webkit_dom_document_get_body (dom);
-  html = webkit_dom_html_element_get_inner_html (elem);
+  elem = WEBKIT_DOM_HTML_ELEMENT (webkit_dom_document_get_document_element (dom));
+  html = webkit_dom_html_element_get_outer_html (elem);
   text = webkit_dom_html_element_get_inner_text (elem);
 
   biji_note_obj_set_html_content (note, html);
@@ -261,22 +261,19 @@ on_content_changed (WebKitWebView *view)
   {
     gchar **rows;
 
-    rows = g_strsplit (html, "<div", 2);
-    g_warning ("title is %s", rows[0]);
+    rows = g_strsplit (text, "\n", 2);
 
-    /* if we have a carriage return and thus, a proper title
-     * we still need to ensure it's clean and unique */
-    if (g_strv_length (rows) > 1)
+    /* if we have a line feed, we have a proper title */
+    /* this is equivalent to g_strv_length (rows) > 1 */
+    if (rows && rows[0] && rows[1])
     {
-      gchar *sane_title, *unique_title;
-
-      sane_title = biji_str_mass_replace (rows[0],
-                                          "&nbsp;", "",
-                                          NULL);
+      char *title;
+      char *unique_title;
 
+      title = rows[0];
       unique_title = biji_note_book_get_unique_title (biji_note_obj_get_note_book (note),
-                                                      sane_title);
-      g_free (sane_title);
+                                                      title);
+
       biji_note_obj_set_title (note, unique_title);
       g_free (unique_title);
     }
@@ -315,9 +312,9 @@ biji_webkit_editor_constructed (GObject *obj)
 
   body = biji_note_obj_get_html (priv->note);
   if (!body)
-    body = "";
+    body = "<html xmlns=\"http://www.w3.org/1999/xhtml\"><body></body></html>";
 
-  webkit_web_view_load_string (view, body, NULL, NULL, NULL);
+  webkit_web_view_load_string (view, body, "application/xhtml+xml", NULL, NULL);
 
   /* Drag n drop */
   GtkTargetList *targets = webkit_web_view_get_copy_target_list (view);
diff --git a/src/libbiji/serializer/biji-lazy-serializer.c b/src/libbiji/serializer/biji-lazy-serializer.c
index 975f124..bcf4a23 100644
--- a/src/libbiji/serializer/biji-lazy-serializer.c
+++ b/src/libbiji/serializer/biji-lazy-serializer.c
@@ -150,133 +150,6 @@ serialize_tags (gchar *tag, xmlTextWriterPtr writer)
 }
 
 static void
-process_text_elem (BijiLazySerializer *self)
-{
-  BijiLazySerializerPrivate *priv = self->priv;
-
-  xmlTextWriterWriteRaw (priv->writer, xmlTextReaderConstValue (priv->inner));
-}
-
-static void
-process_start_elem (BijiLazySerializer *self)
-{
-  BijiLazySerializerPrivate *priv = self->priv;
-  const gchar *name;
-
-  name = (const gchar *) xmlTextReaderConstName(priv->inner);
-
-  if (g_strcmp0 (name, "b")==0)
-    xmlTextWriterStartElement (priv->writer, BAD_CAST "b");
-
-  else if (g_strcmp0 (name, "i")==0)
-    xmlTextWriterStartElement (priv->writer, BAD_CAST "i");
-
-  else if (g_strcmp0 (name, "strike")==0)
-    xmlTextWriterStartElement (priv->writer, BAD_CAST "strike");
-
-  /* Do not serialize div. Close br. Everything is <br/>. Simpler. */
-  else if (g_strcmp0 (name, "div")== 0 || g_strcmp0 (name, "br") == 0)
-  {
-    xmlTextWriterStartElement (priv->writer, BAD_CAST "br");
-    xmlTextWriterEndElement (priv->writer);
-  }
-
-  /* Lists */
-
-  else if (g_strcmp0 (name, "ul")==0)
-    xmlTextWriterStartElement (priv->writer, BAD_CAST "ul");
-
-  else if (g_strcmp0 (name, "ol")==0)
-    xmlTextWriterStartElement (priv->writer, BAD_CAST "ol");
-
-  else if (g_strcmp0 (name, "li")==0)
-    xmlTextWriterStartElement (priv->writer, BAD_CAST "li");
-
-  /* Links : Images
-   * width heigth src alt */
-  if (g_strcmp0 (name, "img")==0)
-  {
-    xmlTextWriterStartElement (priv->writer, BAD_CAST "img");
-
-    xmlChar *attribute = NULL;
-
-    attribute = xmlTextReaderGetAttribute (priv->inner, BAD_CAST "id");
-    if (attribute)
-    {
-      xmlTextWriterWriteAttribute (priv->writer, BAD_CAST "id", attribute);
-      xmlFree (attribute);
-      attribute = NULL;
-    }
-
-    attribute = xmlTextReaderGetAttribute (priv->inner, BAD_CAST "width");
-    if (attribute)
-    {
-      xmlTextWriterWriteAttribute (priv->writer, BAD_CAST "width", attribute);
-      xmlFree (attribute);
-      attribute = NULL;
-    }
-
-    attribute = xmlTextReaderGetAttribute (priv->inner, BAD_CAST "height");
-    if (attribute)
-    {
-      xmlTextWriterWriteAttribute (priv->writer, BAD_CAST "height", attribute);
-      xmlFree (attribute);
-      attribute = NULL;
-    }
-
-    attribute = xmlTextReaderGetAttribute (priv->inner, BAD_CAST "src");
-    if (attribute)
-    {
-      xmlTextWriterWriteAttribute (priv->writer, BAD_CAST "src", attribute);
-      xmlFree (attribute);
-      attribute = NULL;
-    }
-
-    attribute = xmlTextReaderGetAttribute (priv->inner, BAD_CAST "alt");
-    if (attribute)
-    {
-      xmlTextWriterWriteAttribute (priv->writer, BAD_CAST "alt", attribute);
-      xmlFree (attribute);
-      attribute = NULL;
-    }
-
-    xmlTextWriterEndElement (priv->writer);
-  }
-}
-
-static void
-process_end_elem (BijiLazySerializer *self)
-{
-  BijiLazySerializerPrivate *priv = self->priv;
-  const gchar *element_name;
-
-  element_name = (const gchar *) xmlTextReaderConstName(priv->inner);
-
-  if (g_strcmp0 (element_name, "b")==0)
-    xmlTextWriterEndElement (priv->writer);
-
-  if (g_strcmp0 (element_name, "i")==0)
-    xmlTextWriterEndElement (priv->writer);
-
-  if (g_strcmp0 (element_name, "strike")==0)
-    xmlTextWriterEndElement (priv->writer);
-
-  /* Lists */
-
-  if (g_strcmp0 (element_name, "ul")==0)
-    xmlTextWriterEndElement (priv->writer);
-
-  if (g_strcmp0 (element_name, "ol")==0)
-    xmlTextWriterEndElement (priv->writer);
-
-  if (g_strcmp0 (element_name, "li")==0)
-    xmlTextWriterEndElement (priv->writer);
-}
-
-/* Webkit html is _not_ xhtml, but we need valid xml here.
- * Thus some start elem are to be manually close.
- * until XHTML rules the world. or until we use .html note format... */
-static void
 serialize_html (BijiLazySerializer *self)
 {
   BijiLazySerializerPrivate *priv = self->priv;
@@ -285,45 +158,7 @@ serialize_html (BijiLazySerializer *self)
   if (!html)
     return;
 
-  /* We need a start & end node to obtain a Xml reader
-   * and we need to suffer html = recover from errors */
-  GString * noded_html = g_string_new ("<bijihtml>");
-  noded_html = g_string_append (noded_html, html);
-  noded_html = g_string_append (noded_html, "</bijihtml>");
-
-  priv->inner = xmlReaderForMemory (noded_html->str,
-                                    strlen(noded_html->str),
-                                    "", "UTF-8", XML_PARSE_RECOVER);
-
-  while (xmlTextReaderRead (priv->inner) ==1)
-  {
-    gint type = xmlTextReaderNodeType (priv->inner);
-    const xmlChar *name = xmlTextReaderConstName (priv->inner);
-
-    if (!name)
-      continue;
-
-    switch (type)
-    {
-      case XML_ELEMENT_NODE:
-        process_start_elem (self);
-        break;
-
-      case XML_ELEMENT_DECL:
-        process_end_elem (self);
-        break;
-
-      case XML_TEXT_NODE:
-        process_text_elem (self);
-        break;
-
-      case XML_DTD_NODE:
-        process_text_elem (self);
-        break;
-    }
-  }
-
-  g_string_free (noded_html, TRUE);
+  xmlTextWriterWriteRaw(priv->writer, BAD_CAST html);
 }
 
 gboolean
@@ -359,12 +194,7 @@ biji_lazy_serialize_internal (BijiLazySerializer *self)
   xmlTextWriterWriteAttributeNS(priv->writer, BAD_CAST "xml",
                                 BAD_CAST "space", NULL, 
                                 BAD_CAST "preserve");
-
-  // <note-content>
-  xmlTextWriterStartElement(priv->writer, BAD_CAST "note-content");
   serialize_html (self);
-  xmlTextWriterEndElement (priv->writer);
-
   // </text>  
   xmlTextWriterEndElement(priv->writer);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]