[gdome]Charset encoding issues solved



Hello!

I finally traced the cause of the encoding bug I was experiencing: I
discovered that the DOMString::toC() method is converting data to latin1
format.  ARGH!

I was using toC() to read data from XML and the DOMString constructor to
create DOMStrings to put it back into new nodes.  This caused a wrong
charset conversion to come in.

I *really*, *really* would have liked some documentation about the fact that I
should use toUTF8() to retrieve strings that I could then pass to the
DOMString constructor.  This is made even more unclear by the fact that
toC() returns the same character type that the DOMString constructor
expects.

Please, please add some comment about this being wrong:

DOMString a(anotherdomstring.toC());

and this being the correct way to do it:

DOMString a(anotherdomstring.toUTF8());

as I lost too much time chasing this simple issue.  Or else, make the
DOMString constructor take a Char8* instead of a char*.

Attached to this mail is a c++ program demonstrating the mess.  As you
can see, the source seems perfectly fine.


Bye, Enrico

--
GPG key: 1024D/797EBFAB 2000-12-05 Enrico Zini <enrico debian org>
#include <GdomeSmartDOM.hh>

#include <stdio.h>
#include <string>
#include <vector>

using namespace GdomeSmartDOM;
using namespace std;

// Return the text found directly inside the element.
//
// Walks the direct children of `el` and returns the value of its TEXT_NODE
// child, or an empty string when there is none.  When more than one text
// child is present a warning is printed on stderr and the last one wins.
// Children of any other type are reported on stderr and otherwise ignored.
//
// NOTE: the value is read with toC() on purpose.  Per the report this program
// accompanies, toC() hands back latin1 data, and this program exists to show
// what happens when that data is fed back into new nodes.
string getText(const Node& el)
{
	string res;
	NodeList texts = el.get_childNodes();
	for (unsigned long i = 0; i < texts.get_length(); i++)
	{
		Node node = texts.item(i);
		if (node.get_nodeType() == Node::TEXT_NODE)
		{
			if (res.size() > 0)
			{
				// The '*' precision of "%.*s" consumes an int, so the size_t
				// returned by size() must be converted explicitly.
				fprintf(stderr, "duplicate text: prev: %.*s, found: %s.  Replacing old value.\n",
						static_cast<int>(res.size()), res.data(), node.get_nodeValue().toC());
			}
			res = node.get_nodeValue().toC();
		} else {
			fprintf(stderr, "Unhandled node (type %d): %s\n", node.get_nodeType(), node.get_nodeName().toC());
		}
	}
	return res;
}

// Get the title associated with this node.
//
// Scans the direct children of `el` for the first one whose node name is
// "title" and returns the text getText() extracts from it.  Returns an empty
// string when `el` has no such child.
string getTitle(const Node& el)
{
	NodeList children = el.get_childNodes();
	for (unsigned long i = 0; i < children.get_length(); i++)
	{
		Node node(children.item(i));
		// Only the first matching child is considered.
		if (string("title") == node.get_nodeName().toC())
			return getText(node);
	}
	return "";
}

// Build a <test> element containing a single <title> child whose text is
// `name`.  Every node is created through `doc`; the resulting element is
// returned to the caller without being inserted anywhere by this function.
Node createNode(Document doc, const string& name)
{
	// Outer element that will carry the title
	Element wrapper(doc.createElement("test"));

	// <title>name</title>
	Text label(doc.createTextNode(name.c_str()));
	Element title(doc.createElement("title"));
	title.appendChild(label);

	// Hook the title under the outer element and hand it back
	wrapper.appendChild(title);
	return wrapper;
}

// Program entry point.
//
// Usage: <program> <input-file> <output-file>
//
// Loads the input document and, for every child of the document's first child
// node, queues a deep clone of that child plus a new element built by
// createNode() from the title that getTitle() extracts.  The queued nodes are
// then appended to that same parent and the document is saved to the output
// file.
//
// Returns 0 on success and 1 on a usage error or when an exception is caught.
int main(int argc, const char* argv[])
{
	// Both file names are mandatory; without this check argv[1] / argv[2]
	// would be read past the supplied arguments.
	if (argc < 3)
	{
		fprintf(stderr, "Usage: %s <input-file> <output-file>\n",
				argc > 0 ? argv[0] : "program");
		return 1;
	}

	const char* file_in = argv[1];
	const char* file_out = argv[2];

	try {
		DOMImplementation impl;
		Document doc = impl.createDocumentFromURI(file_in);

		fprintf(stderr, "Reading data...\n");

		Node top = doc.get_childNodes().item(0);
		NodeList list = top.get_childNodes();

		// New nodes are collected here and appended only after the loop, so
		// the children of `top` are not modified while `list` is being walked.
		vector<Node> newnodes;
		for (unsigned long i = 0; i < list.get_length(); i++)
		{
			Node node = list.item(i);
			newnodes.push_back(node.cloneNode(true));
			newnodes.push_back(createNode(doc, getTitle(node)));
		}

		fprintf(stderr, "Appening test nodes...\n");

		for (vector<Node>::iterator i = newnodes.begin();
				i != newnodes.end(); ++i)
			top.appendChild(*i);

		fprintf(stderr, "Saving new document...\n");
		impl.saveDocumentToFile(doc, file_out);

		fprintf(stderr, "Done.\n");
	} catch (DOMException& e) {
		fprintf(stderr, "DOMException: %s (%d)\n", e.msg.toC(), e.code);
		return 1;	// report the failure through the exit status
	} catch (...) {
		fprintf(stderr, "Other exception\n");
		return 1;
	}
	
	return 0;
}

// vim:set ts=4 sw=4:
<?xml version="1.0" encoding="utf8" ?>
<tests>
	<test> <title>Università!</title> </test>
	<test> <title>Härnösands kommun, Index</title> </test>
	<test> <title>Kårhuset</title> </test>
</tests>


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]