[Banshee-List] Re: utf8 validation



On 11/19/05, Dogacan Guney <dogacan gmail com> wrote:
> Hello everyone,
>
> The attached patch adds a new public class (UnicodeValidator) to
> Entagged. It has one public function (ValidateUTF8) which checks if
> the given byte array can be validated as UTF8.
>
> Also, id3v1 tag reader now tries to read the tag fields as utf8.
>
> With this patch, I can view all Turkish language specific characters
> in id3v1 tags.
>
>
>

The previous ValidateUTF8 had some bugs. This new version of the patch should fix them.
? validate_id3v1.patch
? validate_id3v1_ver2.patch
? burn-sharp/.deps
? burn-sharp/.libs
? burn-sharp/glue.lo
? burn-sharp/libnautilusburnglue.la
? libbanshee/.deps
? libbanshee/.libs
? libbanshee/cd-detect.lo
? libbanshee/cd-rip.lo
? libbanshee/gst-encode.lo
? libbanshee/gst-init.lo
? libbanshee/gst-misc.lo
? libbanshee/gst-player-engine.lo
? libbanshee/inotify-glue.lo
? libbanshee/libbanshee.la
? libbanshee/xing/.deps
? po/.intltool-merge-cache
Index: entagged-sharp/EncodingInfo.cs
===================================================================
RCS file: /cvs/gnome/banshee/entagged-sharp/EncodingInfo.cs,v
retrieving revision 1.5
diff -p -u -2 -r1.5 EncodingInfo.cs
--- entagged-sharp/EncodingInfo.cs	1 Nov 2005 23:32:01 -0000	1.5
+++ entagged-sharp/EncodingInfo.cs	19 Nov 2005 19:47:05 -0000
@@ -109,3 +109,75 @@ public class EncodingInfo {
 	}
 }
+
+public class UnicodeValidator { // Byte-level check of whether a byte array is well-formed UTF-8.
+    // ValidateUTF8: true when str consists solely of well-formed UTF-8 sequences (1 to 4 bytes each); an empty array passes.
+    public static bool ValidateUTF8(byte[] str) // str is not null-checked; only IndexOutOfRangeException is caught below
+    {
+        int i;
+        int min = 0, val = 0; // min: smallest code point the current sequence length may encode; val: code point decoded so far
+        // A read past the end of str (truncated sequence) raises IndexOutOfRangeException, which the catch below turns into "false".
+        try {
+            for(i = 0; i < str.Length; i++) {
+                if(str[i] < 128) // single-byte value (below 0x80), always accepted
+                    continue;
+                // Multi-byte sequence: classify the lead byte, then consume its continuation bytes.
+                if((str[i] & 0xe0) == 0xc0) { /* 110xxxxx */
+                    if((str[i] & 0x1e) == 0) // lead bytes 0xC0/0xC1 would encode a value below 0x80 (overlong form)
+                        return false;
+                    i++;
+                    if((str[i] & 0xc0) != 0x80)  /* 10xxxxxx */
+                        return false;
+                } else {
+                    if((str[i] & 0xf0) == 0xe0) { /* 1110xxxx */
+                        min = (1 << 11); // three-byte sequences must encode at least 0x800
+                        val = str[i] & 0x0f; // four payload bits from the lead byte
+                        goto TWO_REMAINING; // skip the extra continuation byte that only four-byte sequences have
+                    } else if((str[i] & 0xf8) == 0xf0) { /* 11110xxx */
+                        min = (1 << 16); // four-byte sequences must encode at least 0x10000
+            		    val = str[i] & 0x07; // three payload bits from the lead byte
+                    } else {
+                        return false; // stray continuation byte (10xxxxxx) or a lead byte of 0xF8 and above
+                    }
+                    i++; // first of three continuation bytes (four-byte path only)
+                    if(!continuation_char(str, i, ref val))
+                        return false;
+    TWO_REMAINING: // three-byte sequences enter here; the last two continuation bytes are shared by both lengths
+                    i++;
+                    if(!continuation_char(str, i, ref val))
+                        return false;
+                    i++;
+                    if(!continuation_char(str, i, ref val))
+                        return false;
+                    // Reject overlong encodings (val < min) and code points that unicode_valid rules out.
+                    if(val < min || !unicode_valid(val))
+                        return false;
+                }
+            }
+        } catch (System.IndexOutOfRangeException e) { // a sequence was cut off by the end of the array
+            return false;
+        }
+
+        return true;
+    }
+    // continuation_char: checks that str[i] is a continuation byte and, if so, appends its six payload bits to val.
+    private static bool continuation_char(byte[] str, int i, ref int val)
+    {
+        if ((str[i] & 0xc0) != 0x80) /* 10xxxxxx */ // i is not bounds-checked here; ValidateUTF8 catches the resulting exception
+            return false; // val is left untouched
+
+        val <<= 6; // make room for the next six payload bits
+        val |= str[i] & 0x3f;
+        
+        return true;               
+    }
+    // unicode_valid: true when b is below 0x110000 and is neither a surrogate nor a noncharacter.
+    private static bool unicode_valid(int b)
+    {
+        return (b < 0x110000 && // within the Unicode code point range
+                ((b & 0xFFFFF800) != 0xD800) && // not a surrogate (0xD800-0xDFFF)
+                (b < 0xFDD0 || b > 0xFDEF) && // not a noncharacter in 0xFDD0-0xFDEF
+                (b & 0xFFFE) != 0xFFFE); // low 16 bits are not 0xFFFE or 0xFFFF (noncharacters in every plane)
+    }
+}
+
 }
Index: entagged-sharp/Mp3/Util/Id3v1TagReader.cs
===================================================================
RCS file: /cvs/gnome/banshee/entagged-sharp/Mp3/Util/Id3v1TagReader.cs,v
retrieving revision 1.5
diff -p -u -2 -r1.5 Id3v1TagReader.cs
--- entagged-sharp/Mp3/Util/Id3v1TagReader.cs	1 Nov 2005 23:32:03 -0000	1.5
+++ entagged-sharp/Mp3/Util/Id3v1TagReader.cs	19 Nov 2005 19:47:06 -0000
@@ -80,5 +80,10 @@ namespace Entagged.Audioformats.Mp3.Util
 			byte[] b = new byte[length];
 			mp3Stream.Read( b, 0, b.Length );
-			string ret = Encoding.GetEncoding("ISO-8859-1").GetString(b).Trim();
+			string ret;
+			
+            if(Entagged.Audioformats.UnicodeValidator.ValidateUTF8(b))
+                ret = Encoding.UTF8.GetString(b).Trim();
+            else 
+                ret = Encoding.GetEncoding("ISO-8859-1").GetString(b).Trim();
 
 			int pos = ret.IndexOf('\0');
Index: src/Banshee.Widgets/Makefile.in
===================================================================
RCS file: /cvs/gnome/banshee/src/Banshee.Widgets/Makefile.in,v
retrieving revision 1.1
diff -p -u -2 -r1.1 Makefile.in
--- src/Banshee.Widgets/Makefile.in	17 Nov 2005 09:09:51 -0000	1.1
+++ src/Banshee.Widgets/Makefile.in	19 Nov 2005 19:47:11 -0000
@@ -1,3 +1,3 @@
-# Makefile.in generated by automake 1.9.6 from Makefile.am.
+# Makefile.in generated by automake 1.9.5 from Makefile.am.
 # @configure_input@
 
@@ -173,4 +173,5 @@ MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_
 MAKEINFO = @MAKEINFO@
 MCS = @MCS@
+MKINSTALLDIRS = @MKINSTALLDIRS@
 MONO = @MONO@
 MONO_CFLAGS = @MONO_CFLAGS@


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]