[Banshee-List] utf8 validation



Hello everyone,

The attached patch adds a new public class (UnicodeValidator) to
Entagged. It has one public method (ValidateUTF8), which checks whether
the given byte array is valid UTF-8.

Also, the ID3v1 tag reader now tries to read the tag fields as UTF-8 first, and falls back to ISO-8859-1 when validation fails.

With this patch, I can view all Turkish-specific characters
in ID3v1 tags.
? validate_id3v1.patch
? burn-sharp/.deps
? burn-sharp/.libs
? burn-sharp/glue.lo
? burn-sharp/libnautilusburnglue.la
? libbanshee/.deps
? libbanshee/.libs
? libbanshee/cd-detect.lo
? libbanshee/cd-rip.lo
? libbanshee/gst-encode.lo
? libbanshee/gst-init.lo
? libbanshee/gst-misc.lo
? libbanshee/gst-player-engine.lo
? libbanshee/inotify-glue.lo
? libbanshee/libbanshee.la
? libbanshee/xing/.deps
? po/.intltool-merge-cache
Index: entagged-sharp/EncodingInfo.cs
===================================================================
RCS file: /cvs/gnome/banshee/entagged-sharp/EncodingInfo.cs,v
retrieving revision 1.5
diff -p -u -2 -r1.5 EncodingInfo.cs
--- entagged-sharp/EncodingInfo.cs	1 Nov 2005 23:32:01 -0000	1.5
+++ entagged-sharp/EncodingInfo.cs	19 Nov 2005 10:42:39 -0000
@@ -109,3 +109,71 @@ public class EncodingInfo {
 	}
 }
+
+public class UnicodeValidator {
+
+    /// <summary>
+    /// Checks whether a byte array is well-formed UTF-8. Overlong forms,
+    /// truncated sequences, surrogates, non-characters and code points
+    /// above U+10FFFF are all rejected.
+    /// </summary>
+    /// <param name="str">Raw bytes to inspect; must not be null.</param>
+    /// <returns>true if every byte belongs to a valid sequence.</returns>
+    public static bool ValidateUTF8(byte[] str)
+    {
+        if(str == null)
+            throw new System.ArgumentNullException("str");
+
+        int i = 0;
+        while(i < str.Length) {
+            int lead = str[i++];
+            if(lead < 128)
+                continue;
+
+            /* remaining = continuation bytes still expected; min = smallest
+               code point that legitimately needs this sequence length. */
+            int remaining, min, val;
+            if((lead & 0xe0) == 0xc0) {        /* 110xxxxx */
+                remaining = 1; min = 1 << 7;  val = lead & 0x1f;
+            } else if((lead & 0xf0) == 0xe0) { /* 1110xxxx */
+                remaining = 2; min = 1 << 11; val = lead & 0x0f;
+            } else if((lead & 0xf8) == 0xf0) { /* 11110xxx */
+                remaining = 3; min = 1 << 16; val = lead & 0x07;
+            } else {
+                return false; /* stray continuation or invalid lead byte */
+            }
+
+            for(; remaining > 0; remaining--, i++) {
+                if(!continuation_char(str, i, ref val))
+                    return false;
+            }
+
+            /* unicode_valid needs the whole code point, not its low byte. */
+            if(val < min || !unicode_valid(val))
+                return false;
+        }
+
+        return true;
+    }
+
+    /* Folds the continuation byte (10xxxxxx) at index i into val; returns
+       false if it is malformed or i is past the end (truncated sequence). */
+    private static bool continuation_char(byte[] str, int i, ref int val)
+    {
+        if(i >= str.Length || (str[i] & 0xc0) != 0x80)
+            return false;
+        val = (val << 6) | (str[i] & 0x3f);
+        return true;
+    }
+
+    /* True when b is a code point this validator accepts: at most U+10FFFF,
+       not a surrogate, not U+FDD0..U+FDEF, and not ending in FFFE/FFFF. */
+    private static bool unicode_valid(int b)
+    {
+        return (b < 0x110000 &&
+                ((b & 0xFFFFF800) != 0xD800) &&
+                (b < 0xFDD0 || b > 0xFDEF) &&
+                (b & 0xFFFE) != 0xFFFE);
+    }
+}
+
 }
Index: entagged-sharp/Mp3/Util/Id3v1TagReader.cs
===================================================================
RCS file: /cvs/gnome/banshee/entagged-sharp/Mp3/Util/Id3v1TagReader.cs,v
retrieving revision 1.5
diff -p -u -2 -r1.5 Id3v1TagReader.cs
--- entagged-sharp/Mp3/Util/Id3v1TagReader.cs	1 Nov 2005 23:32:03 -0000	1.5
+++ entagged-sharp/Mp3/Util/Id3v1TagReader.cs	19 Nov 2005 10:42:40 -0000
@@ -80,5 +80,10 @@ namespace Entagged.Audioformats.Mp3.Util
 			byte[] b = new byte[length];
 			mp3Stream.Read( b, 0, b.Length );
-			string ret = Encoding.GetEncoding("ISO-8859-1").GetString(b).Trim();
+			string ret;
+			
+            if(Entagged.Audioformats.UnicodeValidator.ValidateUTF8(b))
+                ret = Encoding.UTF8.GetString(b).Trim();
+            else 
+                ret = Encoding.GetEncoding("ISO-8859-1").GetString(b).Trim();
 
 			int pos = ret.IndexOf('\0');






[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]