[tracker/tracker-0.6] Read mp3 files in parts in extraction



commit 1e479746a6fc6062a0c0e2d37b74e19ec7f9c95a
Author: Mikael Ottela <mikael ottela ixonos com>
Date:   Mon Apr 20 23:05:02 2009 +0300

    Read mp3 files in parts in extraction
    
    We now read the last 128 bytes (the id3v1 metadata) separately instead of
    relying on the mmap of the mp3 file during mp3 extraction, which performs
    better. Large mp3 files are now handled without problems.
    
    Fixes NB#111560
---
 src/tracker-extract/tracker-extract-mp3.c |  156 ++++++++++++++++++++++------
 1 files changed, 122 insertions(+), 34 deletions(-)

diff --git a/src/tracker-extract/tracker-extract-mp3.c b/src/tracker-extract/tracker-extract-mp3.c
index fa6425e..8cf2549 100644
--- a/src/tracker-extract/tracker-extract-mp3.c
+++ b/src/tracker-extract/tracker-extract-mp3.c
@@ -31,6 +31,7 @@
 #include <fcntl.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <errno.h>
 
 #include <glib.h>
 #include <glib/gstdio.h>
@@ -46,15 +47,22 @@
 #include "tracker-extract-albumart.h"
 #include "tracker-escape.h"
 
-/* FIXME The max file read is not a good idea as basic 
- * id3 are the _last_ 128 bits of the file. We should
- * probably read 2 buffers (beginning, end) instead.
+/* We mmap the beginning of the file and separately read the last 128 bytes
+   for id3v1 tags. While these are probably corner cases, the rationale is that
+   we don't want to fault in a whole page for the last 128 bytes and, on the
+   other hand, we don't want to mmap the whole file with unlimited size (it might
+   need a private copy in some special cases, finding contiguous space, etc.).
+   We now take the first 5 MB of the file and assume that this is enough. In
+   theory there is no maximum size, as someone could embed 50 GB of album art.
 */
-#define MAX_FILE_READ	  1024 * 1024 * 20
+
+#define MAX_FILE_READ	  (1024 * 1024 * 5) /* parenthesized: safe inside larger expressions */
 #define MAX_MP3_SCAN_DEEP 16768
 
-#define MAX_FRAMES_SCAN   1024 * 3
-#define VBR_THRESHOLD     64
+#define MAX_FRAMES_SCAN   512
+#define VBR_THRESHOLD     16
+
+#define ID3V1_SIZE        128
 
 typedef struct {
 	const gchar *text;
@@ -72,7 +80,10 @@ typedef struct {
 } id3tag;
 
 typedef struct {
-	size_t         audio_offset;
+	size_t         size;
+	size_t         id3v2_size;
+
+	guint32        duration;
 
 	unsigned char *albumartdata;
 	size_t         albumartsize;
@@ -285,6 +296,45 @@ static TrackerExtractData extract_data[] = {
 	{ NULL, NULL }
 };
 
+/* Reads the trailing ID3V1_SIZE bytes of fd (an id3v1 tag, if present) into a
+ * newly allocated, zero-filled buffer. Returns NULL if the file is shorter
+ * than ID3V1_SIZE or on a seek/read error; free the result with g_free(). */
+static char *
+read_id3v1_buffer (int fd, goffset size)
+{
+	char *buffer;
+	gsize bytes_read = 0;
+	gssize rc;
+
+	if (size < ID3V1_SIZE) {
+		return NULL;
+	}
+
+	if (lseek (fd, size - ID3V1_SIZE, SEEK_SET) < 0) {
+		return NULL;
+	}
+
+	/* Zero-fill so a short read never exposes uninitialized bytes. */
+	buffer = g_malloc0 (ID3V1_SIZE);
+
+	while (bytes_read < ID3V1_SIZE) {
+		rc = read (fd, buffer + bytes_read, ID3V1_SIZE - bytes_read);
+
+		if (rc > 0) {
+			bytes_read += rc;
+		} else if (rc == 0) {
+			/* Unexpected EOF: keep what we got. */
+			break;
+		} else if (errno != EINTR) {
+			/* rc is signed, so -1 is detected without wrap. */
+			g_free (buffer);
+			return NULL;
+		}
+	}
+
+	return buffer;
+}
+
 /* Convert from UCS-2 to UTF-8 checking the BOM.*/
 static gchar *
 ucs2_to_utf8 (const gchar *data, guint len) 
@@ -460,7 +510,8 @@ static gboolean
 mp3_parse_header (const gchar *data,
 		  size_t       size,
 		  size_t       seek_pos,
-		  GHashTable  *metadata)
+		  GHashTable  *metadata,
+		  file_data   *filedata)
 {
 	guint header;
 	gchar mpeg_ver = 0;
@@ -613,17 +664,20 @@ mp3_parse_header (const gchar *data,
 
 	avg_bps /= frames;
 
-	if ((!vbr_flag || frames > VBR_THRESHOLD) || (frames > MAX_FRAMES_SCAN)) {
-		/* If not all frames scanned */
-		length = size / (avg_bps ? avg_bps : bitrate ? bitrate : 0xFFFFFFFF) / 125;
-	} else{
-		length = 1152 * frames / (sample_rate ? sample_rate : 0xFFFFFFFF);
+	if (filedata->duration==0) {
+		if ((!vbr_flag || frames > VBR_THRESHOLD) || (frames > MAX_FRAMES_SCAN)) {
+			/* If not all frames scanned */
+			length = (filedata->size - filedata->id3v2_size) / (avg_bps ? avg_bps : bitrate ? bitrate : 0xFFFFFFFF) / 125;
+		} else{
+			length = 1152 * frames / (sample_rate ? sample_rate : 0xFFFFFFFF);
+		}
+ 
+		g_hash_table_insert (metadata,
+				     g_strdup ("Audio:Duration"),
+				     tracker_escape_metadata_printf ("%d", length));
 	}
 
 	g_hash_table_insert (metadata,
-			     g_strdup ("Audio:Duration"),
-			     tracker_escape_metadata_printf ("%d", length));
-	g_hash_table_insert (metadata,
 			     g_strdup ("Audio:Samplerate"),
 			     tracker_escape_metadata_printf ("%d", sample_rate));
 	g_hash_table_insert (metadata,
@@ -636,12 +690,13 @@ mp3_parse_header (const gchar *data,
 static void
 mp3_parse (const gchar *data,
 	   size_t       size,
+	   size_t       offset,
 	   GHashTable  *metadata,
 	   file_data   *filedata)
 {
 	guint header;
 	guint counter = 0;
-	guint pos = filedata->audio_offset;
+	guint pos = offset;
 
 	do {
 		/* Seek for frame start */
@@ -653,7 +708,7 @@ mp3_parse (const gchar *data,
 
 		if ((header & sync_mask) == sync_mask) {
 			/* Found header sync */
-			if (mp3_parse_header (data, size, pos, metadata)) {
+			if (mp3_parse_header (data, size, pos, metadata, filedata)) {
 				return;
 			}
 		}
@@ -690,6 +745,7 @@ get_id3v24_tags (const gchar *data,
 		{"TDRL", "Audio:ReleaseDate"},
 		{"TRCK", "Audio:TrackNo"},
 		{"PCNT", "Audio:PlayCount"},
+		{"TLEN", "Audio:Duration"},
 		{NULL, 0},
 	};
 
@@ -787,9 +843,7 @@ get_id3v24_tags (const gchar *data,
 						g_free (word);
 						word = g_strdup (parts[0]);
 						g_strfreev (parts);
-					}
-
-					if (strcmp (tmap[i].text, "TCON") == 0) {
+					} else if (strcmp (tmap[i].text, "TCON") == 0) {
 						gint genre;
 
 						if (get_genre_number (word, &genre)) {
@@ -800,7 +854,14 @@ get_id3v24_tags (const gchar *data,
 						if (strcasecmp (word, "unknown") == 0) {
 							break;
 						}
-					}					
+					} else if (strcmp (tmap[i].text, "TLEN") == 0) {
+						guint32 duration;
+
+						duration = atoi (word);
+						g_free (word);
+						word = g_strdup_printf ("%d", duration/1000);
+						filedata->duration = duration/1000;
+					}
 
 					g_hash_table_insert (metadata,
 							     g_strdup (tmap[i].type),
@@ -935,6 +996,7 @@ get_id3v23_tags (const gchar *data,
 		{"TYER", "Audio:ReleaseDate"},
 		{"TRCK", "Audio:TrackNo"},
 		{"PCNT", "Audio:PlayCount"},
+		{"TLEN", "Audio:Duration"},
 		{NULL, 0},
 	};
 
@@ -1023,9 +1085,7 @@ get_id3v23_tags (const gchar *data,
 						g_free (word);
 						word = g_strdup (parts[0]);
 						g_strfreev (parts);
-					}
-
-					if (strcmp (tmap[i].text, "TCON") == 0) {
+					} else if (strcmp (tmap[i].text, "TCON") == 0) {
 						gint genre;
 
 						if (get_genre_number (word, &genre)) {
@@ -1036,6 +1096,13 @@ get_id3v23_tags (const gchar *data,
 						if (strcasecmp (word, "unknown") == 0) {
 							break;
 						}
+					} else if (strcmp (tmap[i].text, "TLEN") == 0) {
+						guint32 duration;
+
+						duration = atoi (word);
+						g_free (word);
+						word =  g_strdup_printf ("%d", duration/1000);
+						filedata->duration = duration/1000;
 					}
 
 					g_hash_table_insert (metadata,
@@ -1167,6 +1234,7 @@ get_id3v20_tags (const gchar *data,
 		{"TOT", "Audio:Album"},
 		{"TOL", "Audio:Artist"},
 		{"COM", "Audio:Comment"},
+		{"TLE", "Audio:Duration"},
 		{ NULL, 0},
 	};
 
@@ -1247,8 +1315,16 @@ get_id3v20_tags (const gchar *data,
 						}
 						
 						if (strcasecmp (word, "unknown") == 0) {
+							g_free (word);
 							break;
 						}
+					} else if (strcmp (tmap[i].text, "TLE") == 0) {
+						guint32 duration;
+
+						duration = atoi (word);
+						g_free (word);
+						word = g_strdup_printf ("%d", duration/1000);
+						filedata->duration = duration/1000;
 					}	
 					
 					g_hash_table_insert (metadata,
@@ -1467,9 +1543,9 @@ parse_id3v20 (const gchar *data,
 	*offset_delta = tsize + 10;
 }
 
-static void
+static goffset
 parse_id3v2 (const gchar *data,
-	     size_t	     size,
+	     size_t	  size,
 	     GHashTable  *metadata,
 	     file_data   *filedata)
 {
@@ -1484,12 +1560,14 @@ parse_id3v2 (const gchar *data,
 
 		if (offset_delta == 0) {
 			done = TRUE;
-			filedata->audio_offset = offset;
+			filedata->id3v2_size = offset;
 		} else {
 			offset += offset_delta;
 		}
 
 	} while (!done);
+
+	return offset;
 }
 
 static void
@@ -1498,8 +1576,10 @@ extract_mp3 (const gchar *filename,
 {
 	int	     fd;
 	void	    *buffer;
+	void        *id3v1_buffer;
 	goffset      size;
 	id3tag	     info;
+	goffset      audio_offset;
 	file_data    filedata;
 
 	info.title = NULL;
@@ -1510,16 +1590,20 @@ extract_mp3 (const gchar *filename,
 	info.genre = NULL;
 	info.trackno = NULL;
 
-	filedata.audio_offset = 0;
+	filedata.size = 0;
+	filedata.id3v2_size = 0;
+	filedata.duration = 0;
 	filedata.albumartdata = NULL;
 	filedata.albumartsize = 0;
 
 	size = tracker_file_get_size (filename);
 
-	if (size == 0 || size > MAX_FILE_READ) {
+	if (size == 0) {
 		return;
 	}
 
+	filedata.size = size;
+
 #if defined(__linux__)
 	/* Can return -1 because of O_NOATIME, so we try again after
 	 * without as a last resort. This can happen due to
@@ -1550,13 +1634,15 @@ extract_mp3 (const gchar *filename,
 		       0);
 #endif
 
+	id3v1_buffer = read_id3v1_buffer (fd, size);
+
 	close (fd);
 
 	if (buffer == NULL || buffer == (void*) -1) {
 		return;
 	}
 
-	if (!get_id3 (buffer, size, &info)) {
+	if (!get_id3 (id3v1_buffer, ID3V1_SIZE, &info)) {
 		/* Do nothing? */
 	}
 
@@ -1611,10 +1697,10 @@ extract_mp3 (const gchar *filename,
 	g_free (info.genre);
 
 	/* Get other embedded tags */
-	parse_id3v2 (buffer, size, metadata, &filedata);
+	audio_offset = parse_id3v2 (buffer, MIN(size, MAX_FILE_READ), metadata, &filedata);
 
 	/* Get mp3 stream info */
-	mp3_parse (buffer, size, metadata, &filedata);
+	mp3_parse (buffer, MIN(size, MAX_FILE_READ), audio_offset, metadata, &filedata);
 
 #ifdef HAVE_GDKPIXBUF
 	tracker_process_albumart (filedata.albumartdata, filedata.albumartsize,
@@ -1654,8 +1740,10 @@ extract_mp3 (const gchar *filename,
 	}
 
 #ifndef G_OS_WIN32
-	munmap (buffer, size);
+	munmap (buffer, MIN(size, MAX_FILE_READ));
 #endif
+
+	g_free (id3v1_buffer);
 }
 
 TrackerExtractData *



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]