[sound-juicer/pwood/wip/fuzzy-toc-matching: 2/5] MusicBrainz: Use Fuzzy TOC matching
- From: Phillip Wood <pwood src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [sound-juicer/pwood/wip/fuzzy-toc-matching: 2/5] MusicBrainz: Use Fuzzy TOC matching
- Date: Tue, 23 Oct 2018 09:59:19 +0000 (UTC)
commit 12972f914e99e880b8758b83564518cd364ed985
Author: Phillip Wood <phillip wood dunelm org uk>
Date: Sun Jun 25 11:32:32 2017 +0100
MusicBrainz: Use Fuzzy TOC matching
When searching for the disc on MusicBrainz use fuzzy TOC matching so
that if the discid isn’t found but there is an album with the same
track lengths, that album is returned. Disable cdstub matches as we
cannot parse them at the moment. A fuzzy match will not have a matching
discid, so match track lengths against the disc we’re ripping to find
out which disc in a release matches our disc.
https://bugzilla.gnome.org/show_bug.cgi?id=788425
libjuicer/sj-metadata-musicbrainz5.c | 266 ++++++++++++++++++++++++++++++-----
1 file changed, 229 insertions(+), 37 deletions(-)
---
diff --git a/libjuicer/sj-metadata-musicbrainz5.c b/libjuicer/sj-metadata-musicbrainz5.c
index ae152b8c..2ff352f4 100644
--- a/libjuicer/sj-metadata-musicbrainz5.c
+++ b/libjuicer/sj-metadata-musicbrainz5.c
@@ -40,11 +40,14 @@
typedef char Mbid[40];
typedef struct {
- gint count;
gchar *id;
gchar *mcn;
gchar *url;
+ gint release_count; /* number of release ids stored in release_ids */
Mbid *release_ids;
+ gint sectors; /* sector count of the disc in the drive; 0 if the discid was forced */
+ gint track_count; /* number of tracks on the disc in the drive; 0 if the discid was forced */
+ gint *lengths; /* track_count track lengths in milliseconds; NULL if the discid was forced */
} DiscDetails;
static char language[3];
@@ -126,6 +129,7 @@ disc_details_free (DiscDetails *disc)
g_free (disc->mcn);
g_free (disc->url);
g_free (disc->release_ids);
+ g_free (disc->lengths);
g_free (disc);
}
@@ -298,6 +302,10 @@ get_disc_md (SjMetadataMusicbrainz5 *self,
DiscId disc;
Mb5Metadata disc_md = NULL;
SjMetadataMusicbrainz5Private *priv = GET_PRIVATE (self);
+ char *names[2], *values[2];
+ char cdstubs[] = "cdstubs";
+ char toc[] = "toc";
+ int count = 0, first, i, length;
if (sj_metadata_helper_check_media (priv->cdrom, error) == FALSE) {
return NULL;
@@ -309,23 +317,64 @@ get_disc_md (SjMetadataMusicbrainz5 *self,
if (discid_read_sparse (disc, priv->cdrom, DISCID_FEATURE_MCN) == 0)
goto out;
- if (g_getenv("MUSICBRAINZ_FORCE_DISC_ID")) {
- discid = g_getenv("MUSICBRAINZ_FORCE_DISC_ID");
- } else {
+ if (g_getenv ("MUSICBRAINZ_FORCE_DISC_ID"))
+ discid = g_getenv ("MUSICBRAINZ_FORCE_DISC_ID");
+ else
discid = discid_get_id (disc);
+
+ /* Allow MUSICBRAINZ_FORCE_DISC_ID=- to force a fuzzy match with the
+ toc of the disc in the drive */
+ if (!g_getenv ("MUSICBRAINZ_FORCE_DISC_ID") ||
+ strcmp (discid, "-") == 0) {
+ names[count] = toc;
+ values[count] = g_strdup (discid_get_toc_string (disc));
+ count++;
}
if (g_cancellable_set_error_if_cancelled (cancellable, error))
goto out;
- disc_md = query_musicbrainz (self, "discid", discid, NULL, cancellable, error);
- disc_data->url = g_strdup (discid_get_submission_url (disc));
- disc_data->mcn = g_strdup (discid_get_mcn (disc));
+ names[count] = cdstubs;
+ values[count] = g_strdup ("no");
+ count++;
+ disc_md = query_musicbrainz_full (self, "discid", discid,
+ count, names, values, cancellable, error);
+
disc_data->id = g_strdup (discid);
- g_info ("Disc id %s\nSubmission URL %s\nDisc MCN %s",
- disc_data->id,
- disc_data->url,
- disc_data->mcn);
+ /* Only use the details of the disc in the drive if the discid
+ hasn't been forced */
+ if (!g_getenv ("MUSICBRAINZ_FORCE_DISC_ID") ||
+ strcmp (discid, "-") == 0) {
+ disc_data->url = g_strdup (discid_get_submission_url (disc));
+ disc_data->mcn = g_strdup (discid_get_mcn (disc));
+ disc_data->sectors = discid_get_sectors (disc);
+ first = discid_get_first_track_num (disc);
+ disc_data->track_count = 1 + discid_get_last_track_num (disc) - first;
+ disc_data->lengths = g_new (int, disc_data->track_count);
+ for (i = 0; i < disc_data->track_count; i++) {
+ length = discid_get_track_length (disc, i + first);
+ /* multiply by 1000 as we want the time in milliseconds
+ 80 min * 60 sec * 1000 ms * 75 sectors < 2^29 so 32 bit int ok */
+ length *= 1000;
+ length /= 75; /* divide by sectors / second */
+ disc_data->lengths[i] = length;
+ }
+ g_info ("Disc id %s\nSubmission URL %s\nDisc MCN %s",
+ disc_data->id,
+ disc_data->url,
+ disc_data->mcn);
+ } else {
+ disc_data->url = g_strdup ("");
+ disc_data->mcn = g_strdup ("");
+ disc_data->sectors = 0;
+ disc_data->track_count = 0;
+ disc_data->lengths = NULL;
+ g_info ("Forced discid %s", disc_data->id);
+ }
+
out:
+ for (i = 0; i < count; i++) {
+ g_free (values[i]);
+ }
discid_free (disc);
return disc_md;
}
@@ -356,28 +405,43 @@ filter_releases (Mb5Metadata disc_md,
DiscDetails *disc)
{
int i, j, count;
+ Mb5Disc mb_disc;
Mb5ReleaseList releases;
- releases = mb5_disc_get_releaselist (mb5_metadata_get_disc (disc_md));
- count = disc->count = mb5_release_list_size (releases);
- if (disc->count == 0)
+ /*
+ * If the discid matches a release MusicBrainz returns an Mb5Disc
+ * which contains an Mb5ReleaseList, if not the fuzzy match results
+ * are returned as an Mb5ReleaseList directly.
+ */
+ mb_disc = mb5_metadata_get_disc (disc_md);
+ if (mb_disc != NULL) {
+ g_info ("Exact discid match");
+ releases = mb5_disc_get_releaselist (mb_disc);
+ } else {
+ g_info ("Fuzzy TOC match");
+ releases = mb5_metadata_get_releaselist (disc_md);
+ }
+
+ count = disc->release_count = mb5_release_list_size (releases);
+ g_info ("%d matching releases", count);
+ if (disc->release_count == 0)
return;
- disc->release_ids = g_new0 (Mbid, disc->count);
+ disc->release_ids = g_new0 (Mbid, disc->release_count);
for (i = 0, j = 0; i < count; i++) {
Mb5Release release;
gchar barcode[16];
release = mb5_release_list_item (releases, i);
if (release == NULL) {
- disc->count--;
+ disc->release_count--;
continue;
}
mb5_release_get_id (release, disc->release_ids[j], sizeof(Mbid));
mb5_release_get_barcode (release, barcode, sizeof(barcode));
if (mcn_matches_barcode (disc->mcn, barcode)) {
- disc->count = 1;
+ disc->release_count = 1;
strcpy (disc->release_ids[0], disc->release_ids[j]);
break;
}
@@ -1122,6 +1186,140 @@ make_album_from_release (SjMetadataMusicbrainz5 *self,
return album;
}
+/*
+ * See if the discid of a medium matches the id of the disc being
+ * ripped. As the discid is a hash, check the sector count as well as
+ * the discid to filter out discid collisions.
+ *
+ * Returns:
+ *    1 if a disc of the medium has the same discid and a matching
+ *      sector count (or disc_details->sectors is 0, in which case the
+ *      sector count is not compared),
+ *   -1 if the discid matches but the sector count differs, i.e. a
+ *      discid collision,
+ *    0 if no disc of the medium has this discid.
+ *
+ * The result is tri-state, so the function is declared as returning
+ * gint rather than gboolean.
+ */
+static gint
+match_discid (DiscDetails *disc_details,
+              Mb5Medium    medium)
+{
+  Mb5DiscList disc_list;
+  Mb5Disc mb_disc;
+  char buffer[512];
+  int disc_count, i;
+  gboolean sector_match;
+
+  disc_list = mb5_medium_get_disclist (medium);
+  disc_count = mb5_disc_list_size (disc_list);
+  for (i = 0; i < disc_count; i++) {
+    mb_disc = mb5_disc_list_item (disc_list, i);
+    /* A sector count of 0 means "unknown", so treat it as matching */
+    sector_match = disc_details->sectors ?
+      mb5_disc_get_sectors (mb_disc) == disc_details->sectors :
+      TRUE;
+    mb5_disc_get_id (mb_disc, buffer, sizeof(buffer));
+    /* Ignore discs without an id; the first disc with our id decides */
+    if (*buffer && strcmp (buffer, disc_details->id) == 0)
+      return sector_match ? 1 : -1;
+  }
+  return 0;
+}
+
+/*
+ * Fuzzy track length matching. Calculate the sum of the squares of
+ * the difference between the track length on the disc and the track
+ * length reported by MusicBrainz. We use the square rather than abs()
+ * so one large difference is penalized more than several small ones.
+ *
+ * The first disc->track_count items of tracks are compared with
+ * disc->lengths; the caller must make sure the track list has at
+ * least that many items. If a track has no length of its own the
+ * length of its recording is used instead.
+ *
+ * Returns the sum of the squared differences, saturated to G_MAXUINT
+ * if squaring or summing would overflow a guint.
+ */
+static guint
+track_delta_sum (DiscDetails *disc,
+                 Mb5TrackList tracks)
+{
+  Mb5Recording recording;
+  Mb5Track track;
+  int count, i, length;
+  /* guint rather than the non-standard uint, matching the return type
+     and the pointer type expected by the g_uint_checked_* helpers */
+  guint delta, sum;
+
+  sum = 0;
+  count = disc->track_count;
+  for (i = 0; i < count; i++) {
+    track = mb5_track_list_item (tracks, i);
+    length = mb5_track_get_length (track);
+    if (length == 0) {
+      /* No length on the track itself, fall back to the recording */
+      recording = mb5_track_get_recording (track);
+      length = mb5_recording_get_length (recording);
+    }
+    delta = abs (disc->lengths[i] - length);
+    if (!g_uint_checked_mul (&delta, delta, delta)) {
+      sum = G_MAXUINT;
+      break;
+    }
+    if (!g_uint_checked_add (&sum, sum, delta)) {
+      sum = G_MAXUINT;
+      break;
+    }
+  }
+  return sum;
+}
+
+/*
+ * Return a GSList of the media in the release that match the disc
+ * being ripped. If MusicBrainz returned a fuzzy match then the discids
+ * of the media won't match the id of the disc being ripped, so match
+ * on track lengths instead.
+ *
+ * Each medium gets a score: 0 for an exact discid match, otherwise
+ * 1 + track_delta_sum(). Only the media sharing the lowest score are
+ * returned. A discid collision (same discid, different sector count)
+ * discards all fuzzy matches and restricts the result to exact
+ * matches.
+ *
+ * The list elements are Mb5Medium handles taken from the release's
+ * medium list; the caller frees the list itself with g_slist_free().
+ * Returns NULL if no medium matches.
+ */
+static GSList *
+get_matching_media (DiscDetails *disc,
+                    Mb5Release   release)
+{
+  Mb5MediumList media;
+  Mb5Medium medium;
+  Mb5TrackList tracks;
+  GSList *matches = NULL;
+  int i, medium_count, res;
+  guint best_sum, sum;
+
+  best_sum = G_MAXUINT;
+  media = mb5_release_get_mediumlist (release);
+  medium_count = mb5_medium_list_size (media);
+  for (i = 0; i < medium_count; i++) {
+    medium = mb5_medium_list_item (media, i);
+    tracks = mb5_medium_get_tracklist (medium);
+    /* Don't match track count with MUSICBRAINZ_FORCE_DISC_ID */
+    if (disc->track_count > 0 &&
+        mb5_track_list_size (tracks) != disc->track_count)
+      continue;
+
+    res = match_discid (disc, medium);
+    if (res < 0) {
+      /*
+       * Discid collision, clear fuzzy matches and only allow an
+       * (unlikely) exact match from another medium
+       */
+      if (best_sum != 0) {
+        best_sum = 0;
+        /* Free the list nodes instead of just dropping the pointer */
+        g_slist_free (matches);
+        matches = NULL;
+      }
+      g_info ("Discid collision: disc %2d", i + 1);
+      continue;
+    } else if (res > 0) {
+      /* Exact match, clear any fuzzy matches */
+      sum = 0;
+      g_info ("Exact match: disc %2d", i + 1);
+    } else {
+      /*
+       * Fuzzy match, skip if we're only interested in exact matches
+       * or MUSICBRAINZ_FORCE_DISC_ID is set
+       */
+      if (best_sum == 0 || disc->track_count == 0)
+        continue;
+
+      /* Add 1 so a fuzzy match can never score as well as an exact one */
+      if (!g_uint_checked_add (&sum, track_delta_sum (disc, tracks), 1))
+        continue;
+
+      g_info ("Fuzzy Match: disc: %2d, sum: %8u", i + 1, sum);
+    }
+
+    if (sum < best_sum) {
+      /* Strictly better match: release the worse matches first */
+      g_slist_free (matches);
+      matches = g_slist_prepend (NULL, medium);
+      best_sum = sum;
+    } else if (sum == best_sum) {
+      matches = g_slist_prepend (matches, medium);
+    }
+  }
+  return matches;
+}
+
/*
* Virtual methods
*/
@@ -1146,7 +1344,7 @@ mb5_list_albums (SjMetadata *metadata,
if (disc == NULL)
return NULL;
- for (i = 0; i < disc->count; i++) {
+ for (i = 0; i < disc->release_count; i++) {
AlbumDetails *album;
Mb5Release full_release = NULL;
Mb5Metadata release_md = NULL;
@@ -1165,10 +1363,8 @@ artist-rels";
if (release_md && mb5_metadata_get_release (release_md))
full_release = mb5_metadata_get_release (release_md);
if (full_release) {
- Mb5MediumList media;
Mb5Metadata group_md = NULL;
Mb5ReleaseGroup group;
- int j;
group = mb5_release_get_releasegroup (full_release);
if (group) {
@@ -1177,6 +1373,7 @@ artist-rels";
* lookup_release query doesn't have the url relations for the
* release-group, so run a separate query to get these urls
*/
+ GSList *media, *medium;
char *releasegroupid = NULL;
const char *group_includes = "artists url-rels";
@@ -1191,26 +1388,21 @@ artist-rels";
if (group_md && mb5_metadata_get_releasegroup (group_md))
group = mb5_metadata_get_releasegroup (group_md);
- media = mb5_release_media_matching_discid (full_release, disc->id);
- for (j = 0; j < mb5_medium_list_size (media); j++) {
- Mb5Medium medium;
-
- medium = mb5_medium_list_item (media, j);
- if (medium) {
- album = make_album_from_release (self, group, full_release, medium, cancellable, error);
- if (*error != NULL) {
- mb5_metadata_delete (group_md);
- mb5_medium_list_delete (media);
- mb5_metadata_delete (release_md);
- goto free_releases;
- }
-
- album->metadata_source = SOURCE_MUSICBRAINZ;
- albums = g_list_append (albums, album);
+ media = get_matching_media (disc, full_release);
+ for (medium = media; medium; medium = medium->next) {
+ album = make_album_from_release (self, group, full_release, medium->data, cancellable, error);
+ if (*error != NULL) {
+ mb5_metadata_delete (group_md);
+ g_slist_free (media);
+ mb5_metadata_delete (release_md);
+ goto free_releases;
}
+
+ album->metadata_source = SOURCE_MUSICBRAINZ;
+ albums = g_list_prepend (albums, album);
}
mb5_metadata_delete (group_md);
- mb5_medium_list_delete (media);
+ g_slist_free (media);
}
mb5_metadata_delete (release_md);
}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]