[gnome-music/wip/jfelder/songeditor-gtk4: 9/10] tagsimilarity: Add logic to compute the similarity between songs




commit 98152a45201365ff2cf34d49022c14e0a40e9e03
Author: Jean Felder <jfelder src gnome org>
Date:   Wed May 20 01:49:14 2020 +0200

    tagsimilarity: Add logic to compute the similarity between songs
    
    The main function is song_similarity. It checks the main tags of a
    song to compute a similarity score based on the similarity score of
    each of the tags. Some weights are used to give more importance to the
    most important tags.
    
    There are three different similarity algorithms:
     - similarity between two strings
     - similarity between two numbers
     - similarity between two dates

 gnomemusic/tagsimilarity.py | 180 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 180 insertions(+)
---
diff --git a/gnomemusic/tagsimilarity.py b/gnomemusic/tagsimilarity.py
new file mode 100644
index 000000000..e511f8b79
--- /dev/null
+++ b/gnomemusic/tagsimilarity.py
@@ -0,0 +1,180 @@
+# Copyright 2022 The GNOME Music developers
+#
+# GNOME Music is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# GNOME Music is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with GNOME Music; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# The GNOME Music authors hereby grant permission for non-GPL compatible
+# GStreamer plugins to be used and distributed together with GStreamer
+# and GNOME Music.  This permission is above and beyond the permissions
+# granted by the GPL license by which GNOME Music is covered.  If you
+# modify this code, you may extend this exception to your version of the
+# code, but you are not obligated to do so.  If you do not wish to do so,
+# delete this exception statement from your version.
+
+import re
+from difflib import SequenceMatcher
+
+from gi.repository import GLib, Grl
+
+WEIGHTS = {
+    "title": 25,
+    "album": 15,
+    "artist": 10,
+    "date": 8,
+    "album-artist": 6,
+    "track-number": 4,
+    "album-disc-number": 4
+}
+
+
+def string_similarity(str_a: str, str_b: str) -> float:
+    """Compute the similarity between two strings.
+
+    The strings are lowercased and all the non-word characters are
+    removed to prevent wrong results.
+
+    1 means that the strings are identical.
+    0 means that the strings are completely different.
+
+    :param str str_a: first string
+    :param str str_b: second string
+    :returns: similarity score between 0 and 1
+    :rtype: float
+    """
+    words_regex = re.compile(r"\w+")
+    words_a = re.findall(words_regex, str_a.lower())
+    words_b = re.findall(words_regex, str_b.lower())
+
+    return SequenceMatcher(None, words_a, words_b).ratio()
+
+
+def number_similarity(nr_a: int, nr_b: int) -> float:
+    """Compute the similarity between two numbers.
+
+    1 means that the numbers are identical.
+    0 means that the numbers are different.
+
+    :param int nr_a: first number
+    :param int nr_b: second number
+    :returns: similarity score between 0 and 1
+    :rtype: float
+    """
+    if nr_a == nr_b:
+        return 1.0
+
+    return 0.0
+
+
+def date_similarity(date_a: GLib.DateTime, date_b: GLib.DateTime) -> float:
+    """Compute the similarity between two dates.
+
+    1 means that the year and month are identical.
+    0.95 means that only the years are identical.
+    0.65 means that the years difference is lower than 3 years.
+    0.25 means that the years difference is 3 years or more.
+    0 means that at least one of the dates is missing.
+
+    :param GLib.DateTime date_a: first date
+    :param GLib.DateTime date_b: second date
+    :returns: similarity score between 0 and 1
+    :rtype: float
+    """
+    if (not date_a
+            or not date_b):
+        return 0.0
+
+    year_a = date_a.get_year()
+    year_b = date_b.get_year()
+    if (not year_a
+            or not year_b):
+        return 0.0
+
+    month_a = date_a.get_month()
+    month_b = date_b.get_month()
+    if year_a == year_b:
+        if month_a == month_b:
+            return 1.0
+        return 0.95
+
+    if abs(year_a - year_b) < 3:
+        return 0.65
+
+    return 0.25
+
+
+def song_similarity(media_ref: Grl.Media, media_cmp: Grl.Media) -> float:
+    """Compute a similarity score between two audio medias.
+
+    It checks the main tags of a song to compute a similarity
+    score based on the similarity score of each of the tags. Some
+    weights are used to give more importance to the most important
+    tags.
+    media_ref is used as a reference media. It means that a tag
+    similarity score won't be computed if it's not available in
+    media_ref.
+
+    A high score means a good similarity.
+
+    :param Grl.Media media_ref: The reference media
+    :param Grl.Media media_cmp: The compared media
+    :returns: a similarity score between media_ref and media_cmp
+    :rtype: float
+    """
+    score = 0.0
+
+    if media_ref.get_title():
+        new_title = (media_cmp.get_title()
+                     or "")
+        title_score = string_similarity(media_ref.get_title(), new_title)
+        score += title_score * WEIGHTS["title"]
+
+    if media_ref.get_album():
+        new_album = (media_cmp.get_album()
+                     or "")
+        album_score = string_similarity(media_ref.get_album(), new_album)
+        score += album_score * WEIGHTS["album"]
+
+    if media_ref.get_artist():
+        new_artist = (media_cmp.get_artist()
+                      or "")
+        artist_score = string_similarity(media_ref.get_artist(), new_artist)
+        score += artist_score * WEIGHTS["artist"]
+
+    if media_ref.get_album_artist():
+        new_album_artist = (media_cmp.get_album_artist()
+                            or "")
+        album_artist_score = string_similarity(
+            media_ref.get_album_artist(), new_album_artist)
+        score += album_artist_score * WEIGHTS["album-artist"]
+
+    if media_ref.get_track_number():
+        new_track_nr = (media_cmp.get_track_number()
+                        or 0)
+        track_nr_score = number_similarity(
+            media_ref.get_track_number(), new_track_nr)
+        score += track_nr_score * WEIGHTS["track-number"]
+
+    if media_ref.get_album_disc_number():
+        new_album_disc_nr = (media_cmp.get_album_disc_number()
+                             or 0)
+        album_disc_nr_score = number_similarity(
+            media_ref.get_album_disc_number(), new_album_disc_nr)
+        score += album_disc_nr_score * WEIGHTS["album-disc-number"]
+
+    if media_ref.get_creation_date():
+        date_score = date_similarity(
+            media_ref.get_creation_date(), media_cmp.get_creation_date())
+        score += date_score * WEIGHTS["date"]
+
+    return score


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]