[odrs-web] Deduplicate the app_id values in the reviews table



commit 5e0b9b5077f9eef696ebc70721d501d159aa53fc
Author: Richard Hughes <richard hughsie com>
Date:   Thu Jul 4 11:27:30 2019 +0100

    Deduplicate the app_id values in the reviews table
    
    This will allow us to cache some of the slow-to-generate stats counters and
    also further deduplicate the list of IDs.

 app_data/migrations/versions/e6fa15874247_.py | 63 +++++++++++++++++++++++++++
 app_data/odrs/models.py                       | 28 ++++++++++--
 app_data/odrs/templates/components.html       | 29 ++++++++++++
 app_data/odrs/templates/default.html          |  1 +
 app_data/odrs/templates/show-all.html         |  2 +-
 app_data/odrs/templates/show.html             |  4 +-
 app_data/odrs/views_admin.py                  | 30 ++++++++++---
 app_data/odrs/views_api.py                    | 45 ++++++++++++++-----
 8 files changed, 177 insertions(+), 25 deletions(-)
---
diff --git a/app_data/migrations/versions/e6fa15874247_.py b/app_data/migrations/versions/e6fa15874247_.py
new file mode 100644
index 0000000..3e1c316
--- /dev/null
+++ b/app_data/migrations/versions/e6fa15874247_.py
@@ -0,0 +1,63 @@
+"""
+
+Revision ID: e6fa15874247
+Revises: b2d75ba212ed
+Create Date: 2019-07-04 10:44:23.739416
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = 'e6fa15874247'
+down_revision = 'b2d75ba212ed'
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import mysql
+from sqlalchemy.exc import InternalError
+
+from odrs import db
+from odrs.models import Review, Component
+
+def upgrade():
+
+    try:
+        op.create_table('components',
+        sa.Column('component_id', sa.Integer(), nullable=False),
+        sa.Column('app_id', sa.Text(), nullable=True),
+        sa.Column('review_cnt', sa.Integer(), nullable=True),
+        sa.PrimaryKeyConstraint('component_id'),
+        sa.UniqueConstraint('component_id'),
+        mysql_character_set='utf8mb4'
+        )
+        op.add_column('reviews', sa.Column('component_id', sa.Integer(), nullable=False))
+        op.create_foreign_key(None, 'reviews', 'components', ['component_id'], ['component_id'])
+    except InternalError as e:
+        print(str(e))
+
+    # get existing components
+    app_ids = {}
+    for component in db.session.query(Component).all():
+        app_ids[component.app_id] = component
+
+    # add any extra we need, incrementing the count otherwise
+    reviews = db.session.query(Review).all()
+    for review in reviews:
+        if review._app_id not in app_ids:
+            print('adding', review._app_id)
+            component = Component(review._app_id)
+            db.session.add(component)
+            app_ids[component.app_id] = component
+        else:
+            component = app_ids[review._app_id]
+            component.review_cnt += 1
+    db.session.commit()
+
+    # fix up the component_id on the existing reviews
+    for review in reviews:
+        review.component_id = app_ids[review._app_id].component_id
+    db.session.commit()
+
+def downgrade():
+    #op.drop_constraint(None, 'reviews', type_='foreignkey')
+    op.drop_column('reviews', 'component_id')
+    op.drop_table('components')
diff --git a/app_data/odrs/models.py b/app_data/odrs/models.py
index 0926a72..e2fa43f 100644
--- a/app_data/odrs/models.py
+++ b/app_data/odrs/models.py
@@ -134,6 +134,25 @@ class User(db.Model):
 def _tokenize(val):
     return [token.lower() for token in re.findall(r"[\w']+", val)]
 
+class Component(db.Model):
+
+    # sqlalchemy metadata
+    __tablename__ = 'components'
+    __table_args__ = {'mysql_character_set': 'utf8mb4'}
+
+    component_id = Column(Integer, primary_key=True, nullable=False, unique=True)
+    app_id = Column(Text)
+    review_cnt = Column(Integer, default=1)
+
+    reviews = relationship('Review', back_populates='component')
+
+    def __init__(self, app_id):
+        self.app_id = app_id
+        self.review_cnt = 1
+
+    def __repr__(self):
+        return 'Component object %s' % self.component_id
+
 class Review(db.Model):
 
     # sqlalchemy metadata
@@ -143,7 +162,8 @@ class Review(db.Model):
     review_id = Column(Integer, primary_key=True, nullable=False, unique=True)
     date_created = Column(DateTime, nullable=False, default=datetime.datetime.utcnow)
     date_deleted = Column(DateTime)
-    app_id = Column(Text)
+    _app_id = Column('app_id', Text)
+    component_id = Column(Integer, ForeignKey('components.component_id'), nullable=False)
     locale = Column(Text)
     summary = Column(Text)
     description = Column(Text)
@@ -158,12 +178,12 @@ class Review(db.Model):
     reported = Column(Integer, default=0)
 
     user = relationship('User', back_populates='reviews')
+    component = relationship('Component', back_populates='reviews', lazy='joined')
     votes = relationship('Vote',
                          back_populates='review',
                          cascade='all,delete-orphan')
 
     def __init__(self):
-        self.app_id = None
         self.locale = None
         self.summary = None
         self.description = None
@@ -213,7 +233,7 @@ class Review(db.Model):
 
     def asdict(self, user_hash=None):
         item = {
-            'app_id': self.app_id,
+            'app_id': self.component.app_id,
             'date_created': self.date_created.timestamp(),
             'description': self.description,
             'distro': self.distro,
@@ -229,7 +249,7 @@ class Review(db.Model):
             'version': self.version,
         }
         if user_hash:
-            item['user_skey'] = _get_user_key(user_hash, self.app_id)
+            item['user_skey'] = _get_user_key(user_hash, self.component.app_id)
         return item
 
     def __repr__(self):
diff --git a/app_data/odrs/templates/components.html b/app_data/odrs/templates/components.html
new file mode 100644
index 0000000..b299713
--- /dev/null
+++ b/app_data/odrs/templates/components.html
@@ -0,0 +1,29 @@
+{% extends "default.html" %}
+{% block title %}Components{% endblock %}
+
+{% block content %}
+
+<h2>Components</h2>
+
+{% if components|length == 0 %}
+<p>
+  There are no components stored.
+</p>
+{% else %}
+<table class="table table-hover table-responsive">
+  <tr class="row">
+    <th class="col-sm-1">AppStream ID</th>
+    <th class="col-sm-2">Review Count</th>
+    <th class="col-sm-2">&nbsp;</th>
+  </tr>
+{% for component in components %}
+  <tr class="row">
+    <td>{{component.app_id}}</td>
+    <td>{{component.review_cnt}}</td>
+    <td>&nbsp;</td>
+  </tr>
+{% endfor %}
+</table>
+{% endif %}
+
+{% endblock %}
diff --git a/app_data/odrs/templates/default.html b/app_data/odrs/templates/default.html
index 087a9b6..4e86e92 100644
--- a/app_data/odrs/templates/default.html
+++ b/app_data/odrs/templates/default.html
@@ -40,6 +40,7 @@
             <li><a href="{{url_for('.admin_users_all')}}">Users</a></li>
             <li><a href="{{url_for('.admin_moderator_show_all')}}">Moderators</a></li>
             <li><a href="{{url_for('.admin_taboo_show_all')}}">Taboos</a></li>
+            <li><a href="{{url_for('.admin_component_show_all')}}">Components</a></li>
             <li><a href="{{url_for('.admin_distros')}}">Distributions</a></li>
             <li><a href="{{url_for('.admin_graph_month')}}">Usage</a></li>
             <li><a href="{{url_for('.admin_search')}}">Search</a></li>
diff --git a/app_data/odrs/templates/show-all.html b/app_data/odrs/templates/show-all.html
index 19059fc..b2dde95 100644
--- a/app_data/odrs/templates/show-all.html
+++ b/app_data/odrs/templates/show-all.html
@@ -21,7 +21,7 @@
 
 {% for r in reviews %}
   <tr class="row">
-    <td>{{r.app_id.replace('.desktop', '').replace('.Application', '')}}</td>
+    <td>{{r.component.app_id}}</td>
     <td>{{r.version}}</td>
     <td>
       {{format_rating(r.rating)}}
diff --git a/app_data/odrs/templates/show.html b/app_data/odrs/templates/show.html
index 7566232..54af798 100644
--- a/app_data/odrs/templates/show.html
+++ b/app_data/odrs/templates/show.html
@@ -8,8 +8,8 @@
 <div class="card">
   <div class="card-body">
     <h1 class="card-title">
-      {{r.app_id}}
-      <a class="btn pull-right" href="{{url_for('.admin_show_app', app_id=r.app_id)}}">All</a>
+      {{r.component.app_id}}
+      <a class="btn pull-right" href="{{url_for('.admin_show_app', app_id=r.component.app_id)}}">All</a>
     </h1>
 {% for taboo in matched_taboos %}
     <div class="alert alert-{{taboo.color}}" role="alert">
diff --git a/app_data/odrs/views_admin.py b/app_data/odrs/views_admin.py
index 8dd6418..46c1268 100644
--- a/app_data/odrs/views_admin.py
+++ b/app_data/odrs/views_admin.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 #
-# pylint: disable=invalid-name,missing-docstring,chained-comparison
+# pylint: disable=invalid-name,missing-docstring,chained-comparison,singleton-comparison
 #
 # Copyright (C) 2015-2019 Richard Hughes <richard hughsie com>
 #
@@ -17,7 +17,7 @@ from flask import abort, request, flash, render_template, redirect, url_for
 from flask_login import login_required, current_user
 
 from odrs import app, db
-from .models import Review, User, Moderator, Vote, Taboo
+from .models import Review, User, Moderator, Vote, Taboo, Component
 from .models import _vote_exists
 from .util import _get_datestr_from_dt, _get_taboos_for_locale
 
@@ -208,7 +208,7 @@ def admin_show_stats():
     stats['Unique distros'] = rs.fetchone()[0]
 
     # unique apps
-    rs = db.session.execute("SELECT COUNT(DISTINCT(app_id)) FROM reviews;") # pylint: disable=no-member
+    rs = db.session.execute("SELECT COUNT(*) FROM components;") # pylint: disable=no-member
     stats['Unique apps reviewed'] = rs.fetchone()[0]
 
     # unique distros
@@ -226,10 +226,11 @@ def admin_show_stats():
                                 "GROUP BY app_id ORDER BY total DESC LIMIT 50;")
 
     # popularity reviews
-    submitted = db.session.execute("SELECT DISTINCT app_id, COUNT(app_id) as total " # pylint: 
disable=no-member
-                                   "FROM eventlog WHERE app_id IS NOT NULL "
-                                   "AND message='reviewed' GROUP BY app_id "
-                                   "ORDER BY total DESC LIMIT 50;")
+    submitted = db.session.query(Component.app_id, Component.review_cnt).\
+                                    filter(Component.app_id != None).\
+                                    order_by(Component.review_cnt.desc()).\
+                                    limit(50).all()
+
     return render_template('stats.html',
                            results_stats=stats,
                            results_viewed=viewed,
@@ -698,6 +699,21 @@ def admin_taboo_delete(taboo_id):
     flash('Deleted taboo')
     return redirect(url_for('.admin_taboo_show_all'))
 
+@app.route('/admin/component/all')
+@login_required
+def admin_component_show_all():
+    """
+    Return all the components.
+    """
+    # security check
+    if not current_user.is_admin:
+        flash('Unable to show all components', 'error')
+        return redirect(url_for('.odrs_index'))
+    components = db.session.query(Component).\
+                order_by(Component.app_id.asc()).\
+                order_by(Component.review_cnt.asc()).all()
+    return render_template('components.html', components=components)
+
 @app.route('/admin/vote/<review_id>/<val_str>')
 @login_required
 def admin_vote(review_id, val_str):
diff --git a/app_data/odrs/views_api.py b/app_data/odrs/views_api.py
index ad511cf..5ec19f1 100644
--- a/app_data/odrs/views_api.py
+++ b/app_data/odrs/views_api.py
@@ -20,7 +20,7 @@ from flask import request, Response
 
 from odrs import app, db
 
-from .models import Review, User, Vote, Analytic, Taboo
+from .models import Review, User, Vote, Analytic, Taboo, Component
 from .models import _vote_exists
 from .util import json_success, json_error, _locale_is_compatible, _eventlog_add, _get_user_key, 
_get_datestr_from_dt
 from .util import _sanitised_version, _sanitised_summary, _sanitised_description, _get_rating_for_app_id
@@ -104,7 +104,8 @@ def api_submit():
 
     # user has already reviewed
     if db.session.query(Review).\
-            filter(Review.app_id == item['app_id']).\
+            join(Component).\
+            filter(Component.app_id == item['app_id']).\
             filter(Review.user_id == user.user_id).first():
         _eventlog_add(_get_client_address(),
                       user.user_id,
@@ -112,9 +113,26 @@ def api_submit():
                       'already reviewed')
         return json_error('already reviewed this app')
 
+    # this is basically a clunky UPSERT that works with MySQL
+    stmt = insert(Component).values(app_id=item['app_id'])
+    if db.session.bind.dialect.name != 'sqlite': # pylint: disable=no-member
+        stmt_ondupe = stmt.on_duplicate_key_update(review_cnt=Component.review_cnt + 1)
+    else:
+        stmt_ondupe = stmt
+    try:
+        db.session.execute(stmt_ondupe) # pylint: disable=no-member
+        db.session.commit()
+    except IntegrityError as e:
+        print('ignoring: {}'.format(str(e)))
+
+    # component definately exists now!
+    component = db.session.query(Component).filter(Component.app_id == item['app_id']).first()
+    if not component:
+        return json_error('cannot create component for {}'.format(item['app_id']))
+
     # create new
     review = Review()
-    review.app_id = item['app_id']
+    review._app_id = item['app_id'] # pylint: disable=protected-access
     review.locale = item['locale']
     review.summary = _sanitised_summary(item['summary'])
     review.description = _sanitised_description(item['description'])
@@ -123,6 +141,7 @@ def api_submit():
     review.distro = item['distro']
     review.rating = item['rating']
     review.user_addr = _get_client_address()
+    review.component_id = component.component_id
 
     # check if valid
     user_display_ignore = ['root',
@@ -140,7 +159,7 @@ def api_submit():
     # log and add
     _eventlog_add(_get_client_address(),
                   review.user_id,
-                  review.app_id,
+                  component.app_id,
                   'reviewed')
     db.session.add(review)
     db.session.commit()
@@ -153,7 +172,8 @@ def api_show_app(app_id, user_hash=None):
     Return details about an application.
     """
     reviews = db.session.query(Review).\
-                    filter(Review.app_id == app_id).\
+                    join(Component).\
+                    filter(Component.app_id == app_id).\
                     filter(Review.reported < ODRS_REPORTED_CNT).\
                     order_by(Review.date_created.desc()).all()
     items = [review.asdict(user_hash) for review in reviews]
@@ -199,7 +219,8 @@ def api_fetch():
     if 'compat_ids' in item:
         app_ids.extend(item['compat_ids'])
     reviews = db.session.query(Review).\
-                    filter(Review.app_id.in_(app_ids)).\
+                    join(Component).\
+                    filter(Component.app_id.in_(app_ids)).\
                     filter(Review.reported < ODRS_REPORTED_CNT).all()
 
     # if user does not exist then create
@@ -334,7 +355,9 @@ def _vote(val):
     # update the per-user karma
     user.karma += val
 
-    review = db.session.query(Review).filter(Review.app_id == item['app_id']).first()
+    review = db.session.query(Review).\
+                join(Component).\
+                filter(Component.app_id == item['app_id']).first()
     if not review:
         _eventlog_add(_get_client_address(), user.user_id, None,
                       'invalid review ID of %s' % item['app_id'], important=True)
@@ -416,7 +439,7 @@ def api_remove():
                 filter(Review.user_id == user.user_id).first()
     if not review:
         return json_error('no review')
-    if review.app_id != item['app_id']:
+    if review.component.app_id != item['app_id']:
         return json_error('the app_id is invalid')
 
     if item['user_skey'] != _get_user_key(item['user_hash'], item['app_id']):
@@ -464,9 +487,9 @@ def api_ratings():
     Get the star ratings for all known applications.
     """
     item = {}
-    app_ids = [res[0] for res in db.session.query(Review.app_id).\
-                                       order_by(Review.app_id.asc()).\
-                                       distinct(Review.app_id).all()]
+    app_ids = [res[0] for res in db.session.query(Component.app_id).\
+                                       order_by(Component.app_id.asc()).\
+                                       distinct(Component.app_id).all()]
     for app_id in app_ids:
         ratings = _get_rating_for_app_id(app_id, 2)
         if len(ratings) == 0:


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]