[geary/wip/714134-gc] A great first start that seems to work well.



commit 92d07aacbc8edb8315a1382bced7ed3e8948e72f
Author: Jim Nelson <jim yorba org>
Date:   Wed Dec 17 17:41:21 2014 -0800

    A great first start that seems to work well.
    
    Note: This commit will upgrade your database and delete a lot of email
    and attachments from your local disk.  Test with caution!  Backup your
    mail directory!

 sql/CMakeLists.txt                       |    1 +
 sql/version-024.sql                      |   11 +
 src/CMakeLists.txt                       |    1 +
 src/engine/imap-db/imap-db-database.vala |   24 ++
 src/engine/imap-db/imap-db-gc.vala       |  398 ++++++++++++++++++++++++++++++
 5 files changed, 435 insertions(+), 0 deletions(-)
---
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index 40184ce..11de86c 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -23,3 +23,4 @@ install(FILES version-020.sql DESTINATION ${SQL_DEST})
 install(FILES version-021.sql DESTINATION ${SQL_DEST})
 install(FILES version-022.sql DESTINATION ${SQL_DEST})
 install(FILES version-023.sql DESTINATION ${SQL_DEST})
+install(FILES version-024.sql DESTINATION ${SQL_DEST})
diff --git a/sql/version-024.sql b/sql/version-024.sql
new file mode 100644
index 0000000..4925784
--- /dev/null
+++ b/sql/version-024.sql
@@ -0,0 +1,11 @@
+--
+-- Add DeleteAttachmentFileTable, a queue of on-disk attachment files awaiting deletion.  The
+-- garbage collector inserts a file's path here in the same transaction that removes the message's
+-- rows, and only deletes the file afterwards, so a failed transaction never loses a file.
+--
+
+CREATE TABLE DeleteAttachmentFileTable (
+    id INTEGER PRIMARY KEY,
+    filename TEXT NOT NULL
+)
+
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index dbd4d98..251b3b0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -168,6 +168,7 @@ engine/imap-db/imap-db-contact.vala
 engine/imap-db/imap-db-database.vala
 engine/imap-db/imap-db-email-identifier.vala
 engine/imap-db/imap-db-folder.vala
+engine/imap-db/imap-db-gc.vala
 engine/imap-db/imap-db-message-addresses.vala
 engine/imap-db/imap-db-message-row.vala
 engine/imap-db/imap-db-search-query.vala
diff --git a/src/engine/imap-db/imap-db-database.vala b/src/engine/imap-db/imap-db-database.vala
index 533209f..2459313 100644
--- a/src/engine/imap-db/imap-db-database.vala
+++ b/src/engine/imap-db/imap-db-database.vala
@@ -13,6 +13,8 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
     private ProgressMonitor upgrade_monitor;
     private string account_owner_email;
     private bool new_db = false;
+    private GC? gc = null;
+    private Cancellable gc_cancellable = new Cancellable();
     
     public Database(File db_dir, File schema_dir, ProgressMonitor upgrade_monitor,
         string account_owner_email) {
@@ -35,6 +37,28 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
     public new void open(Db.DatabaseFlags flags, Cancellable? cancellable) throws Error {
         open_background(flags, on_prepare_database_connection, pump_event_loop,
             OPEN_PUMP_EVENT_LOOP_MSEC, cancellable);
+        // once open_background() returns, start a low-priority garbage collection run
+        gc = new GC(this, Priority.LOW);
+        gc.run_async.begin(gc_cancellable, on_gc_run_async_completed);
+    }
+    
+    private void on_gc_run_async_completed(Object? object, AsyncResult result) {
+        // collect the outcome of the run started in open(); a failure is only logged
+        try {
+            gc.run_async.end(result);
+        } catch (Error gc_err) {
+            debug("Garbage collection of IMAP database %s completed with error: %s",
+                db_file.get_path(), gc_err.message);
+        }
+        // GC keeps a reference to this database, so release it to break the reference cycle
+        gc = null;
+    }
+    
+    public override void close(Cancellable? cancellable) throws Error {
+        gc_cancellable.cancel();
+        gc_cancellable = new Cancellable();
+        // any GC run was cancelled above; the replacement Cancellable is for the next open()
+        base.close(cancellable);
     }
     
     private void pump_event_loop() {
diff --git a/src/engine/imap-db/imap-db-gc.vala b/src/engine/imap-db/imap-db-gc.vala
new file mode 100644
index 0000000..f2245ce
--- /dev/null
+++ b/src/engine/imap-db/imap-db-gc.vala
@@ -0,0 +1,398 @@
+/* Copyright 2014 Yorba Foundation
+ *
+ * This software is licensed under the GNU Lesser General Public License
+ * (version 2.1 or later).  See the COPYING file in this distribution.
+ */
+
+/**
+ * IMAP database garbage collector.
+ *
+ * Currently the garbage collector reaps messages unlinked from the MessageLocationTable older than
+ * a prescribed date.  It also removes their on-disk attachment files (in a transaction-safe manner)
+ * and looks for empty directories in the attachment directory tree (caused by attachment files
+ * being removed without deleting their parents).
+ *
+ * The garbage collector is designed to run in the background and in such a way that it can be
+ * closed (even by application shutdown) and re-run later without the database going incoherent.
+ */
+
+private class Geary.ImapDB.GC {
+    // Days old from today an unlinked email message must be to be reaped by the garbage collector
+    private const int UNLINKED_DAYS = 31;
+    
+    // Amount of time to sleep between various GC iterations to give other operations a chance
+    private const uint SLEEP_MSEC = 50;
+    
+    // Number of files to delete from the DeleteAttachmentFileTable per iteration
+    private const int DELETE_ATTACHMENT_PER = 5;
+    
+    // Number of files to enumerate per time when walking a directory's children
+    private const int ENUM_DIR_PER = 10;
+    
+    /**
+     * Indicates the garbage collector is running.
+     *
+     * {@link run_async} will return immediately if called while running.
+     */
+    public bool is_running { get; private set; default = false; }
+    
+    private ImapDB.Database db;
+    private int priority;
+    private File data_dir;
+    
+    public GC(ImapDB.Database db, int priority) {
+        this.priority = priority;
+        this.db = db;
+        this.data_dir = db.db_file.get_parent(); // attachment paths are built from this directory
+    }
+    
+    /**
+     * Runs one garbage collection pass; returns immediately if a pass is already in progress.
+     * Should only be called from the foreground thread.
+     */
+    public async void run_async(Cancellable? cancellable) throws Error {
+        if (!is_running) {
+            is_running = true;
+            try {
+                debug("[%s] Starting garbage collection of IMAP database", to_string());
+                yield internal_run_async(cancellable);
+                debug("[%s] Completed garbage collection of IMAP database", to_string());
+            } finally {
+                is_running = false;
+            }
+        }
+    }
+    
+    // Runs one full garbage collection pass in four steps:
+    //   1. find messages unlinked from MessageLocationTable that are older than the reap date
+    //   2. reap them, one transaction per message
+    //   3. delete the attachment files queued for deletion by step 2
+    //   4. prune empty directories from the attachment directory tree
+    // Cancellation (IOError.CANCELLED) aborts the pass, which may safely be re-run later.
+    private async void internal_run_async(Cancellable? cancellable) throws Error {
+        // reap date: UNLINKED_DAYS days before the current local time
+        DateTime now = new DateTime.now_local();
+        DateTime reap_date = now.add_days(-UNLINKED_DAYS);
+        
+        debug("[%s] Garbage collector reaping date: %s (%s)", to_string(), reap_date.to_string(),
+            reap_date.to_unix().to_string());
+        
+        //
+        // Step 1: collect the ids of messages that have no MessageLocationTable row and are older
+        // than the reap date.  The age limit is needed because the local store may not be fully
+        // synchronized with the server: a message received in the Inbox and then archived is
+        // completely unlinked locally if Geary was closed before the engine could synchronize with
+        // All Mail, yet it still exists on the server.  The delay gives some "breathing room"; if
+        // such a message is reaped and detected later, the engine merely re-downloads it.  Removing
+        // messages that are absent from MessageLocationTable always leaves the db coherent.
+        //
+        // A NULL internaldate_time_t marks a row that was allocated but never downloaded (rare, as
+        // internaldate is the first thing downloaded); those rows are reaped too.
+        //
+        
+        Gee.HashSet<int64?> unlinked_ids = new Gee.HashSet<int64?>(Collection.int64_hash_func,
+            Collection.int64_equal_func);
+        
+        yield db.exec_transaction_async(Db.TransactionType.RO, (cx) => {
+            Db.Statement select = cx.prepare("""
+                SELECT id
+                FROM MessageTable
+                WHERE (internaldate_time_t IS NULL OR internaldate_time_t <= ?)
+                AND NOT EXISTS (
+                    SELECT message_id
+                    FROM MessageLocationTable
+                    WHERE MessageLocationTable.message_id = MessageTable.id
+                )
+            """);
+            select.bind_int64(0, reap_date.to_unix());
+            
+            Db.Result row = select.exec(cancellable);
+            for (; !row.finished; row.next(cancellable))
+                unlinked_ids.add(row.rowid_at(0));
+            
+            return Db.TransactionOutcome.DONE;
+        }, cancellable);
+        
+        message("[%s] Found %d email messages ready for reaping", to_string(), unlinked_ids.size);
+        
+        //
+        // Step 2: reap the messages one at a time, each in its own transaction, so the database
+        // lock is never held for long.  This is slow but acceptable for a background task, and it
+        // lets the pass be interrupted at any point (i.e. the user exits the application) without
+        // leaving the database incoherent; a later pass picks up where this one stopped.
+        //
+        
+        int reaped = 0;
+        foreach (int64 message_id in unlinked_ids) {
+            try {
+                yield reap_message_async(message_id, cancellable);
+                reaped++;
+            } catch (Error err) {
+                // cancellation aborts the whole pass; any other failure only skips this message
+                if (err is IOError.CANCELLED)
+                    throw err;
+                
+                message("[%s] Unable to reap message #%s: %s", to_string(), message_id.to_string(),
+                    err.message);
+            }
+            
+            yield Scheduler.sleep_ms_async(SLEEP_MSEC);
+        }
+        
+        message("[%s] Reaped %d email messages", to_string(), reaped);
+        
+        //
+        // Step 3: delete the attachment files queued in DeleteAttachmentFileTable.  They are added
+        // in the same transaction that reaps their message (see reap_message_async()), so they are
+        // known to be ready for deletion, and files left by an interrupted pass are handled too.
+        //
+        
+        int files_deleted = 0;
+        // a batch of zero means the table has been drained
+        int batch = yield delete_attachment_files(DELETE_ATTACHMENT_PER, cancellable);
+        while (batch != 0) {
+            files_deleted += batch;
+            
+            yield Scheduler.sleep_ms_async(SLEEP_MSEC);
+            batch = yield delete_attachment_files(DELETE_ATTACHMENT_PER, cancellable);
+        }
+        
+        message("[%s] Deleted %d attachment files from reaped messages", to_string(), files_deleted);
+        
+        //
+        // Step 4: remove any empty directories from the attachment directory tree, since older code
+        // only removed the files themselves.
+        //
+        
+        int dirs_deleted = yield delete_empty_attachment_directories_async(null, null, cancellable);
+        
+        message("[%s] Deleted %d empty attachment directories", to_string(), dirs_deleted);
+    }
+    
+    // Reaps a single message inside one read-write transaction.  After re-checking that the
+    // message is still unlinked, deletes its rows from MessageSearchTable, MessageAttachmentTable
+    // and MessageTable and queues its on-disk attachment files in DeleteAttachmentFileTable.  If
+    // the message has gained a MessageLocationTable row, the transaction is rolled back instead.
+    private async void reap_message_async(int64 message_id, Cancellable? cancellable) throws Error {
+        yield db.exec_transaction_async(Db.TransactionType.RW, (cx) => {
+            // Time passes between locating reapable messages and removing them, so re-check
+            // within the transaction that the message is still absent from MessageLocationTable.
+            Db.Statement query = cx.prepare("""
+                SELECT id
+                FROM MessageLocationTable
+                WHERE message_id = ?
+            """);
+            query.bind_rowid(0, message_id);
+            
+            // any row at all means the message is no longer unlinked
+            Db.Result rows = query.exec(cancellable);
+            if (!rows.finished) {
+                debug("[%s] Not garbage collection message #%s: found linked in location table",
+                    to_string(), message_id.to_string());
+                
+                return Db.TransactionOutcome.ROLLBACK;
+            }
+            
+            //
+            // Work out where each attachment lives on disk while its row still exists
+            //
+            
+            Gee.ArrayList<File> doomed = new Gee.ArrayList<File>();
+            query = cx.prepare("""
+                SELECT id, filename
+                FROM MessageAttachmentTable
+                WHERE message_id = ?
+            """);
+            query.bind_rowid(0, message_id);
+            
+            rows = query.exec(cancellable);
+            for (; !rows.finished; rows.next(cancellable)) {
+                doomed.add(Attachment.generate_file(data_dir, message_id, rows.rowid_for("id"),
+                    rows.string_for("filename")));
+            }
+            
+            //
+            // Remove the message from the search table ...
+            //
+            
+            query = cx.prepare("""
+                DELETE FROM MessageSearchTable
+                WHERE docid = ?
+            """);
+            query.bind_rowid(0, message_id);
+            
+            query.exec(cancellable);
+            
+            //
+            // ... then its rows in the attachment table ...
+            //
+            
+            query = cx.prepare("""
+                DELETE FROM MessageAttachmentTable
+                WHERE message_id = ?
+            """);
+            query.bind_rowid(0, message_id);
+            
+            query.exec(cancellable);
+            
+            //
+            // ... and finally the message row itself
+            //
+            
+            query = cx.prepare("""
+                DELETE FROM MessageTable
+                WHERE id = ?
+            """);
+            query.bind_rowid(0, message_id);
+            
+            query.exec(cancellable);
+            
+            //
+            // Queue the on-disk files for deletion; delete_attachment_files() removes them later
+            //
+            
+            foreach (File file in doomed) {
+                query = cx.prepare("""
+                    INSERT INTO DeleteAttachmentFileTable (filename)
+                    VALUES (?)
+                """);
+                query.bind_string(0, file.get_path());
+                
+                query.exec(cancellable);
+            }
+            
+            //
+            // Done; other than its on-disk attachment files, the message is now garbage collected.
+            //
+            
+            return Db.TransactionOutcome.COMMIT;
+        }, cancellable);
+    }
+    
+    // Processes up to limit rows of DeleteAttachmentFileTable in a single read-write transaction:
+    // tries to delete each listed file, then removes the processed rows from the table.  Returns
+    // the number of rows processed (a file that could not be deleted still counts), which is 0
+    // when limit is not positive or the table is empty.
+    private async int delete_attachment_files(int limit, Cancellable? cancellable) throws Error {
+        if (limit <= 0)
+            return 0;
+        
+        int deleted = 0;
+        yield db.exec_transaction_async(Db.TransactionType.RW, (cx) => {
+            Db.Statement select = cx.prepare("""
+                SELECT id, filename
+                FROM DeleteAttachmentFileTable
+                LIMIT ?
+            """);
+            select.bind_int(0, limit);
+            
+            // DELETE statement that will list the id of every row handled below
+            StringBuilder delete_sql = new StringBuilder("""
+                DELETE FROM DeleteAttachmentFileTable
+                WHERE id IN (
+            """);
+            
+            bool need_comma = false;
+            Db.Result row = select.exec(cancellable);
+            for (; !row.finished; row.next(cancellable)) {
+                int64 row_id = row.rowid_at(0);
+                File doomed = File.new_for_path(row.string_at(1));
+                
+                // best effort: a failed delete is logged and the row is dropped from the table
+                // anyway; only cancellation is propagated
+                try {
+                    doomed.delete(cancellable);
+                } catch (Error err) {
+                    if (err is IOError.CANCELLED)
+                        throw err;
+                    
+                    debug("[%s] Unable to delete reaped attachment file \"%s\": %s", to_string(),
+                        doomed.get_path(), err.message);
+                }
+                
+                // ids in the IN (...) list are separated by commas
+                if (need_comma)
+                    delete_sql.append(", ");
+                delete_sql.append(row_id.to_string());
+                need_comma = true;
+                
+                deleted++;
+            }
+            
+            // close the IN (...) list; the DELETE is only run when at least one row was handled
+            delete_sql.append(")");
+            if (deleted > 0)
+                cx.exec(delete_sql.str);
+            
+            return Db.TransactionOutcome.COMMIT;
+        }, cancellable);
+        
+        return deleted;
+    }
+    
+    // Recursively deletes empty directories beneath current (the attachments root when null) and
+    // returns how many were deleted.  empty stays true unless current still holds a file or a
+    // child directory that was not deleted.  A missing directory counts as empty, not an error.
+    private async int delete_empty_attachment_directories_async(File? current, out bool empty,
+        Cancellable? cancellable) throws Error {
+        File current_dir = current ?? Attachment.get_attachments_dir(data_dir);
+        empty = true;
+        // only the name and type of each child are read below, so request just those attributes
+        FileEnumerator? file_enum = null;
+        try {
+            file_enum = yield current_dir.enumerate_children_async(
+                FileAttribute.STANDARD_NAME + "," + FileAttribute.STANDARD_TYPE,
+                FileQueryInfoFlags.NOFOLLOW_SYMLINKS, priority, cancellable);
+        } catch (Error enum_err) {
+            // a directory that does not exist has nothing to clean up; don't fail the whole pass
+            if (enum_err is IOError.NOT_FOUND)
+                return 0;
+            throw enum_err;
+        }
+        
+        int deleted = 0;
+        for (;;) {
+            List<FileInfo> infos = yield file_enum.next_files_async(ENUM_DIR_PER, priority, cancellable);
+            if (infos.length() == 0)
+                break;
+            foreach (FileInfo info in infos) {
+                if (info.get_file_type() != FileType.DIRECTORY) {
+                    empty = false;
+                    continue;
+                }
+                File child = current_dir.get_child(info.get_name());
+                bool child_empty;
+                deleted += yield delete_empty_attachment_directories_async(child, out child_empty,
+                    cancellable);
+                if (!child_empty) {
+                    empty = false;
+                    continue;
+                }
+                
+                string? failure = null;
+                try {
+                    if (!yield child.delete_async(priority, cancellable))
+                        failure = "delete indicates not empty";
+                } catch (Error err) {
+                    if (err is IOError.CANCELLED)
+                        throw err;
+                    failure = err.message;
+                }
+                if (failure == null) {
+                    deleted++;
+                } else {
+                    message("[%s] Unable to delete empty attachment directory \"%s\": %s",
+                        to_string(), child.get_path(), failure);
+                    empty = false;
+                }
+            }
+        }
+        yield file_enum.close_async(priority, cancellable);
+        return deleted;
+    }
+    
+    public string to_string() {
+        return "GC:%s".printf(db.db_file.get_path()); // tag used to prefix this collector's log lines
+    }
+}
+


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]