[devdocsgjs/main: 608/1867] basic scrapping working




commit cda737ceec8992a1af0307b7f80278a1d6af7125
Author: Mathieu PATUREL <australie p gmail com>
Date:   Fri Apr 26 20:04:13 2019 +1000

    basic scrapping working

 .gitignore                          |  1 +
 lib/docs/filters/trio/clean_html.rb | 21 +++++++++++
 lib/docs/filters/trio/entries.rb    | 21 +++++++++++
 lib/docs/scrapers/trio.rb           | 24 +++++++++++++
 public/docs/docs.json               | 70 ++++++++++++++++++++++++++++++++++++-
 5 files changed, 136 insertions(+), 1 deletion(-)
---
diff --git a/.gitignore b/.gitignore
index 1060fcf0..27f04dd4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ public/fonts
 public/docs/**/*
 docs/**/*
 !docs/*.md
+vendor
diff --git a/lib/docs/filters/trio/clean_html.rb b/lib/docs/filters/trio/clean_html.rb
new file mode 100644
index 00000000..5c2ef228
--- /dev/null
+++ b/lib/docs/filters/trio/clean_html.rb
@@ -0,0 +1,21 @@
+module Docs
+  class Trio
+    class CleanHtmlFilter < Filter
+      def call
+        @doc = at_css('div[role="main"]')
+        css('.section, [itemprop=articleBody]').each do |node|
+          node.replace node.children
+        end
+
+        css('.headerlink').remove
+
+        css('dt').each do |node|
+          new_node = doc.document.create_element "h3"
+          new_node.content = node.inner_text[0...-1]
+          node.replace new_node
+        end
+        doc
+      end
+    end
+  end
+end
diff --git a/lib/docs/filters/trio/entries.rb b/lib/docs/filters/trio/entries.rb
new file mode 100644
index 00000000..64387d6e
--- /dev/null
+++ b/lib/docs/filters/trio/entries.rb
@@ -0,0 +1,21 @@
+module Docs
+  class Trio
+    class EntriesFilter < Docs::EntriesFilter
+      def get_name
+        at_css('h1').text[0...-1]
+      end
+
+      def get_type
+        at_css('h1').text[0...-1]
+      end
+
+      def additional_entries
+        css('.descname').each_with_object [] do |node, entries|
+          name = node.previous.text + node.text
+          id = node.parent['id']
+          entries << [name, id]
+        end
+      end
+    end
+  end
+end
diff --git a/lib/docs/scrapers/trio.rb b/lib/docs/scrapers/trio.rb
new file mode 100644
index 00000000..cf90f77d
--- /dev/null
+++ b/lib/docs/scrapers/trio.rb
@@ -0,0 +1,24 @@
+module Docs
+  class Trio < UrlScraper
+    self.type = 'simple'
+    self.release = '0.11'
+    self.base_url = 'https://trio.readthedocs.io/en/latest/'
+    self.root_path = 'index.html'
+    self.links = {
+      home: 'https://trio.readthedocs.io/',
+      code: 'https://github.com/python-trio/trio'
+    }
+
+    html_filters.push 'trio/entries', 'trio/clean_html'
+
+    options[:attribution] = <<-HTML
+    HTML
+    options[:only_patterns] = [
+      /reference-core/,
+      /reference-io/,
+      /reference-testing/,
+      /reference-hazmat/,
+    ]
+
+  end
+end
diff --git a/public/docs/docs.json b/public/docs/docs.json
index 0637a088..eff8727c 100644
--- a/public/docs/docs.json
+++ b/public/docs/docs.json
@@ -1 +1,69 @@
-[]
\ No newline at end of file
+[
+  {
+    "name": "Chef",
+    "slug": "chef~12",
+    "type": "sphinx_simple",
+    "links": {
+      "home": "https://www.chef.io/",
+      "code": "https://github.com/chef/chef"
+    },
+    "version": "12",
+    "release": "12.13",
+    "mtime": 1556264506,
+    "db_size": 7170006
+  },
+  {
+    "name": "CSS",
+    "slug": "css",
+    "type": "mdn",
+    "mtime": 1543099045,
+    "db_size": 12415944
+  },
+  {
+    "name": "DOM",
+    "slug": "dom",
+    "type": "mdn",
+    "mtime": 1543157862,
+    "db_size": 33998524
+  },
+  {
+    "name": "DOM Events",
+    "slug": "dom_events",
+    "type": "mdn",
+    "mtime": 1543099589,
+    "db_size": 1752500
+  },
+  {
+    "name": "HTML",
+    "slug": "html",
+    "type": "mdn",
+    "mtime": 1543097764,
+    "db_size": 4141596
+  },
+  {
+    "name": "HTTP",
+    "slug": "http",
+    "type": "mdn",
+    "mtime": 1543099392,
+    "db_size": 4731727
+  },
+  {
+    "name": "JavaScript",
+    "slug": "javascript",
+    "type": "mdn",
+    "mtime": 1543098529,
+    "db_size": 6462141
+  },
+  {
+    "name": "Trio",
+    "slug": "trio",
+    "type": "simple",
+    "links": {
+      "home": "https://trio.readthedocs.io/",
+      "code": "https://github.com/python-trio/trio"
+    },
+    "release": "0.11",
+    "mtime": 1556272773,
+    "db_size": 736670
+  }
+]
\ No newline at end of file


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]