[devdocsgjs/main: 608/1867] basic scrapping working
- From: Andy Holmes <andyholmes src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [devdocsgjs/main: 608/1867] basic scrapping working
- Date: Fri, 19 Nov 2021 23:47:18 +0000 (UTC)
commit cda737ceec8992a1af0307b7f80278a1d6af7125
Author: Mathieu PATUREL <australie p gmail com>
Date: Fri Apr 26 20:04:13 2019 +1000
basic scrapping working
.gitignore | 1 +
lib/docs/filters/trio/clean_html.rb | 21 +++++++++++
lib/docs/filters/trio/entries.rb | 21 +++++++++++
lib/docs/scrapers/trio.rb | 24 +++++++++++++
public/docs/docs.json | 70 ++++++++++++++++++++++++++++++++++++-
5 files changed, 136 insertions(+), 1 deletion(-)
---
diff --git a/.gitignore b/.gitignore
index 1060fcf0..27f04dd4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ public/fonts
public/docs/**/*
docs/**/*
!docs/*.md
+vendor
diff --git a/lib/docs/filters/trio/clean_html.rb b/lib/docs/filters/trio/clean_html.rb
new file mode 100644
index 00000000..5c2ef228
--- /dev/null
+++ b/lib/docs/filters/trio/clean_html.rb
@@ -0,0 +1,21 @@
+module Docs
+ class Trio
+ class CleanHtmlFilter < Filter
+ def call
+ @doc = at_css('div[role="main"]')
+ css('.section, [itemprop=articleBody]').each do |node|
+ node.replace node.children
+ end
+
+ css('.headerlink').remove
+
+ css('dt').each do |node|
+ new_node = doc.document.create_element "h3"
+ new_node.content = node.inner_text[0...-1]
+ node.replace new_node
+ end
+ doc
+ end
+ end
+ end
+end
diff --git a/lib/docs/filters/trio/entries.rb b/lib/docs/filters/trio/entries.rb
new file mode 100644
index 00000000..64387d6e
--- /dev/null
+++ b/lib/docs/filters/trio/entries.rb
@@ -0,0 +1,21 @@
+module Docs
+ class Trio
+ class EntriesFilter < Docs::EntriesFilter
+ def get_name
+ at_css('h1').text[0...-1]
+ end
+
+ def get_type
+ at_css('h1').text[0...-1]
+ end
+
+ def additional_entries
+ css('.descname').each_with_object [] do |node, entries|
+ name = node.previous.text + node.text
+ id = node.parent['id']
+ entries << [name, id]
+ end
+ end
+ end
+ end
+end
diff --git a/lib/docs/scrapers/trio.rb b/lib/docs/scrapers/trio.rb
new file mode 100644
index 00000000..cf90f77d
--- /dev/null
+++ b/lib/docs/scrapers/trio.rb
@@ -0,0 +1,24 @@
+module Docs
+ class Trio < UrlScraper
+ self.type = 'simple'
+ self.release = '0.11'
+ self.base_url = 'https://trio.readthedocs.io/en/latest/'
+ self.root_path = 'index.html'
+ self.links = {
+ home: 'https://trio.readthedocs.io/',
+ code: 'https://github.com/python-trio/trio'
+ }
+
+ html_filters.push 'trio/entries', 'trio/clean_html'
+
+ options[:attribution] = <<-HTML
+ HTML
+ options[:only_patterns] = [
+ /reference-core/,
+ /reference-io/,
+ /reference-testing/,
+ /reference-hazmat/,
+ ]
+
+ end
+end
diff --git a/public/docs/docs.json b/public/docs/docs.json
index 0637a088..eff8727c 100644
--- a/public/docs/docs.json
+++ b/public/docs/docs.json
@@ -1 +1,69 @@
-[]
\ No newline at end of file
+[
+ {
+ "name": "Chef",
+ "slug": "chef~12",
+ "type": "sphinx_simple",
+ "links": {
+ "home": "https://www.chef.io/",
+ "code": "https://github.com/chef/chef"
+ },
+ "version": "12",
+ "release": "12.13",
+ "mtime": 1556264506,
+ "db_size": 7170006
+ },
+ {
+ "name": "CSS",
+ "slug": "css",
+ "type": "mdn",
+ "mtime": 1543099045,
+ "db_size": 12415944
+ },
+ {
+ "name": "DOM",
+ "slug": "dom",
+ "type": "mdn",
+ "mtime": 1543157862,
+ "db_size": 33998524
+ },
+ {
+ "name": "DOM Events",
+ "slug": "dom_events",
+ "type": "mdn",
+ "mtime": 1543099589,
+ "db_size": 1752500
+ },
+ {
+ "name": "HTML",
+ "slug": "html",
+ "type": "mdn",
+ "mtime": 1543097764,
+ "db_size": 4141596
+ },
+ {
+ "name": "HTTP",
+ "slug": "http",
+ "type": "mdn",
+ "mtime": 1543099392,
+ "db_size": 4731727
+ },
+ {
+ "name": "JavaScript",
+ "slug": "javascript",
+ "type": "mdn",
+ "mtime": 1543098529,
+ "db_size": 6462141
+ },
+ {
+ "name": "Trio",
+ "slug": "trio",
+ "type": "simple",
+ "links": {
+ "home": "https://trio.readthedocs.io/",
+ "code": "https://github.com/python-trio/trio"
+ },
+ "release": "0.11",
+ "mtime": 1556272773,
+ "db_size": 736670
+ }
+]
\ No newline at end of file
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]