[devdocsgjs/main: 1339/1867] Clean up, version, and improve Relay scraper




commit 7c01b590f07b2bb8bb6ee649e82314844804c8fd
Author: Phil Scherer <pnscher evoforge org>
Date:   Sat Dec 5 06:34:00 2020 +0000

    Clean up, version, and improve Relay scraper

 lib/docs/filters/relay/clean_html.rb | 31 ++++---------------
 lib/docs/filters/relay/entries.rb    | 58 +++++++++++++++++-------------------
 lib/docs/scrapers/relay.rb           | 34 +++++++++++----------
 3 files changed, 50 insertions(+), 73 deletions(-)
---
diff --git a/lib/docs/filters/relay/clean_html.rb b/lib/docs/filters/relay/clean_html.rb
index f18d30ba..e3e7c3a1 100644
--- a/lib/docs/filters/relay/clean_html.rb
+++ b/lib/docs/filters/relay/clean_html.rb
@@ -2,38 +2,17 @@ module Docs
   class Relay
     class CleanHtmlFilter < Filter
       def call
+        @doc = at_css('.post')
 
-        if slug == 'index'
-          css('img').remove
+        header = at_css('h1')
+        header.parent.before(header).remove
 
-          css('.projectTitle').each do |node|
-            node.name = 'h1'
-            node.content = 'Relay'
-          end
-
-          css('pre').remove
-
-        end
-
-        css('.docLastUpdate').remove
-
-        css('.docs-prevnext').remove
-
-        css('.edit-page-link').remove
+        css('footer').remove
 
         css('h2, h3').each do |node|
-          node.css('a').remove
-          node['id'] = node.content.gsub(/\s/, '-').downcase
+          node['id'] = node.at_css('a.anchor')['id']
         end
 
-        css('.onPageNav').remove
-
-        css('#docsNav').remove
-
-        css('.fixedHeaderContainer').remove
-
-        css('footer').remove
-
         # syntax highlight
         css('pre').each do |node|
           node['data-language'] = 'javascript'
diff --git a/lib/docs/filters/relay/entries.rb b/lib/docs/filters/relay/entries.rb
index 7f7c6859..99f33543 100644
--- a/lib/docs/filters/relay/entries.rb
+++ b/lib/docs/filters/relay/entries.rb
@@ -1,51 +1,47 @@
 module Docs
   class Relay
     class EntriesFilter < Docs::EntriesFilter
-
-      def get_name
-        if slug == 'index'
-          return 'Relay'
+      ONLY_SECTIONS = ['API Reference', 'Principles & Architecture']
+      ONLY_SLUGS = []
+
+      def call
+        if root_page?
+          css('.navGroup > h3').each do |node|
+            next if not ONLY_SECTIONS.include? node.content
+            node.next_element.css('a').each do |anchor|
+              ONLY_SLUGS << anchor['href'].split('/').last.strip
+            end
+          end
         end
+        super
+      end
 
+      def get_name
         at_css('h1').content
       end
 
       def get_type
-        if slug == 'index'
-          return 'Relay'
-        end
-
         at_css('h1').content
       end
 
-      def additional_entries
-        entries = []
-
-        if slug == 'index'
-          return entries
-        end
-
-        ## avoid adding non-desired entries removing tags
-        # remove header which contains a <h2> tag
-        css('.fixedHeaderContainer').remove
+      def include_default_entry?
+        ONLY_SLUGS.include? slug
+      end
 
-        # remove table of content whose title is an <h2> tag
-        css('.toc').remove
-        ##
+      def additional_entries
+        return [] if not include_default_entry?
 
-        css('h2, h3').each do |node|
-          next if node.content.include?('Argument')
-          entry_name = node.content
+        css('article h2, article h3').each_with_object [] do |node, entries|
+          next if node.content.include?('Argument') ||
+                  node.content.starts_with?('Example')
 
-          if entry_name.include?('(')
-            entry_name = entry_name.match(/.*\(/)[0] + ')'
+          name = node.content
+          if name.include?('(')
+            name = name.match(/.*\(/)[0] + ')'
           end
-
-          entry_id = node.content.gsub(/\s/, '-').downcase
-          entries << [entry_name, entry_id]
+          id = node.at_css('a.anchor')['id']
+          entries << [name, id]
         end
-
-        entries
       end
 
     end
diff --git a/lib/docs/scrapers/relay.rb b/lib/docs/scrapers/relay.rb
index 8d01b3bc..0b3f6b8a 100644
--- a/lib/docs/scrapers/relay.rb
+++ b/lib/docs/scrapers/relay.rb
@@ -1,9 +1,7 @@
 module Docs
   class Relay < UrlScraper
     self.type = 'simple'
-    self.release = '10.1.0'
-    self.base_url = 'https://relay.dev'
-    self.root_path = 'index.html'
+    self.root_path = 'introduction-to-relay'
     self.links = {
       home: 'https://relay.dev/',
       code: 'https://github.com/facebook/relay'
@@ -11,19 +9,7 @@ module Docs
 
     html_filters.push 'relay/entries', 'relay/clean_html'
 
-    options[:only] = [
-      '/docs/en/graphql-in-relay',
-      '/docs/en//relay-environment',
-      '/docs/en/network-layer',
-      '/docs/en/query-renderer',
-      '/docs/en/fragment-container',
-      '/docs/en/refetch-container',
-      '/docs/en/pagination-container',
-      '/docs/en/mutations',
-      '/docs/en/subscriptions',
-      '/docs/en/relay-store',
-      '/docs/en/fetch-query'
-    ]
+    options[:skip] = %w(videos)
 
     options[:attribution] = <<-HTML
       &copy; 2020&ndash;present Facebook Inc.<br>
@@ -34,5 +20,21 @@ module Docs
       get_latest_github_release('facebook', 'relay', opts)
     end
 
+    version '10' do
+      self.release = '10.1.0'
+      self.base_url = "https://relay.dev/docs/en/";
+      # For some reason, the most-recent version isn't available at a versioned URL
+    end
+
+    version '9' do
+      self.release = '9.1.0'
+      self.base_url = "https://relay.dev/docs/en/v#{self.release}/";
+    end
+
+    version '8' do
+      self.release = '8.0.0'
+      self.base_url = "https://relay.dev/docs/en/v#{self.release}/";
+    end
+
   end
 end


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]