[devdocsgjs/main: 1340/1867] Cleanup and improve TypeScript scraper




commit 7e36848e4515913a0466c82e4ad4c71003a7cf7c
Author: Phil Scherer <pnscher evoforge org>
Date:   Sun Dec 6 06:39:30 2020 +0000

    Cleanup and improve TypeScript scraper

 lib/docs/filters/typescript/clean_html.rb | 34 ++++++++++++-------------------
 lib/docs/filters/typescript/entries.rb    |  9 ++------
 lib/docs/scrapers/typescript.rb           | 20 ++++++++++--------
 3 files changed, 26 insertions(+), 37 deletions(-)
---
diff --git a/lib/docs/filters/typescript/clean_html.rb b/lib/docs/filters/typescript/clean_html.rb
index 49d4a89f..de52765a 100644
--- a/lib/docs/filters/typescript/clean_html.rb
+++ b/lib/docs/filters/typescript/clean_html.rb
@@ -2,20 +2,23 @@ module Docs
   class Typescript
     class CleanHtmlFilter < Filter
       def call
+        root_page? ? root : other
+        doc
+      end
 
-        # Top menu bar
-        css('#top-menu').remove
-        css('.skip-to-main').remove
+      def root
+        header = at_css('h1')
+        header.parent.before(header).remove
 
-        # Sidebar
-        css('#sidebar').remove
+        css('h4').each do |node|
+          node.name = 'h2'
+        end
+      end
 
-        # Pound symbol before each title
-        css('.anchor').remove
+      def other
+        @doc = at_css('article > .whitespace > .markdown')
 
-        css('#handbook-content > h2').each do |node|
-          node.name = 'h1'
-        end
+        css('.anchor').remove
 
         css('a:contains("Try")').remove
         css('pre').each do |node|
@@ -23,17 +26,6 @@ module Docs
           node['data-language'] = 'typescript'
           node.remove_attribute('class')
         end
-
-        # 'Next' title area
-        css('.whitespace-tight').remove
-
-        # Right side floating box
-        css('.handbook-toc').remove
-
-        css('#site-footer').remove
-
-        doc
-
       end
     end
   end
diff --git a/lib/docs/filters/typescript/entries.rb b/lib/docs/filters/typescript/entries.rb
index f6c8d94d..eec6439e 100644
--- a/lib/docs/filters/typescript/entries.rb
+++ b/lib/docs/filters/typescript/entries.rb
@@ -3,7 +3,6 @@ module Docs
     class EntriesFilter < Docs::EntriesFilter
 
       def get_name
-        return 'Typescript' if current_url == root_url
         return at_css('h2').content
       end
 
@@ -12,13 +11,9 @@ module Docs
       end
 
       def additional_entries
-        entries = []
-
-        css('h2').each do |node|
-            entries << [node.content, node['id'], name]
+        css('h2').each_with_object [] do |node,entries|
+          entries << [node.content, node['id'], name]
         end
-
-        entries
       end
 
     end
diff --git a/lib/docs/scrapers/typescript.rb b/lib/docs/scrapers/typescript.rb
index 5d52b723..80493e08 100644
--- a/lib/docs/scrapers/typescript.rb
+++ b/lib/docs/scrapers/typescript.rb
@@ -3,23 +3,25 @@ module Docs
     self.name = 'TypeScript'
     self.type = 'simple'
     self.release = '4.1.2'
-    self.base_url = 'https://www.typescriptlang.org/docs/handbook'
+    self.base_url = 'https://www.typescriptlang.org/docs/handbook/'
     self.root_path = 'index.html'
     self.links = {
       home: 'https://www.typescriptlang.org',
       code: 'https://github.com/Microsoft/TypeScript'
     }
 
-    html_filters.push 'typescript/entries', 'typescript/clean_html'
+    html_filters.push 'typescript/entries', 'typescript/clean_html', 'title'
+
+    options[:container] = 'main'
 
     options[:skip] = [
-      '/react-&-webpack.html',
-      '/asp-net-core.html',
-      '/gulp.html',
-      '/dom-manipulation.html',
-      '/migrating-from-javascript.html',
-      '/babel-with-typescript.html',
-      '/intro.html'
+      'react-&-webpack.html',
+      'asp-net-core.html',
+      'gulp.html',
+      'dom-manipulation.html',
+      'migrating-from-javascript.html',
+      'babel-with-typescript.html',
+      'intro.html'
     ]
 
     options[:skip_patterns] = [


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]