[devdocsgjs/main: 1340/1867] Cleanup and improve TypeScript scraper
- From: Andy Holmes <andyholmes src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [devdocsgjs/main: 1340/1867] Cleanup and improve TypeScript scraper
- Date: Fri, 19 Nov 2021 23:47:54 +0000 (UTC)
commit 7e36848e4515913a0466c82e4ad4c71003a7cf7c
Author: Phil Scherer <pnscher evoforge org>
Date: Sun Dec 6 06:39:30 2020 +0000
Cleanup and improve TypeScript scraper
lib/docs/filters/typescript/clean_html.rb | 34 ++++++++++++-------------------
lib/docs/filters/typescript/entries.rb | 9 ++------
lib/docs/scrapers/typescript.rb | 20 ++++++++++--------
3 files changed, 26 insertions(+), 37 deletions(-)
---
diff --git a/lib/docs/filters/typescript/clean_html.rb b/lib/docs/filters/typescript/clean_html.rb
index 49d4a89f..de52765a 100644
--- a/lib/docs/filters/typescript/clean_html.rb
+++ b/lib/docs/filters/typescript/clean_html.rb
@@ -2,20 +2,23 @@ module Docs
class Typescript
class CleanHtmlFilter < Filter
def call
+ root_page? ? root : other
+ doc
+ end
- # Top menu bar
- css('#top-menu').remove
- css('.skip-to-main').remove
+ def root
+ header = at_css('h1')
+ header.parent.before(header).remove
- # Sidebar
- css('#sidebar').remove
+ css('h4').each do |node|
+ node.name = 'h2'
+ end
+ end
- # Pound symbol before each title
- css('.anchor').remove
+ def other
+ @doc = at_css('article > .whitespace > .markdown')
- css('#handbook-content > h2').each do |node|
- node.name = 'h1'
- end
+ css('.anchor').remove
css('a:contains("Try")').remove
css('pre').each do |node|
@@ -23,17 +26,6 @@ module Docs
node['data-language'] = 'typescript'
node.remove_attribute('class')
end
-
- # 'Next' title area
- css('.whitespace-tight').remove
-
- # Right side floating box
- css('.handbook-toc').remove
-
- css('#site-footer').remove
-
- doc
-
end
end
end
diff --git a/lib/docs/filters/typescript/entries.rb b/lib/docs/filters/typescript/entries.rb
index f6c8d94d..eec6439e 100644
--- a/lib/docs/filters/typescript/entries.rb
+++ b/lib/docs/filters/typescript/entries.rb
@@ -3,7 +3,6 @@ module Docs
class EntriesFilter < Docs::EntriesFilter
def get_name
- return 'Typescript' if current_url == root_url
return at_css('h2').content
end
@@ -12,13 +11,9 @@ module Docs
end
def additional_entries
- entries = []
-
- css('h2').each do |node|
- entries << [node.content, node['id'], name]
+ css('h2').each_with_object [] do |node,entries|
+ entries << [node.content, node['id'], name]
end
-
- entries
end
end
diff --git a/lib/docs/scrapers/typescript.rb b/lib/docs/scrapers/typescript.rb
index 5d52b723..80493e08 100644
--- a/lib/docs/scrapers/typescript.rb
+++ b/lib/docs/scrapers/typescript.rb
@@ -3,23 +3,25 @@ module Docs
self.name = 'TypeScript'
self.type = 'simple'
self.release = '4.1.2'
- self.base_url = 'https://www.typescriptlang.org/docs/handbook'
+ self.base_url = 'https://www.typescriptlang.org/docs/handbook/'
self.root_path = 'index.html'
self.links = {
home: 'https://www.typescriptlang.org',
code: 'https://github.com/Microsoft/TypeScript'
}
- html_filters.push 'typescript/entries', 'typescript/clean_html'
+ html_filters.push 'typescript/entries', 'typescript/clean_html', 'title'
+
+ options[:container] = 'main'
options[:skip] = [
- '/react-&-webpack.html',
- '/asp-net-core.html',
- '/gulp.html',
- '/dom-manipulation.html',
- '/migrating-from-javascript.html',
- '/babel-with-typescript.html',
- '/intro.html'
+ 'react-&-webpack.html',
+ 'asp-net-core.html',
+ 'gulp.html',
+ 'dom-manipulation.html',
+ 'migrating-from-javascript.html',
+ 'babel-with-typescript.html',
+ 'intro.html'
]
options[:skip_patterns] = [
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]