[devdocsgjs/main: 777/1867] scala: finish scraper and filters
- From: Andy Holmes <andyholmes src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [devdocsgjs/main: 777/1867] scala: finish scraper and filters
- Date: Fri, 19 Nov 2021 23:47:28 +0000 (UTC)
commit 6614375671eeedb7634034a7e36f966b5072610e
Author: Jasper van Merle <jaspervmerle gmail com>
Date: Sun Aug 11 19:53:12 2019 +0200
scala: finish scraper and filters
lib/docs/filters/scala/clean_html.rb | 114 +++++++++++++++++--------------
lib/docs/filters/scala/clean_html_210.rb | 32 ---------
lib/docs/filters/scala/clean_html_212.rb | 36 ----------
lib/docs/filters/scala/entries.rb | 48 +++++++++++--
lib/docs/scrapers/scala.rb | 98 +++++++++++---------------
5 files changed, 145 insertions(+), 183 deletions(-)
---
diff --git a/lib/docs/filters/scala/clean_html.rb b/lib/docs/filters/scala/clean_html.rb
index 95097c80..0320932d 100644
--- a/lib/docs/filters/scala/clean_html.rb
+++ b/lib/docs/filters/scala/clean_html.rb
@@ -2,97 +2,107 @@ module Docs
class Scala
class CleanHtmlFilter < Filter
def call
+ @doc = at_css('#content')
+
always
+ add_title
- if slug == 'index'
- root
- else
- other
- end
+ doc
end
def always
- # remove deprecated sections
+ # Remove deprecated sections
css('.members').each do |members|
header = members.at_css('h3')
members.remove if header.text.downcase.include? 'deprecate'
end
- # Some of this is just for 2.12
- # These are things that provide interactive features, which are not supported yet.
- css('#subpackage-spacer, #search, #mbrsel, .diagram-btn').remove
- css('#footer').remove
- css('.toggleContainer').remove
+
+ css('#mbrsel, #footer').remove
+
+ css('.diagram-container').remove
+ css('.toggleContainer > .toggle').each do |node|
+ title = node.at_css('span')
+ next if title.nil?
+
+ content = node.at_css('.hiddenContent')
+ next if content.nil?
+
+ title.name = 'dt'
+
+ content.remove_attribute('class')
+ content.remove_attribute('style')
+ content.name = 'dd'
+
+ attributes = at_css('.attributes')
+ unless attributes.nil?
+ title.parent = attributes
+ content.parent = attributes
+ end
+ end
signature = at_css('#signature')
- signature.replace %Q|
- <h2 id="signature">#{signature.inner_html}</h2>
- |
+ signature.replace "<h2 id=\"signature\">#{signature.inner_html}</h2>"
css('div.members > h3').each do |node|
- change_tag! 'h2', node
+ node.name = 'h2'
end
css('div.members > ol').each do |list|
list.css('li').each do |li|
h3 = doc.document.create_element 'h3'
+ h3['id'] = li['name'].rpartition('#').last unless li['name'].nil?
+
li.prepend_child h3
li.css('.shortcomment').remove
+
modifier = li.at_css('.modifier_kind')
- modifier.parent = h3 if modifier
+ modifier.parent = h3 unless modifier.nil?
+
+ kind = li.at_css('.modifier_kind .kind')
+ kind.content = kind.content + ' ' unless kind.nil?
+
symbol = li.at_css('.symbol')
- symbol.parent = h3 if symbol
+ symbol.parent = h3 unless symbol.nil?
+
li.swap li.children
end
+
list.swap list.children
end
- pres = css('.fullcomment pre, .fullcommenttop pre')
- pres.each do |pre|
+ css('.fullcomment pre, .fullcommenttop pre').each do |pre|
pre['data-language'] = 'scala'
+ pre.content = pre.content
end
- pres.add_class 'language-scala'
-
-
-
- doc
-
- end
-
- def root
- css('#filter').remove # these are filters to search through the types and packages
- css('#library').remove # these are icons at the top
- doc
- end
- def other
- # these are sections of the documentation which do not seem useful
+ # Sections of the documentation which do not seem useful
%w(#inheritedMembers #groupedMembers .permalink .hiddenContent .material-icons).each do |selector|
css(selector).remove
end
- # This is the kind of thing we have, class, object, trait
- kind = at_css('.modifier_kind .kind').content
- # this image replacement doesn't do anything on 2.12 docs
- img = at_css('img')
- img.replace %Q|<span class="img_kind">#{kind}</span>| unless img.nil?
- class_to_add = kind == 'object' ? 'value': 'type'
+ # Things that are not shown on the site, like deprecated members
+ css('li[visbl=prt]').remove
+ end
+
+ def add_title
+ css('.permalink').remove
- # for 2.10, 2.11, the kind class is associated to the body. we have to
- # add it somewhere, so we do that with the #definition.
- definition = css('#definition')
- definition.css('.big_circle').remove
- definition.add_class class_to_add
+ definition = at_css('#definition')
+ return if definition.nil?
- # this is something that is not shown on the site, such as deprecated members
- css('li[visbl=prt]').remove
+ type_full_name = {a: 'Annotation', c: 'Class', t: 'Trait', o: 'Object', p: 'Package'}
+ type = type_full_name[definition.at_css('.big-circle').text.to_sym]
+ name = CGI.escapeHTML definition.at_css('h1').text
- doc
- end
+ package = definition.at_css('#owner').text rescue ''
+ package = package + '.' unless name.empty? || package.empty?
- private
+ other = definition.at_css('.morelinks').dup
+ other_content = other ? "<h3>#{other.to_html}</h3>" : ''
- def change_tag!(new_tag, node)
- node.replace %Q|<#{new_tag}>#{node.inner_html}</#{new_tag}>|
+ title_content = root_page? ? 'Package root' : "#{type} #{package}#{name}".strip
+ title = "<h1>#{title_content}</h1>"
+ definition.replace title + other_content
end
end
end
diff --git a/lib/docs/filters/scala/entries.rb b/lib/docs/filters/scala/entries.rb
index d328764c..98eb9781 100644
--- a/lib/docs/filters/scala/entries.rb
+++ b/lib/docs/filters/scala/entries.rb
@@ -1,14 +1,30 @@
module Docs
class Scala
class EntriesFilter < Docs::EntriesFilter
+ REPLACEMENTS = {
+ '$eq' => '=',
+ '$colon' => ':',
+ '$less' => '<',
+ }
+
def get_name
- # this first condition is mainly for scala 212 docs, which
- # have their package listing as index.html
if is_package?
symbol = at_css('#definition h1')
symbol ? symbol.text.gsub(/\W+/, '') : "package"
else
- slug.split('/').last
+ name = slug.split('/').last
+
+ # Some objects have inner objects, show ParentObject$.ChildObject$ instead of ParentObject$$ChildObject$
+ name = name.gsub('$$', '$.')
+
+ # If a dollar sign is used as separator between two characters, replace it with a dot
+ name = name.gsub(/([^$.])\$([^$.])/, '\1.\2')
+
+ REPLACEMENTS.each do |key, value|
+ name = name.gsub(key, value)
+ end
+
+ name
end
end
@@ -26,6 +42,31 @@ module Docs
true
end
+ def additional_entries
+ entries = []
+
+ full_name = "#{type}.#{name}".remove('$')
+ css(".members li[name^=\"#{full_name}\"]").each do |node|
+ # Ignore packages
+ kind = node.at_css('.modifier_kind > .kind')
+ next if !kind.nil? && kind.content == 'package'
+
+ # Ignore deprecated members
+ next unless node.at_css('.symbol > .name.deprecated').nil?
+
+ id = node['name'].rpartition('#').last
+ member_name = node.at_css('.name')
+
+ # Ignore members only existing of hashtags, we can't link to that
+ next if member_name.nil? || member_name.content.strip.remove('#').blank?
+
+ member = "#{name}.#{member_name.content}()"
+ entries << [member, id]
+ end
+
+ entries
+ end
+
private
# For the package name, we use the slug rather than parsing the package
@@ -40,7 +81,6 @@ module Docs
end
def parent_package
- name = package_name
parent = package_drop_last(package_name.split('.'))
parent.empty? ? '_root_' : parent
end
diff --git a/lib/docs/scrapers/scala.rb b/lib/docs/scrapers/scala.rb
index 6b6d6bb2..e831fa84 100644
--- a/lib/docs/scrapers/scala.rb
+++ b/lib/docs/scrapers/scala.rb
@@ -1,80 +1,60 @@
module Docs
class Scala < FileScraper
- include FixInternalUrlsBehavior
-
- self.name = 'scala'
+ self.name = 'Scala'
self.type = 'scala'
self.links = {
home: 'http://www.scala-lang.org/',
code: 'https://github.com/scala/scala'
}
- version '2.12 Library' do
- self.release = '2.12.3'
- self.dir = '/Users/Thibaut/DevDocs/Docs/Scala212/api/scala-library' # https://downloads.lightbend.com/scala/2.12.3/scala-docs-2.12.3.zip
- self.base_url = 'http://www.scala-lang.org/api/2.12.3/'
+ options[:container] = '#content-container'
+ options[:attribution] = <<-HTML
+ © 2002-2019 EPFL, with contributions from Lightbend.
+ HTML
+
+ # https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip
+ # Extract api/scala-library into docs/scala~2.13_library
+ version '2.13 Library' do
+ self.release = '2.13.0'
+ self.base_url = 'https://www.scala-lang.org/api/2.13.0/'
self.root_path = 'index.html'
- options[:attribution] = <<-HTML
- Scala programming documentation. Copyright (c) 2003-2017 <a
- href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
- href="http://www.lightbend.com" target="_blank">Lightbend</a>.
- HTML
- html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_212'
+
+ html_filters.push 'scala/entries', 'scala/clean_html'
end
- version '2.12 Reflection' do
- self.release = '2.12.3'
- self.dir = '/Users/Thibaut/DevDocs/Docs/Scala212/api/scala-reflect' # https://downloads.lightbend.com/scala/2.12.3/scala-docs-2.12.3.zip
- self.base_url = 'http://www.scala-lang.org/api/2.12.3/scala-reflect/'
+ # https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip
+ # Extract api/scala-reflect into docs/scala~2.13_reflection
+ version '2.13 Reflection' do
+ self.release = '2.13.0'
+ self.base_url = 'https://www.scala-lang.org/api/2.13.0/scala-reflect/'
self.root_path = 'index.html'
- options[:attribution] = <<-HTML
- Scala programming documentation. Copyright (c) 2003-2017 <a
- href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
- href="http://www.lightbend.com" target="_blank">Lightbend</a>.
- HTML
- html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_212'
+
+ html_filters.push 'scala/entries', 'scala/clean_html'
end
- version '2.11 Library' do
- self.release = '2.11.8'
- self.dir = '/Users/Thibaut/DevDocs/Docs/Scala211/api/scala-library' # https://downloads.lightbend.com/scala/2.11.8/scala-docs-2.11.8.zip
- self.base_url = 'http://www.scala-lang.org/api/2.11.8/'
- self.root_path = 'package.html'
- options[:skip_patterns] = [/^index.html/, /index\/index-/]
- options[:attribution] = <<-HTML
- Scala programming documentation. Copyright (c) 2003-2016 <a
- href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
- href="http://www.lightbend.com" target="_blank">Lightbend</a>.
- HTML
- html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_210'
+ # https://downloads.lightbend.com/scala/2.12.6/scala-docs-2.12.6.zip
+ # Extract api/scala-library into docs/scala~2.12_library
+ version '2.12 Library' do
+ self.release = '2.12.6'
+ self.base_url = 'https://www.scala-lang.org/api/2.12.6/'
+ self.root_path = 'index.html'
+
+ html_filters.push 'scala/entries', 'scala/clean_html'
end
- version '2.11 Reflection' do
- self.release = '2.11.8'
- self.dir = '/Users/Thibaut/DevDocs/Docs/Scala211/api/scala-reflect' # https://downloads.lightbend.com/scala/2.11.8/scala-docs-2.11.8.zip
- self.base_url = 'http://www.scala-lang.org/api/2.11.8/scala-reflect/'
- self.root_path = 'package.html'
- options[:skip_patterns] = [/^index.html/, /index\/index-/]
- options[:attribution] = <<-HTML
- Scala programming documentation. Copyright (c) 2003-2016 <a
- href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
- href="http://www.lightbend.com" target="_blank">Lightbend</a>.
- HTML
- html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_210'
+ # https://downloads.lightbend.com/scala/2.12.6/scala-docs-2.12.6.zip
+ # Extract api/scala-reflect into docs/scala~2.12_reflection
+ version '2.12 Reflection' do
+ self.release = '2.12.6'
+ self.base_url = 'https://www.scala-lang.org/api/2.12.6/scala-reflect/'
+ self.root_path = 'index.html'
+
+ html_filters.push 'scala/entries', 'scala/clean_html'
end
- version '2.10' do
- self.release = '2.10.6'
- self.dir = '/Users/Thibaut/DevDocs/Docs/Scala210' # https://downloads.lightbend.com/scala/2.10.6/scala-docs-2.10.6.zip
- self.base_url = 'http://www.scala-lang.org/api/2.10.6/'
- self.root_path = 'package.html'
- options[:skip_patterns] = [/^index.html/, /index\/index-/]
- options[:attribution] = <<-HTML
- Scala programming documentation. Copyright (c) 2003-2013 <a
- href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
- href="http://typesafe.com" target="_blank">Typesafe</a>.
- HTML
- html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_210'
+ def get_latest_version(opts)
+ doc = fetch_doc('https://www.scala-lang.org/api/current/', opts)
+ doc.at_css('#doc-version').content
end
end
end
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]