[devdocsgjs/main: 777/1867] scala: finish scraper and filters




commit 6614375671eeedb7634034a7e36f966b5072610e
Author: Jasper van Merle <jaspervmerle gmail com>
Date:   Sun Aug 11 19:53:12 2019 +0200

    scala: finish scraper and filters

 lib/docs/filters/scala/clean_html.rb     | 114 +++++++++++++++++--------------
 lib/docs/filters/scala/clean_html_210.rb |  32 ---------
 lib/docs/filters/scala/clean_html_212.rb |  36 ----------
 lib/docs/filters/scala/entries.rb        |  48 +++++++++++--
 lib/docs/scrapers/scala.rb               |  98 +++++++++++---------------
 5 files changed, 145 insertions(+), 183 deletions(-)
---
diff --git a/lib/docs/filters/scala/clean_html.rb b/lib/docs/filters/scala/clean_html.rb
index 95097c80..0320932d 100644
--- a/lib/docs/filters/scala/clean_html.rb
+++ b/lib/docs/filters/scala/clean_html.rb
@@ -2,97 +2,107 @@ module Docs
   class Scala
     class CleanHtmlFilter < Filter
       def call
+        @doc = at_css('#content')
+
         always
+        add_title
 
-        if slug == 'index'
-          root
-        else
-          other
-        end
+        doc
       end
 
       def always
-        # remove deprecated sections
+        # Remove deprecated sections
         css('.members').each do |members|
           header = members.at_css('h3')
           members.remove if header.text.downcase.include? 'deprecate'
         end
-        # Some of this is just for 2.12
-        # These are things that provide interactive features, which are not supported yet.
-        css('#subpackage-spacer, #search, #mbrsel, .diagram-btn').remove
-        css('#footer').remove
-        css('.toggleContainer').remove
+
+        css('#mbrsel, #footer').remove
+
+        css('.diagram-container').remove
+        css('.toggleContainer > .toggle').each do |node|
+          title = node.at_css('span')
+          next if title.nil?
+
+          content = node.at_css('.hiddenContent')
+          next if content.nil?
+
+          title.name = 'dt'
+
+          content.remove_attribute('class')
+          content.remove_attribute('style')
+          content.name = 'dd'
+
+          attributes = at_css('.attributes')
+          unless attributes.nil?
+            title.parent = attributes
+            content.parent = attributes
+          end
+        end
 
         signature = at_css('#signature')
-        signature.replace %Q|
-          <h2 id="signature">#{signature.inner_html}</h2>
-        |
+        signature.replace "<h2 id=\"signature\">#{signature.inner_html}</h2>"
 
         css('div.members > h3').each do |node|
-          change_tag! 'h2', node
+          node.name = 'h2'
         end
 
         css('div.members > ol').each do |list|
           list.css('li').each do |li|
             h3 = doc.document.create_element 'h3'
+            h3['id'] = li['name'].rpartition('#').last unless li['name'].nil?
+
             li.prepend_child h3
             li.css('.shortcomment').remove
+
             modifier = li.at_css('.modifier_kind')
-            modifier.parent = h3 if modifier
+            modifier.parent = h3 unless modifier.nil?
+
+            kind = li.at_css('.modifier_kind .kind')
+            kind.content = kind.content + ' ' unless kind.nil?
+
             symbol = li.at_css('.symbol')
-            symbol.parent = h3 if symbol
+            symbol.parent = h3 unless symbol.nil?
+
             li.swap li.children
           end
+
           list.swap list.children
         end
 
-        pres = css('.fullcomment pre, .fullcommenttop pre')
-        pres.each do |pre|
+        css('.fullcomment pre, .fullcommenttop pre').each do |pre|
           pre['data-language'] = 'scala'
+          pre.content = pre.content
         end
-        pres.add_class 'language-scala'
-
-
-
-        doc
-
-      end
-
-      def root
-        css('#filter').remove # these are filters to search through the types and packages
-        css('#library').remove # these are icons at the top
-        doc
-      end
 
-      def other
-        # these are sections of the documentation which do not seem useful
+        # Sections of the documentation which do not seem useful
         %w(#inheritedMembers #groupedMembers .permalink .hiddenContent .material-icons).each do |selector|
           css(selector).remove
         end
 
-        # This is the kind of thing we have, class, object, trait
-        kind = at_css('.modifier_kind .kind').content
-        # this image replacement doesn't do anything on 2.12 docs
-        img = at_css('img')
-        img.replace %Q|<span class="img_kind">#{kind}</span>| unless img.nil?
-        class_to_add = kind == 'object' ? 'value': 'type'
+        # Things that are not shown on the site, like deprecated members
+        css('li[visbl=prt]').remove
+      end
+
+      def add_title
+        css('.permalink').remove
 
-        # for 2.10, 2.11, the kind class is associated to the body. we have to
-        # add it somewhere, so we do that with the #definition.
-        definition = css('#definition')
-        definition.css('.big_circle').remove
-        definition.add_class class_to_add
+        definition = at_css('#definition')
+        return if definition.nil?
 
-        # this is something that is not shown on the site, such as deprecated members
-        css('li[visbl=prt]').remove
+        type_full_name = {a: 'Annotation', c: 'Class', t: 'Trait', o: 'Object', p: 'Package'}
+        type = type_full_name[definition.at_css('.big-circle').text.to_sym]
+        name = CGI.escapeHTML definition.at_css('h1').text
 
-        doc
-      end
+        package = definition.at_css('#owner').text rescue ''
+        package = package + '.' unless name.empty? || package.empty?
 
-      private
+        other = definition.at_css('.morelinks').dup
+        other_content = other ? "<h3>#{other.to_html}</h3>" : ''
 
-      def change_tag!(new_tag, node)
-        node.replace %Q|<#{new_tag}>#{node.inner_html}</#{new_tag}>|
+        title_content = root_page? ? 'Package root' : "#{type} #{package}#{name}".strip
+        title = "<h1>#{title_content}</h1>"
+        definition.replace title + other_content
       end
     end
   end
diff --git a/lib/docs/filters/scala/entries.rb b/lib/docs/filters/scala/entries.rb
index d328764c..98eb9781 100644
--- a/lib/docs/filters/scala/entries.rb
+++ b/lib/docs/filters/scala/entries.rb
@@ -1,14 +1,30 @@
 module Docs
   class Scala
     class EntriesFilter < Docs::EntriesFilter
+      REPLACEMENTS = {
+        '$eq' => '=',
+        '$colon' => ':',
+        '$less' => '<',
+      }
+
       def get_name
-        # this first condition is mainly for scala 212 docs, which
-        # have their package listing as index.html
         if is_package?
           symbol = at_css('#definition h1')
           symbol ? symbol.text.gsub(/\W+/, '') : "package"
         else
-          slug.split('/').last
+          name = slug.split('/').last
+
+          # Some objects have inner objects, show ParentObject$.ChildObject$ instead of ParentObject$$ChildObject$
+          name = name.gsub('$$', '$.')
+
+          # If a dollar sign is used as separator between two characters, replace it with a dot
+          name = name.gsub(/([^$.])\$([^$.])/, '\1.\2')
+
+          REPLACEMENTS.each do |key, value|
+            name = name.gsub(key, value)
+          end
+
+          name
         end
       end
 
@@ -26,6 +42,31 @@ module Docs
         true
       end
 
+      def additional_entries
+        entries = []
+
+        full_name = "#{type}.#{name}".remove('$')
+        css(".members li[name^=\"#{full_name}\"]").each do |node|
+          # Ignore packages
+          kind = node.at_css('.modifier_kind > .kind')
+          next if !kind.nil? && kind.content == 'package'
+
+          # Ignore deprecated members
+          next unless node.at_css('.symbol > .name.deprecated').nil?
+
+          id = node['name'].rpartition('#').last
+          member_name = node.at_css('.name')
+
+          # Ignore members only existing of hashtags, we can't link to that
+          next if member_name.nil? || member_name.content.strip.remove('#').blank?
+
+          member = "#{name}.#{member_name.content}()"
+          entries << [member, id]
+        end
+
+        entries
+      end
+
       private
 
       # For the package name, we use the slug rather than parsing the package
@@ -40,7 +81,6 @@ module Docs
       end
 
       def parent_package
-        name = package_name
         parent = package_drop_last(package_name.split('.'))
         parent.empty? ? '_root_' : parent
       end
diff --git a/lib/docs/scrapers/scala.rb b/lib/docs/scrapers/scala.rb
index 6b6d6bb2..e831fa84 100644
--- a/lib/docs/scrapers/scala.rb
+++ b/lib/docs/scrapers/scala.rb
@@ -1,80 +1,60 @@
 module Docs
   class Scala < FileScraper
-    include FixInternalUrlsBehavior
-
-    self.name = 'scala'
+    self.name = 'Scala'
     self.type = 'scala'
     self.links = {
       home: 'http://www.scala-lang.org/',
       code: 'https://github.com/scala/scala'
     }
 
-    version '2.12 Library' do
-      self.release = '2.12.3'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Scala212/api/scala-library' # https://downloads.lightbend.com/scala/2.12.3/scala-docs-2.12.3.zip
-      self.base_url = 'http://www.scala-lang.org/api/2.12.3/'
+    options[:container] = '#content-container'
+    options[:attribution] = <<-HTML
+        &copy; 2002-2019 EPFL, with contributions from Lightbend.
+    HTML
+
+    # https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip
+    # Extract api/scala-library into docs/scala~2.13_library
+    version '2.13 Library' do
+      self.release = '2.13.0'
+      self.base_url = 'https://www.scala-lang.org/api/2.13.0/'
       self.root_path = 'index.html'
-      options[:attribution] = <<-HTML
-        Scala programming documentation. Copyright (c) 2003-2017 <a
-        href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
-        href="http://www.lightbend.com" target="_blank">Lightbend</a>.
-      HTML
-      html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_212'
+
+      html_filters.push 'scala/entries', 'scala/clean_html'
     end
 
-    version '2.12 Reflection' do
-      self.release = '2.12.3'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Scala212/api/scala-reflect' # https://downloads.lightbend.com/scala/2.12.3/scala-docs-2.12.3.zip
-      self.base_url = 'http://www.scala-lang.org/api/2.12.3/scala-reflect/'
+    # https://downloads.lightbend.com/scala/2.13.0/scala-docs-2.13.0.zip
+    # Extract api/scala-reflect into docs/scala~2.13_reflection
+    version '2.13 Reflection' do
+      self.release = '2.13.0'
+      self.base_url = 'https://www.scala-lang.org/api/2.13.0/scala-reflect/'
       self.root_path = 'index.html'
-      options[:attribution] = <<-HTML
-        Scala programming documentation. Copyright (c) 2003-2017 <a
-        href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
-        href="http://www.lightbend.com" target="_blank">Lightbend</a>.
-      HTML
-      html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_212'
+
+      html_filters.push 'scala/entries', 'scala/clean_html'
     end
 
-    version '2.11 Library' do
-      self.release = '2.11.8'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Scala211/api/scala-library' # https://downloads.lightbend.com/scala/2.11.8/scala-docs-2.11.8.zip
-      self.base_url = 'http://www.scala-lang.org/api/2.11.8/'
-      self.root_path = 'package.html'
-      options[:skip_patterns] = [/^index.html/, /index\/index-/]
-      options[:attribution] = <<-HTML
-        Scala programming documentation. Copyright (c) 2003-2016 <a
-        href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
-        href="http://www.lightbend.com" target="_blank">Lightbend</a>.
-      HTML
-      html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_210'
+    # https://downloads.lightbend.com/scala/2.12.6/scala-docs-2.12.6.zip
+    # Extract api/scala-library into docs/scala~2.12_library
+    version '2.12 Library' do
+      self.release = '2.12.6'
+      self.base_url = 'https://www.scala-lang.org/api/2.12.6/'
+      self.root_path = 'index.html'
+
+      html_filters.push 'scala/entries', 'scala/clean_html'
     end
 
-    version '2.11 Reflection' do
-      self.release = '2.11.8'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Scala211/api/scala-reflect' # https://downloads.lightbend.com/scala/2.11.8/scala-docs-2.11.8.zip
-      self.base_url = 'http://www.scala-lang.org/api/2.11.8/scala-reflect/'
-      self.root_path = 'package.html'
-      options[:skip_patterns] = [/^index.html/, /index\/index-/]
-      options[:attribution] = <<-HTML
-        Scala programming documentation. Copyright (c) 2003-2016 <a
-        href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
-        href="http://www.lightbend.com" target="_blank">Lightbend</a>.
-      HTML
-      html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_210'
+    # https://downloads.lightbend.com/scala/2.12.6/scala-docs-2.12.6.zip
+    # Extract api/scala-reflect into docs/scala~2.12_reflection
+    version '2.12 Reflection' do
+      self.release = '2.12.6'
+      self.base_url = 'https://www.scala-lang.org/api/2.12.6/scala-reflect/'
+      self.root_path = 'index.html'
+
+      html_filters.push 'scala/entries', 'scala/clean_html'
     end
 
-    version '2.10' do
-      self.release = '2.10.6'
-      self.dir = '/Users/Thibaut/DevDocs/Docs/Scala210' # https://downloads.lightbend.com/scala/2.10.6/scala-docs-2.10.6.zip
-      self.base_url = 'http://www.scala-lang.org/api/2.10.6/'
-      self.root_path = 'package.html'
-      options[:skip_patterns] = [/^index.html/, /index\/index-/]
-      options[:attribution] = <<-HTML
-        Scala programming documentation. Copyright (c) 2003-2013 <a
-        href="http://www.epfl.ch" target="_blank">EPFL</a>, with contributions from <a
-        href="http://typesafe.com" target="_blank">Typesafe</a>.
-      HTML
-      html_filters.push 'scala/entries', 'scala/clean_html', 'scala/clean_html_210'
+    def get_latest_version(opts)
+      doc = fetch_doc('https://www.scala-lang.org/api/current/', opts)
+      doc.at_css('#doc-version').content
     end
   end
 end


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]