[devdocsgjs/main: 1683/1867] Reindex R documentation, include 2 manuals

From: Andy Holmes <andyholmes src gnome org>
To: commits-list gnome org
Cc:
Subject: [devdocsgjs/main: 1683/1867] Reindex R documentation, include 2 manuals
Date: Fri, 19 Nov 2021 23:48:14 +0000 (UTC)

commit 94b404450cb7af46a904b29003a58bdbe19a6eb4
Author: Cimbali <me cimba li>
Date:   Thu May 27 16:41:32 2021 +0200

    Reindex R documentation, include 2 manuals
    
    Now each page is indexed by its title (by default), and by each index
    term declared for it on the index
    
    Two manuals are included: the data import/export manual as its own
    category (as it is rather short), and the R introduction manual with each
    top-level section as a separate category (as it is quite a bit longer).
    
    Add some manual cleanup.
    
    Some pages still seem missing:
    - either belonging to non-default packages, i.e. it is normal that they are missing
    - or corresponding to index words without their own package (!)

 lib/docs/filters/r/clean_html.rb | 35 ++++++++++++++++++++++++----
 lib/docs/filters/r/entries.rb    | 50 +++++++++++++++++++++++++++++-----------
 lib/docs/scrapers/r.rb           | 23 ++++++++++++++++++
 3 files changed, 89 insertions(+), 19 deletions(-)
---
diff --git a/lib/docs/filters/r/clean_html.rb b/lib/docs/filters/r/clean_html.rb
index 28ea571d..57c91ee5 100644
--- a/lib/docs/filters/r/clean_html.rb
+++ b/lib/docs/filters/r/clean_html.rb
@@ -3,7 +3,13 @@ module Docs
     class CleanHtmlFilter < Filter
       def call
         slug_parts = slug.split('/')
-        if slug_parts[0] == 'library'
+
+        if root_page?
+          css('a[href$="/00index"]').each do |pkg|
+            pkg['href'] = "/r-#{pkg['href'].split('/')[1]}/"
+          end
+
+        elsif slug_parts[0] == 'library'
           title = at_css('h2')
           title.inner_html = "<code>#{slug_parts[3]}</code> #{title.content}"
 
@@ -11,12 +17,31 @@ module Docs
           summary.remove if summary
 
         elsif slug_parts[-2] == 'manual'
+          css('table.menu, div.header, hr, h2.contents-heading, div.contents, table.index-cp, table.index-vr, table[summary]').remove
+
+          css('h2').each do |node|
+            node.remove if node.content.end_with? ' index'
+          end
+
           css('span[id] + h1, span[id] + h2, span[id] + h3, span[id] + h4, span[id] + h5, span[id] + h6').each do |node|
-            id = node.previous['id']
-            node.previous.remove
-            node['id'] = id.sub(/-1$/, '') if id
+            # We need the first of the series of span with ids
+            span = node.previous_element
+            while span.previous
+              prev = span.previous_element
+              break unless prev.name == 'span' and prev['id']
+              span.remove
+              span = prev
+            end
+
+            node['id'] = span['id']
+            span.remove
+
+            css('div.example').each do |node|
+              node.replace(node.children)
+            end
           end
-          css('table.menu, div.header, hr').remove
+
+          css('h1 + h1').remove
 
           css('.footnote h5').each do |node|
             anchor = node.at_css('a[id]')
diff --git a/lib/docs/filters/r/entries.rb b/lib/docs/filters/r/entries.rb
index b54c2c21..a9793e07 100644
--- a/lib/docs/filters/r/entries.rb
+++ b/lib/docs/filters/r/entries.rb
@@ -2,11 +2,16 @@ module Docs
   class R
     class EntriesFilter < Docs::EntriesFilter
 
-      @@include_manual = false
-      @@include_misc = false
+      PKG_INDEX_ENTRIES = Hash.new []
 
       def initialize(*)
         super
+
+        if slug_parts[-1] == '00Index'
+          css('tr a').each do |link|
+            PKG_INDEX_ENTRIES[link['href']] += [link.text]
+          end
+        end
       end
 
       def slug_parts
@@ -18,11 +23,11 @@ module Docs
       end
 
       def is_manual?
-        slug_parts[-2] == 'manual'
+        slug_parts[1] == 'manual'
       end
 
       def get_name
-        return slug_parts[3] + ' − ' + at_css('h2').content if is_package?
+        return at_css('h2').content if is_package?
         title = at_css('h1.settitle')
         title ? title.content : at_css('h1, h2').content
       end
@@ -30,24 +35,41 @@ module Docs
       def get_type
         return slug_parts[1] if is_package?
         return at_css('h1.settitle').content if is_manual?
-        'Miscellaneous'
       end
 
       def include_default_entry?
-        if is_manual? or slug_parts[-1] == '00Index' or slug_parts[-1] == 'index'
-          return false
-        end
-        is_package? or self.include_misc
+        is_package? and not slug_parts[-1] == '00Index'
+      end
+
+      def manual_section(node)
+        title = node.content.sub /^((Appendix )?[A-Z]|[0-9]+)(\.[0-9]+)* /, ''
+        title unless ['References', 'Preface', 'Acknowledgements'].include?(title) or title.end_with?(' index')
       end
 
       def additional_entries
-        return [] unless is_manual? and self.include_manual
+        if is_package? and slug_parts[-1] != '00Index'
+          page = slug_parts[-1]
+          return [page] + PKG_INDEX_ENTRIES.fetch(page, [])
+        end
+
+        return [] unless is_manual?
 
         entries = []
-        css('div.contents > ul > li').each do |node|
-          node.css('a').each do |link|
-            link_name = link.content.sub /^[0-9A-Z]+(\.[0-9]+)* /, ''
-            entries << [link_name, link['href'].split('#')[1], name]
+        unless slug_parts[-1].downcase == 'r-intro'
+          # Single top-level category
+          css('div.contents > ul a').each do |link|
+            link_name = manual_section(link)
+            entries << [link_name, link['href'].split('#')[1], name] unless link_name.nil?
+          end
+        else
+          # Split 1st level of manual into different categories
+          css('div.contents > ul > li').each do |node|
+            type = manual_section(node.at_css('a'))
+            next if type.nil?
+            node.css('> ul a').each do |link|
+              link_name = link.content.sub /^[0-9A-Z]+(\.[0-9]+)* /, ''
+              entries << [link_name, link['href'].split('#')[1], type]
+            end
           end
         end
         return entries
diff --git a/lib/docs/scrapers/r.rb b/lib/docs/scrapers/r.rb
index 9d95fbaa..6a36a843 100644
--- a/lib/docs/scrapers/r.rb
+++ b/lib/docs/scrapers/r.rb
@@ -21,10 +21,33 @@ module Docs
     HTML
 
     # Never want those
+    options[:skip_patterns] = [
+      /\/DESCRIPTION$/,
+      /\/NEWS(\.[^\/]*)?$/,
+      /\/demo$/,
+      /\.pdf$/
+    ]
+
+    ## We want to fix links like so − but only if the targets don’t exist,
+    ## as these target packages or keywords that do not have their own file,
+    ## but exist on another page, and we properly record it.
+    #
+    #options[:fix_urls] = ->(url) do
+    #  url.sub!(%r'/library/([^/]+)/doc/index.html$') { |m| "/r-#{$1.parameterize.downcase}/" }
+    #  url.sub!(%r'/library/([^/]+)/html/([^/]+).html$') { |m| "/library/#{$1.parameterize.downcase}/html/#{$2.parameterize.downcase}" }
+    #end
+
     options[:skip] = %w(
       doc/html/packages-head-utf8.html
       doc/html/SearchOn.html
       doc/html/Search.html
+      doc/html/UserManuals.html
+      doc/html/faq.html
+      doc/manual/R-FAQ.html
+      doc/manual/R-admin.html
+      doc/manual/R-exts.html
+      doc/manual/R-ints.html
+      doc/manual/R-lang.html
     )
 
   end
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]