[devdocsgjs/main: 4/76] Migrate c scraper from filescraper to urlscraper




commit fdfcf3d9174d7e386af42412c3a2394d6e1eafca
Author: Enoc <brianhernandez222 hotmail com>
Date:   Fri Sep 10 11:14:54 2021 -0600

    Migrate c scraper from filescraper to urlscraper

 assets/stylesheets/application.css.scss            |   2 +-
 assets/stylesheets/pages/{_c.scss => _cppref.scss} |   0
 lib/docs/filters/c/clean_html.rb                   | 116 ---------------------
 lib/docs/filters/c/entries.rb                      |   3 +
 lib/docs/filters/c/fix_code.rb                     |  21 ----
 lib/docs/filters/c/fix_urls.rb                     |  11 --
 lib/docs/scrapers/c.rb                             |  42 --------
 lib/docs/scrapers/cppref/c.rb                      |  12 +++
 lib/docs/scrapers/cppref/cpp.rb                    |   9 --
 lib/docs/scrapers/cppref/cppref.rb                 |  12 ++-
 10 files changed, 24 insertions(+), 204 deletions(-)
---
diff --git a/assets/stylesheets/application.css.scss b/assets/stylesheets/application.css.scss
index 0243afeb..542e1510 100644
--- a/assets/stylesheets/application.css.scss
+++ b/assets/stylesheets/application.css.scss
@@ -39,7 +39,7 @@
         'pages/async',
         'pages/bash',
         'pages/bootstrap',
-        'pages/c',
+        'pages/cppref',
         'pages/cakephp',
         'pages/clojure',
         'pages/codeception',
diff --git a/assets/stylesheets/pages/_c.scss b/assets/stylesheets/pages/_cppref.scss
similarity index 100%
rename from assets/stylesheets/pages/_c.scss
rename to assets/stylesheets/pages/_cppref.scss
diff --git a/lib/docs/filters/c/entries.rb b/lib/docs/filters/c/entries.rb
index 6c9f1565..63cfec61 100644
--- a/lib/docs/filters/c/entries.rb
+++ b/lib/docs/filters/c/entries.rb
@@ -22,6 +22,9 @@ module Docs
       end
 
       def get_type
+
+        return "C keywords" if slug =~ /keyword/
+
         type = at_css('.t-navbar > div:nth-child(4) > :first-child').try(:content)
         type.strip!
         type.remove! ' library'
diff --git a/lib/docs/scrapers/cppref/c.rb b/lib/docs/scrapers/cppref/c.rb
new file mode 100644
index 00000000..faa48fb3
--- /dev/null
+++ b/lib/docs/scrapers/cppref/c.rb
@@ -0,0 +1,12 @@
+module Docs
+  class C < Cppref
+    self.name = 'c'
+    self.slug = 'c'
+    self.base_url = 'https://en.cppreference.com/w/c/'
+
+    html_filters.insert_before 'cppref/clean_html', 'c/entries'
+
+    options[:root_title] = 'C Programming Language'
+
+  end
+end
diff --git a/lib/docs/scrapers/cppref/cpp.rb b/lib/docs/scrapers/cppref/cpp.rb
index bfc87c62..4f259729 100644
--- a/lib/docs/scrapers/cppref/cpp.rb
+++ b/lib/docs/scrapers/cppref/cpp.rb
@@ -2,7 +2,6 @@ module Docs
   class Cpp < Cppref
     self.name = 'C++'
     self.slug = 'cpp'
-    self.type = 'c'
     self.base_url = 'https://en.cppreference.com/w/cpp/'
 
     html_filters.insert_before 'cppref/clean_html', 'cpp/entries'
@@ -16,13 +15,5 @@ module Docs
       regex/regex_token_iterator/operator_cmp.html
     )
 
-    # Same as get_latest_version in lib/docs/scrapers/c.rb
-    def get_latest_version(opts)
-      doc = fetch_doc('https://en.cppreference.com/w/Cppreference:Archives', opts)
-      link = doc.at_css('a[title^="File:"]')
-      date = link.content.scan(/(\d+)\./)[0][0]
-      DateTime.strptime(date, '%Y%m%d').to_time.to_i
-    end
-
   end
 end
diff --git a/lib/docs/scrapers/cppref/cppref.rb b/lib/docs/scrapers/cppref/cppref.rb
index b91751ef..85bbc771 100644
--- a/lib/docs/scrapers/cppref/cppref.rb
+++ b/lib/docs/scrapers/cppref/cppref.rb
@@ -6,7 +6,7 @@ module Docs
 
     html_filters.insert_before 'clean_html', 'cppref/fix_code'
     html_filters.push  'cppref/clean_html', 'title'
-      # 'cpp20/entries',
+
     options[:decode_and_clean_paths] = true
     options[:container] = '#content'
     options[:title] = false
@@ -21,9 +21,13 @@ module Docs
       Licensed under the Creative Commons Attribution-ShareAlike Unported License v3.0.
     HTML
 
-    # def get_latest_version
-
-    # end
+    # Latest version = last-modified date of the root ('headers') page, as a Unix timestamp
+    def get_latest_version(opts)
+      doc = fetch_doc(self.base_url + self.root_path, opts)
+      date = doc.at_css('#footer-info-lastmod').content
+      date = date.match(/[[:digit:]]{1,2} .* [[:digit:]]{4}/).to_s
+      date = DateTime.strptime(date, '%e %B %Y').to_time.to_i
+    end
 
   end
 end


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]