[devdocsgjs/main: 809/1867] mariadb: update scraper and filters to work with the official website
- From: Andy Holmes <andyholmes src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [devdocsgjs/main: 809/1867] mariadb: update scraper and filters to work with the official website
- Date: Fri, 19 Nov 2021 23:47:30 +0000 (UTC)
commit 93582d3b8268366e7c831573b8989f1805174a01
Author: Jasper van Merle <jaspervmerle gmail com>
Date: Tue Aug 13 22:55:05 2019 +0200
mariadb: update scraper and filters to work with the official website
.../javascripts/templates/pages/about_tmpl.coffee | 2 +-
assets/stylesheets/pages/_mariadb.scss | 2 +-
lib/docs/filters/mariadb/clean_html.rb | 41 +++++++++-------------
lib/docs/filters/mariadb/entries.rb | 16 +++++++--
lib/docs/filters/mariadb/erase_invalid_pages.rb | 34 ++++++++++++++++++
lib/docs/scrapers/mariadb.rb | 25 +++++++++----
6 files changed, 84 insertions(+), 36 deletions(-)
---
diff --git a/assets/javascripts/templates/pages/about_tmpl.coffee b/assets/javascripts/templates/pages/about_tmpl.coffee
index 98775c57..f00ff2df 100644
--- a/assets/javascripts/templates/pages/about_tmpl.coffee
+++ b/assets/javascripts/templates/pages/about_tmpl.coffee
@@ -439,7 +439,7 @@ credits = [
'http://www.gnu.org/copyleft/fdl.html'
], [
'MariaDB',
- '2018 MariaDB',
+ '2019 MariaDB',
'CC BY-SA & GFDL',
'https://mariadb.com/kb/en/library/documentation/+license/'
], [
diff --git a/assets/stylesheets/pages/_mariadb.scss b/assets/stylesheets/pages/_mariadb.scss
index ef6144ce..19d8d639 100644
--- a/assets/stylesheets/pages/_mariadb.scss
+++ b/assets/stylesheets/pages/_mariadb.scss
@@ -1,7 +1,7 @@
._mariadb {
@extend %simple;
- .graybox {
+ .graybox, .product {
@extend %note;
}
}
diff --git a/lib/docs/filters/mariadb/clean_html.rb b/lib/docs/filters/mariadb/clean_html.rb
index 88ab4fc5..86b6dc00 100644
--- a/lib/docs/filters/mariadb/clean_html.rb
+++ b/lib/docs/filters/mariadb/clean_html.rb
@@ -1,11 +1,10 @@
-require 'net/http'
-
module Docs
class Mariadb
class CleanHtmlFilter < Filter
- @@known_urls = Hash.new
-
def call
+ # Return the empty doc if the EraseInvalidPagesFilter detected this page shouldn't be scraped
+ return doc if doc.inner_html == ''
+
# Extract main content
@doc = at_css('#content')
@@ -21,19 +20,6 @@ module Docs
node['data-language'] = 'sql'
end
- # Fix links like http://kb-mirror.mariadb.com/kb/en/bitwise-or/ to not redirect to an external page
- css('a').each do |node|
- url = node['href']
-
- if /^http:\/\/kb-mirror\.mariadb\.com\/kb\/en\/[^\/]+\/(#[^\/]+)?$/.match?(url)
- final_url = get_final_url(url)
-
- if !final_url.nil? && final_url.start_with?('/kb/en/library/documentation/')
- node['href'] = "#{'../' * subpath.count('/')}#{final_url[29..-1]}index"
- end
- end
- end
-
# Fix images
css('img').each do |node|
node['src'] = node['src'].sub('http:', 'https:')
@@ -46,11 +32,11 @@ module Docs
end
end
- # Convert listings (pages like http://kb-mirror.mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables
+ # Convert listings (pages like https://mariadb.com/kb/en/library/documentation/sql-statements-structure/) into tables
css('ul.listing').each do |node|
rows = []
- node.css('li').each do |li|
+ node.css('li:not(.no_data)').each do |li|
name = li.at_css('.media-heading').content
description = li.at_css('.blurb').content
url = li.at_css('a')['href']
@@ -61,15 +47,20 @@ module Docs
node.replace(table)
end
- doc
- end
+ # Turn note titles into <strong> tags
+ css('.product_title').each do |node|
+ node.name = 'strong'
+ end
- def get_final_url(url)
- unless @@known_urls.has_key?(url)
- @@known_urls[url] = Net::HTTP.get_response(URI(url))['location']
+ # Remove comments and questions
+ css('.related_questions, #comments').remove
+ css('h2').each do |node|
+ if node.content == 'Comments'
+ node.remove
+ end
end
- @@known_urls[url]
+ doc
end
end
end
diff --git a/lib/docs/filters/mariadb/entries.rb b/lib/docs/filters/mariadb/entries.rb
index 32d4f6b2..a3cbeb34 100644
--- a/lib/docs/filters/mariadb/entries.rb
+++ b/lib/docs/filters/mariadb/entries.rb
@@ -2,12 +2,22 @@ module Docs
class Mariadb
class EntriesFilter < Docs::EntriesFilter
def get_name
- at_css('.container > h1').content.strip
+ return 'Name' if doc.inner_html == ''
+
+ at_css('#content > h1').content.strip
end
def get_type
- link = at_css('#breadcrumbs > a:nth-child(6)')
- link.nil? ? at_css('#breadcrumbs > a:nth-child(5)').content : link.content
+ return 'Type' if doc.inner_html == ''
+
+ link = at_css('#breadcrumbs > a:nth-child(4)')
+ link.nil? ? at_css('#breadcrumbs > a:nth-child(3)').content : link.content
+ end
+
+ def entries
+ # Don't add an entry for this page if the EraseInvalidPagesFilter detected this page shouldn't be scraped
+ return [] if doc.inner_html == ''
+ super
end
end
end
diff --git a/lib/docs/filters/mariadb/erase_invalid_pages.rb b/lib/docs/filters/mariadb/erase_invalid_pages.rb
new file mode 100644
index 00000000..0987375d
--- /dev/null
+++ b/lib/docs/filters/mariadb/erase_invalid_pages.rb
@@ -0,0 +1,34 @@
+module Docs
+ class Mariadb
+ class EraseInvalidPagesFilter < Filter
+ @@seen_urls = Hash.new
+
+ def call
+ # The MariaDB documentation uses urls like mariadb.com/kb/en/*
+ # This means there is no way to detect if a page should be scraped based on it's url
+ # We run this filter before the internal_urls filter scrapes all internal urls
+ # If this page should not be scraped, we erase it's contents in here so that the internal urls are not picked up
+ # The entries filter will make sure that no entry is saved for this page
+
+ if at_css('a.crumb[href="https://mariadb.com/kb/en/documentation/"]').nil?
+ doc.inner_html = ''
+ end
+
+ current_page = at_css('a.crumb.node_link')
+ unless current_page.nil?
+ url = current_page['href']
+
+ # Some links lead to the same page
+ # Only parse the page one time
+ if @@seen_urls.has_key?(url)
+ doc.inner_html = ''
+ end
+
+ @@seen_urls[url] = true
+ end
+
+ doc
+ end
+ end
+ end
+end
diff --git a/lib/docs/scrapers/mariadb.rb b/lib/docs/scrapers/mariadb.rb
index b4293a9a..859b6bec 100644
--- a/lib/docs/scrapers/mariadb.rb
+++ b/lib/docs/scrapers/mariadb.rb
@@ -2,21 +2,34 @@ module Docs
class Mariadb < UrlScraper
self.name = 'MariaDB'
self.type = 'mariadb'
- self.release = '10.3.8'
- self.base_url = 'http://kb-mirror.mariadb.com/kb/en/library/documentation/'
+ self.release = '10.4.7'
+ self.base_url = 'https://mariadb.com/kb/en/'
+ self.root_path = 'library/documentation/'
self.links = {
home: 'https://mariadb.com/',
code: 'https://github.com/MariaDB/server'
}
- html_filters.push 'mariadb/entries', 'mariadb/clean_html', 'title'
+ html_filters.insert_before 'internal_urls', 'mariadb/erase_invalid_pages'
+ html_filters.push 'mariadb/entries', 'mariadb/clean_html'
- options[:download_images] = false
- options[:root_title] = 'MariaDB'
+ options[:skip_patterns] = [
+ /\+/,
+ /\/ask\//,
+ /-release-notes\//,
+ /-changelog\//,
+ /^documentation\//,
+ /^mariadb-server-documentation\//,
+ ]
options[:attribution] = <<-HTML
- © 2018 MariaDB<br>
+ © 2019 MariaDB<br>
Licensed under the Creative Commons Attribution 3.0 Unported License and the GNU Free Documentation License.
HTML
+
+ def get_latest_version(opts)
+ doc = fetch_doc('https://mariadb.com/downloads/', opts)
+ doc.at_css('[data-version-id="mariadb_server-versions"] option').content.split('-')[0]
+ end
end
end
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]