[kupfer] plugin.text: Improve fuzzy URL parsing



commit d49ced9c5feffda38954290479f6cd4957aeef7e
Author: Ulrik Sverdrup <ulrik sverdrup gmail com>
Date:   Wed Sep 2 15:25:12 2009 +0200

    plugin.text: Improve fuzzy URL parsing
    
    Make sure that we don't parse plain numbers (3.14 etc.) as URLs:
    check for at least some letters, and require all dot-separated parts
    to start with an alphanumeric character (so that =sin(2.2) is not
    parsed as a URL).

 kupfer/plugin/text.py |    8 +++++---
 1 files changed, 5 insertions(+), 3 deletions(-)
---
diff --git a/kupfer/plugin/text.py b/kupfer/plugin/text.py
index 2f2ab34..309e65b 100644
--- a/kupfer/plugin/text.py
+++ b/kupfer/plugin/text.py
@@ -51,15 +51,17 @@ class URLTextSource (TextSource):
 	def get_rank(self):
 		return 75
 	def get_items(self, text):
-		""" A bit of hackery to recognize URLs and web addresses
-		alike"""
 		text = text.strip()
 		components = list(urlparse(text))
 		domain = "".join(components[1:])
 		dotparts = domain.rsplit(".")
 
+		# 1. Domain name part is one word (without spaces)
+		# 2. Urlparse parses a scheme (http://), else we apply heuristics
 		if len(domain.split()) == 1 and (components[0] or ("." in domain and
-			len(dotparts) >= 2 and len(dotparts[-1]) >= 2)):
+			len(dotparts) >= 2 and len(dotparts[-1]) >= 2 and
+			any(char.isalpha() for char in domain) and
+			all(part[:1].isalnum() for part in dotparts))):
 			if not components[0]:
 				url = "http://" + "".join(components[1:])
 			else:



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]