summaryrefslogtreecommitdiff
path: root/utab/pyfav/pyfav.py
diff options
context:
space:
mode:
Diffstat (limited to 'utab/pyfav/pyfav.py')
-rw-r--r--utab/pyfav/pyfav.py220
1 files changed, 220 insertions, 0 deletions
diff --git a/utab/pyfav/pyfav.py b/utab/pyfav/pyfav.py
new file mode 100644
index 0000000..34b4f93
--- /dev/null
+++ b/utab/pyfav/pyfav.py
@@ -0,0 +1,220 @@
+"""
+MIT License
+Copyright (c) 2013 Matthew Phillips
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+"""
+This is PyFav. It's a python package that helps you download favicons.
+
+Find the project on GitHub at https://github.com/phillipsm/pyfav
+and in PyPI at http://python.org/pypi/pyfav
+
+
+
+The simplest way to get started is to use the download_favicon function.
+
+To download a favicon for it's as simple as,
+
+============
+from favicon import download_favicon
+
+download_favicon('https://www.python.org/')
+============
+
+If you want to be specific in where that favicon gets written to disk,
+
+============
+favicon_saved_at = download_favicon('https://www.python.org/', \
+ file_prefix='python.org-', target_dir='/tmp/favicon-downloads')
+============
+
+
+If you'd prefer to handle the write to disk piece, use the get_favicon_url
+function by itself,
+============
+favicon_url = get_favicon_url('https://www.python.org/')
+============
+"""
+
+
+import urllib, os.path, string, re
+import requests
+from bs4 import BeautifulSoup
+
+
+# Some hosts don't like the requests default UA. Use this one instead.
+headers = {
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) \
+ AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 \
+ Safari/537.36"
+}
+
+
+def download_favicon(url, file_prefix="", target_dir="/tmp"):
+ """
+ Given a URL download the save it to disk
+
+ Keyword arguments:
+ url -- A string. This is the location of the favicon
+ file_prefix - A string. If you want the downloaded favicon filename to
+ be start with some characters you provide, this is a good way to do it.
+ target_dir -- The location where the favicon will be saved.
+
+ Returns:
+ The file location of the downloaded favicon. A string.
+ """
+
+ parsed_site_uri = urllib.parse.urlparse(url)
+
+ # Help the user out if they didn't give us a protocol
+ if not parsed_site_uri.scheme:
+ url = "http://" + url
+ parsed_site_uri = urllib.parse.urlparse(url)
+
+ if not parsed_site_uri.scheme or not parsed_site_uri.netloc:
+ raise Exception("Unable to parse URL, %s" % url)
+
+ favicon_url = get_favicon_url(url)
+
+ if not favicon_url:
+ raise Exception("Unable to find favicon for, %s" % url)
+
+ # We finally have a URL for our favicon. Get it.
+ response = requests.get(favicon_url, headers=headers)
+ if response.status_code == requests.codes.ok:
+ # we want to get the the filename from the url without any params
+ parsed_uri = urllib.parse.urlparse(favicon_url)
+ favicon_filepath = parsed_uri.path
+ favicon_path, favicon_filename = os.path.split(favicon_filepath)
+
+ valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
+
+ sanitized_filename = "".join([x if valid_chars else "" for x in favicon_filename])
+
+ sanitized_filename = os.path.join(target_dir, file_prefix + sanitized_filename)
+
+ with open(sanitized_filename, "wb") as f:
+ for chunk in response.iter_content(chunk_size=1024):
+ if chunk: # filter out keep-alive new chunks
+ f.write(chunk)
+ f.flush()
+
+ return sanitized_filename
+
+
+def parse_markup_for_favicon(markup, url):
+ """
+ Given markup, parse it for a favicon URL. The favicon URL is adjusted
+ so that it can be retrieved independently. If no favicon is found in the
+ markup we return None.
+
+ Keyword arguments:
+ markup -- A string containing the HTML markup.
+ url -- A string containing the URL where the supplied markup can be found.
+ We use this URL in cases where the favicon path in the markup is relative.
+
+ Retruns:
+ The URL of the favicon. A string. If not found, returns None.
+ """
+
+ parsed_site_uri = urllib.parse.urlparse(url)
+
+ soup = BeautifulSoup(markup)
+
+ # Do we have a link element with the icon?
+ icon_link = soup.find("link", rel="icon")
+ if icon_link and icon_link.has_attr("href"):
+
+ favicon_url = icon_link["href"]
+
+ # Sometimes we get a protocol-relative path
+ if favicon_url.startswith("//"):
+ parsed_uri = urllib.parse.urlparse(url)
+ favicon_url = parsed_uri.scheme + ":" + favicon_url
+
+ # An absolute path relative to the domain
+ elif favicon_url.startswith("/"):
+ favicon_url = (
+ parsed_site_uri.scheme + "://" + parsed_site_uri.netloc + favicon_url
+ )
+
+ elif re.match("^data:\w+/\w+;base64,[A-Za-z0-9=/]+", favicon_url):
+ pass # base64; return verbatim
+
+ # A relative path favicon
+ elif not favicon_url.startswith("http"):
+ path, filename = os.path.split(parsed_site_uri.path)
+ favicon_url = (
+ parsed_site_uri.scheme
+ + "://"
+ + parsed_site_uri.netloc
+ + "/"
+ + os.path.join(path, favicon_url)
+ )
+
+ # We found a favicon in the markup and we've formatted the URL
+ # so that it can be loaded independently of the rest of the page
+ return favicon_url
+
+ # No favicon in the markup
+ return None
+
+
+def get_favicon_url(url):
+ """
+ Returns a favicon URL for the URL passed in. We look in the markup returned
+ from the URL first and if we don't find a favicon there, we look for the
+ default location, e.g., http://example.com/favicon.ico . We retrurn None if
+ unable to find the file.
+
+ Keyword arguments:
+ url -- A string. This is the URL that we'll find a favicon for.
+
+ Returns:
+ The URL of the favicon. A string. If not found, returns None.
+ """
+
+ parsed_site_uri = urllib.parse.urlparse(url)
+
+ # Get the markup
+ try:
+ response = requests.get(url, headers=headers)
+ except:
+ raise Exception("Unable to find URL. Is it valid? %s" % url)
+
+ if response.status_code == requests.codes.ok:
+ favicon_url = parse_markup_for_favicon(response.content, url)
+
+ # We found a favicon in our markup. Return the URL
+ if favicon_url:
+ return favicon_url
+
+ # The favicon doesn't appear to be in the makrup
+ # Let's look at the common locaiton, url/favicon.ico
+ favicon_url = "{uri.scheme}://{uri.netloc}/favicon.ico".format(uri=parsed_site_uri)
+
+ response = requests.get(favicon_url, headers=headers)
+ if response.status_code == requests.codes.ok:
+ return favicon_url
+
+ # No favicon in the markup or at url/favicon.ico
+ return None