#!/usr/bin/env python

# Copyright (c) 2009-2020, Mario Vilas
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the copyright holder nor the names of its
#       contributors may be used to endorse or promote products derived from
#       this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import os
import random
import sys
import time
import ssl

if sys.version_info[0] > 2:
    from http.cookiejar import LWPCookieJar
    from urllib.request import Request, urlopen
    from urllib.parse import quote_plus, urlparse, parse_qs
else:
    from cookielib import LWPCookieJar
    from urllib import quote_plus
    from urllib2 import Request, urlopen
    from urlparse import urlparse, parse_qs

try:
    from bs4 import BeautifulSoup
    is_bs4 = True
except ImportError:
    from BeautifulSoup import BeautifulSoup
    is_bs4 = False

__all__ = [

    # Main search function.
    'search',

    # Shortcut for "get lucky" search.
    'lucky',

    # Miscellaneous utility functions.
    'get_random_user_agent', 'get_tbs',
]

# URL templates to make Google searches.
url_home = "https://www.google.%(tld)s/"
url_search = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
             "btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
             "cr=%(country)s"
url_next_page = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
                "start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&" \
                "cr=%(country)s"
url_search_num = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
                 "num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
                 "cr=%(country)s"
url_next_page_num = "https://www.google.%(tld)s/search?hl=%(lang)s&" \
                    "q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
                    "safe=%(safe)s&cr=%(country)s"
url_parameters = (
    'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'cr')
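
# For illustration only (not used by the module itself): filling url_search
# with example values shows the kind of URL the code below requests. The
# values here are made up.
#
#   url_search % {'tld': 'com', 'lang': 'en', 'query': 'hello+world',
#                 'tbs': '0', 'safe': 'off', 'country': ''}
#   # -> 'https://www.google.com/search?hl=en&q=hello+world&'
#   #    'btnG=Google+Search&tbs=0&safe=off&cr='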

# Cookie jar. Stored at the user's home folder.
# If the cookie jar is inaccessible, the errors are ignored.
home_folder = os.getenv('HOME')
if not home_folder:
    home_folder = os.getenv('USERHOME')
    if not home_folder:
        home_folder = '.'  # Use the current folder on error.
cookie_jar = LWPCookieJar(os.path.join(home_folder, '.google-cookie'))
try:
    cookie_jar.load()
except Exception:
    pass

# Default user agent, unless instructed by the user to change it.
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'

# Load the list of valid user agents from the install folder.
# The search order is:
#   * user_agents.txt.gz
#   * user_agents.txt
#   * default user agent
try:
    install_folder = os.path.abspath(os.path.split(__file__)[0])
    try:
        user_agents_file = os.path.join(install_folder, 'user_agents.txt.gz')
        import gzip
        fp = gzip.open(user_agents_file, 'rb')
        try:
            user_agents_list = [_.strip() for _ in fp.readlines()]
        finally:
            fp.close()
            del fp
    except Exception:
        user_agents_file = os.path.join(install_folder, 'user_agents.txt')
        with open(user_agents_file) as fp:
            user_agents_list = [_.strip() for _ in fp.readlines()]
except Exception:
    user_agents_list = [USER_AGENT]


# Get a random user agent.
def get_random_user_agent():
    """
    Get a random user agent string.

    :rtype: str
    :return: Random user agent string.
    """
    return random.choice(user_agents_list)


# Helper function to format the tbs parameter.
def get_tbs(from_date, to_date):
    """
    Helper function to format the tbs parameter.

    :param datetime.date from_date: Python date object.
    :param datetime.date to_date: Python date object.

    :rtype: str
    :return: Dates encoded in tbs format.
    """
    from_date = from_date.strftime('%m/%d/%Y')
    to_date = to_date.strftime('%m/%d/%Y')
    return 'cdr:1,cd_min:%(from_date)s,cd_max:%(to_date)s' % vars()
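
# Illustrative example of the tbs format produced above (the dates are made
# up; this is only a sketch, not part of the original module):
#
#   >>> from datetime import date
#   >>> get_tbs(date(2019, 1, 1), date(2019, 12, 31))
#   'cdr:1,cd_min:01/01/2019,cd_max:12/31/2019'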


# Request the given URL and return the response page, using the cookie jar.
# If the cookie jar is inaccessible, the errors are ignored.
def get_page(url, user_agent=None, verify_ssl=True):
    """
    Request the given URL and return the response page, using the cookie jar.

    :param str url: URL to retrieve.
    :param str user_agent: User agent for the HTTP requests.
        Use None for the default.
    :param bool verify_ssl: Verify the SSL certificate to prevent
        traffic interception attacks. Defaults to True.

    :rtype: str
    :return: Web page retrieved for the given URL.

    :raises IOError: An exception is raised on error.
    :raises urllib2.URLError: An exception is raised on error.
    :raises urllib2.HTTPError: An exception is raised on error.
    """
    if user_agent is None:
        user_agent = USER_AGENT
    request = Request(url)
    request.add_header('User-Agent', user_agent)
    cookie_jar.add_cookie_header(request)
    if verify_ssl:
        response = urlopen(request)
    else:
        context = ssl._create_unverified_context()
        response = urlopen(request, context=context)
    cookie_jar.extract_cookies(response, request)
    html = response.read()
    response.close()
    try:
        cookie_jar.save()
    except Exception:
        pass
    return html
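
# Illustrative usage (kept as a comment because it performs a real HTTP
# request; the URL is just an example):
#
#   html = get_page('https://www.google.com/', get_random_user_agent())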


# Filter links found in the Google result pages HTML code.
# Returns None if the link doesn't yield a valid result.
def filter_result(link):
    try:

        # Decode hidden URLs.
        if link.startswith('/url?'):
            o = urlparse(link, 'http')
            link = parse_qs(o.query)['q'][0]

        # Valid results are absolute URLs not pointing to a Google domain,
        # like images.google.com or googleusercontent.com for example.
        # TODO this could be improved!
        o = urlparse(link, 'http')
        if o.netloc and 'google' not in o.netloc:
            return link

    # On error, return None.
    except Exception:
        pass
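
# Illustrative examples of what filter_result() keeps and drops (the URLs
# are made up):
#
#   filter_result('/url?q=http://example.com/&sa=U')    # -> 'http://example.com/'
#   filter_result('https://maps.google.com/somewhere')  # -> None (Google domain)
#   filter_result('/preferences?hl=en')                 # -> None (relative URL)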


# Returns a generator that yields URLs.
def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
           stop=None, pause=2.0, country='', extra_params=None,
           user_agent=None, verify_ssl=True):
    """
    Search the given query string using Google.

    :param str query: Query string. Must NOT be url-encoded.
    :param str tld: Top level domain.
    :param str lang: Language.
    :param str tbs: Time limits (e.g. "qdr:h" => last hour,
        "qdr:d" => last 24 hours, "qdr:m" => last month).
    :param str safe: Safe search.
    :param int num: Number of results per page.
    :param int start: First result to retrieve.
    :param int stop: Last result to retrieve.
        Use None to keep searching forever.
    :param float pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!
    :param str country: Country or region to focus the search on. Similar to
        changing the TLD, but does not yield exactly the same results.
        Only Google knows why...
    :param dict extra_params: A dictionary of extra HTTP GET
        parameters, which must be URL encoded. For example if you don't want
        Google to filter similar results you can set the extra_params to
        {'filter': '0'} which will append '&filter=0' to every query.
    :param str user_agent: User agent for the HTTP requests.
        Use None for the default.
    :param bool verify_ssl: Verify the SSL certificate to prevent
        traffic interception attacks. Defaults to True.

    :rtype: generator of str
    :return: Generator (iterator) that yields found URLs.
        If the stop parameter is None the iterator will loop forever.
    """
    # Set of hashes for the results found.
    # This is used to avoid repeated results.
    hashes = set()

    # Count the number of links yielded.
    count = 0

    # Prepare the search string.
    query = quote_plus(query)

    # If no extra_params is given, create an empty dictionary.
    # We should avoid using an empty dictionary as a default value
    # in a function parameter in Python.
    if not extra_params:
        extra_params = {}

    # Check extra_params for overlapping.
    for builtin_param in url_parameters:
        if builtin_param in extra_params.keys():
            raise ValueError(
                'GET parameter "%s" is overlapping with '
                'the built-in GET parameter' % builtin_param
            )

    # Grab the cookie from the home page.
    get_page(url_home % vars(), user_agent, verify_ssl)

    # Prepare the URL of the first request.
    if start:
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
    else:
        if num == 10:
            url = url_search % vars()
        else:
            url = url_search_num % vars()

    # Loop until we reach the maximum result, if any (otherwise, loop forever).
    while not stop or count < stop:

        # Remember last count to detect the end of results.
        last_count = count

        # Append extra GET parameters to the URL.
        # This is done on every iteration because we're
        # rebuilding the entire URL at the end of this loop.
        for k, v in extra_params.items():
            k = quote_plus(k)
            v = quote_plus(v)
            url = url + ('&%s=%s' % (k, v))

        # Sleep between requests.
        # Keeps Google from banning you for making too many requests.
        time.sleep(pause)

        # Request the Google Search results page.
        html = get_page(url, user_agent, verify_ssl)

        # Parse the response and get every anchored URL.
        if is_bs4:
            soup = BeautifulSoup(html, 'html.parser')
        else:
            soup = BeautifulSoup(html)
        try:
            anchors = soup.find(id='search').findAll('a')
            # Sometimes (depending on the User-agent) there is
            # no id "search" in html response...
        except AttributeError:
            # Remove links of the top bar.
            gbar = soup.find(id='gbar')
            if gbar:
                gbar.clear()
            anchors = soup.findAll('a')

        # Process every anchored URL.
        for a in anchors:

            # Get the URL from the anchor tag.
            try:
                link = a['href']
            except KeyError:
                continue

            # Filter invalid links and links pointing to Google itself.
            link = filter_result(link)
            if not link:
                continue

            # Discard repeated results.
            h = hash(link)
            if h in hashes:
                continue
            hashes.add(h)

            # Yield the result.
            yield link

            # Increase the results counter.
            # If we reached the limit, stop.
            count += 1
            if stop and count >= stop:
                return

        # End if there are no more results.
        # XXX TODO review this logic, not sure if this is still true!
        if last_count == count:
            break

        # Prepare the URL for the next request.
        start += num
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
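

# Illustrative usage of search() (kept as a comment because it performs real
# HTTP requests; the query and limits are made up):
#
#   for url in search('python web scraping', stop=10, pause=2.0):
#       print(url)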


# Shortcut to single-item search.
# Evaluates the iterator to return the single URL as a string.
def lucky(*args, **kwargs):
    """
    Shortcut to single-item search.

    Same arguments as the main search function, but the return value changes.

    :rtype: str
    :return: URL found by Google.
    """
    return next(search(*args, **kwargs))
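

# Minimal command line demo, added here only as an illustrative sketch (it is
# not part of the original public API). It searches for the terms given on
# the command line and prints the first ten results, so it performs real HTTP
# requests to Google when this file is run directly.
if __name__ == '__main__':
    demo_query = ' '.join(sys.argv[1:]) or 'python'
    for result_url in search(demo_query, stop=10, pause=2.0):
        print(result_url)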