venv/lib/python3.12/site-packages/googlesearch/__init__.py (new file, 376 lines)
@@ -0,0 +1,376 @@
#!/usr/bin/env python

# Copyright (c) 2009-2020, Mario Vilas
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the copyright holder nor the names of its
#       contributors may be used to endorse or promote products derived from
#       this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import os
import random
import sys
import time
import ssl

if sys.version_info[0] > 2:
    from http.cookiejar import LWPCookieJar
    from urllib.request import Request, urlopen
    from urllib.parse import quote_plus, urlparse, parse_qs
else:
    from cookielib import LWPCookieJar
    from urllib import quote_plus
    from urllib2 import Request, urlopen
    from urlparse import urlparse, parse_qs

try:
    from bs4 import BeautifulSoup
    is_bs4 = True
except ImportError:
    from BeautifulSoup import BeautifulSoup
    is_bs4 = False

__all__ = [

    # Main search function.
    'search',

    # Shortcut for "get lucky" search.
    'lucky',

    # Miscellaneous utility functions.
    'get_random_user_agent', 'get_tbs',
]

# URL templates to make Google searches.
url_home = "https://www.google.%(tld)s/"
url_search = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
             "btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
             "cr=%(country)s"
url_next_page = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
                "start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&" \
                "cr=%(country)s"
url_search_num = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
                 "num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
                 "cr=%(country)s"
url_next_page_num = "https://www.google.%(tld)s/search?hl=%(lang)s&" \
                    "q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
                    "safe=%(safe)s&cr=%(country)s"
url_parameters = (
    'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'cr')

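# For illustration only (not part of the original module): the templates above
# are expanded with old-style %-formatting against a dict of keyword values,
# which is what the `% vars()` calls further down do with local variables.
#
#     url_search % {'tld': 'com', 'lang': 'en', 'query': 'hello+world',
#                   'tbs': '0', 'safe': 'off', 'country': ''}
#     # -> 'https://www.google.com/search?hl=en&q=hello+world&'
#     #    'btnG=Google+Search&tbs=0&safe=off&cr='
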
# Cookie jar. Stored at the user's home folder.
# If the cookie jar is inaccessible, the errors are ignored.
home_folder = os.getenv('HOME')
if not home_folder:
    home_folder = os.getenv('USERHOME')
    if not home_folder:
        home_folder = '.'  # Use the current folder on error.
cookie_jar = LWPCookieJar(os.path.join(home_folder, '.google-cookie'))
try:
    cookie_jar.load()
except Exception:
    pass

# Default user agent, unless instructed by the user to change it.
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'

# Load the list of valid user agents from the install folder.
# The search order is:
#     * user_agents.txt.gz
#     * user_agents.txt
#     * default user agent
try:
    install_folder = os.path.abspath(os.path.split(__file__)[0])
    try:
        user_agents_file = os.path.join(install_folder, 'user_agents.txt.gz')
        import gzip
        fp = gzip.open(user_agents_file, 'rb')
        try:
            user_agents_list = [_.strip() for _ in fp.readlines()]
        finally:
            fp.close()
            del fp
    except Exception:
        user_agents_file = os.path.join(install_folder, 'user_agents.txt')
        with open(user_agents_file) as fp:
            user_agents_list = [_.strip() for _ in fp.readlines()]
except Exception:
    user_agents_list = [USER_AGENT]


# Get a random user agent.
def get_random_user_agent():
    """
    Get a random user agent string.

    :rtype: str
    :return: Random user agent string.
    """
    return random.choice(user_agents_list)


# Helper function to format the tbs parameter.
def get_tbs(from_date, to_date):
    """
    Helper function to format the tbs parameter.

    :param datetime.date from_date: Python date object.
    :param datetime.date to_date: Python date object.

    :rtype: str
    :return: Dates encoded in tbs format.
    """
    from_date = from_date.strftime('%m/%d/%Y')
    to_date = to_date.strftime('%m/%d/%Y')
    return 'cdr:1,cd_min:%(from_date)s,cd_max:%(to_date)s' % vars()

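# A quick sketch of how get_tbs() is meant to be used (illustrative, not part
# of the original module): the returned string can be passed as the `tbs`
# argument of search() to restrict results to a date range.
#
#     import datetime
#     get_tbs(datetime.date(2023, 1, 1), datetime.date(2023, 6, 30))
#     # -> 'cdr:1,cd_min:01/01/2023,cd_max:06/30/2023'

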
# Request the given URL and return the response page, using the cookie jar.
# If the cookie jar is inaccessible, the errors are ignored.
def get_page(url, user_agent=None, verify_ssl=True):
    """
    Request the given URL and return the response page, using the cookie jar.

    :param str url: URL to retrieve.
    :param str user_agent: User agent for the HTTP requests.
        Use None for the default.
    :param bool verify_ssl: Verify the SSL certificate to prevent
        traffic interception attacks. Defaults to True.

    :rtype: str
    :return: Web page retrieved for the given URL.

    :raises IOError: An exception is raised on error.
    :raises urllib2.URLError: An exception is raised on error.
    :raises urllib2.HTTPError: An exception is raised on error.
    """
    if user_agent is None:
        user_agent = USER_AGENT
    request = Request(url)
    request.add_header('User-Agent', user_agent)
    cookie_jar.add_cookie_header(request)
    if verify_ssl:
        response = urlopen(request)
    else:
        context = ssl._create_unverified_context()
        response = urlopen(request, context=context)
    cookie_jar.extract_cookies(response, request)
    html = response.read()
    response.close()
    try:
        cookie_jar.save()
    except Exception:
        pass
    return html


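# Illustrative usage sketch (not part of the original module): fetch a page
# with a randomly chosen user agent. verify_ssl=False should only be needed
# when an intercepting proxy makes certificate verification impossible.
#
#     html = get_page('https://www.google.com/',
#                     user_agent=get_random_user_agent(),
#                     verify_ssl=True)

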
# Filter links found in the Google result pages HTML code.
# Returns None if the link doesn't yield a valid result.
def filter_result(link):
    try:

        # Decode hidden URLs.
        if link.startswith('/url?'):
            o = urlparse(link, 'http')
            link = parse_qs(o.query)['q'][0]

        # Valid results are absolute URLs not pointing to a Google domain,
        # like images.google.com or googleusercontent.com for example.
        # TODO this could be improved!
        o = urlparse(link, 'http')
        if o.netloc and 'google' not in o.netloc:
            return link

    # On error, return None.
    except Exception:
        pass


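# How filter_result() behaves, for illustration (not part of the original
# module): Google wraps result URLs in '/url?q=...' redirects, which are
# unwrapped before filtering out relative links and Google's own domains.
#
#     filter_result('/url?q=https://example.com/page&sa=U')
#     # -> 'https://example.com/page'
#     filter_result('https://maps.google.com/x')  # -> None (Google domain)
#     filter_result('/search?q=foo')              # -> None (relative URL)

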
# Returns a generator that yields URLs.
def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
           stop=None, pause=2.0, country='', extra_params=None,
           user_agent=None, verify_ssl=True):
    """
    Search the given query string using Google.

    :param str query: Query string. Must NOT be url-encoded.
    :param str tld: Top level domain.
    :param str lang: Language.
    :param str tbs: Time limits (e.g. "qdr:h" => last hour,
        "qdr:d" => last 24 hours, "qdr:m" => last month).
    :param str safe: Safe search.
    :param int num: Number of results per page.
    :param int start: First result to retrieve.
    :param int stop: Last result to retrieve.
        Use None to keep searching forever.
    :param float pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!
    :param str country: Country or region to focus the search on. Similar to
        changing the TLD, but does not yield exactly the same results.
        Only Google knows why...
    :param dict extra_params: A dictionary of extra HTTP GET
        parameters, which must be URL encoded. For example, if you don't want
        Google to filter similar results you can set extra_params to
        {'filter': '0'}, which will append '&filter=0' to every query.
    :param str user_agent: User agent for the HTTP requests.
        Use None for the default.
    :param bool verify_ssl: Verify the SSL certificate to prevent
        traffic interception attacks. Defaults to True.

    :rtype: generator of str
    :return: Generator (iterator) that yields found URLs.
        If the stop parameter is None the iterator will loop forever.
    """
    # Set of hashes for the results found.
    # This is used to avoid repeated results.
    hashes = set()

    # Count the number of links yielded.
    count = 0

    # Prepare the search string.
    query = quote_plus(query)

    # If no extra_params is given, create an empty dictionary.
    # We should avoid using an empty dictionary as a default value
    # in a function parameter in Python.
    if not extra_params:
        extra_params = {}

    # Check extra_params for overlap with the built-in GET parameters.
    for builtin_param in url_parameters:
        if builtin_param in extra_params.keys():
            raise ValueError(
                'GET parameter "%s" is overlapping with '
                'the built-in GET parameter' % builtin_param
            )

    # Grab the cookie from the home page.
    get_page(url_home % vars(), user_agent, verify_ssl)

    # Prepare the URL of the first request.
    if start:
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
    else:
        if num == 10:
            url = url_search % vars()
        else:
            url = url_search_num % vars()

    # Loop until we reach the maximum result, if any (otherwise, loop forever).
    while not stop or count < stop:

        # Remember the last count to detect the end of results.
        last_count = count

        # Append extra GET parameters to the URL.
        # This is done on every iteration because we're
        # rebuilding the entire URL at the end of this loop.
        for k, v in extra_params.items():
            k = quote_plus(k)
            v = quote_plus(v)
            url = url + ('&%s=%s' % (k, v))

        # Sleep between requests.
        # Keeps Google from banning you for making too many requests.
        time.sleep(pause)

        # Request the Google Search results page.
        html = get_page(url, user_agent, verify_ssl)

        # Parse the response and get every anchored URL.
        if is_bs4:
            soup = BeautifulSoup(html, 'html.parser')
        else:
            soup = BeautifulSoup(html)
        try:
            anchors = soup.find(id='search').findAll('a')
            # Sometimes (depending on the User-Agent) there is
            # no id "search" in the html response...
        except AttributeError:
            # Remove links of the top bar.
            gbar = soup.find(id='gbar')
            if gbar:
                gbar.clear()
            anchors = soup.findAll('a')

        # Process every anchored URL.
        for a in anchors:

            # Get the URL from the anchor tag.
            try:
                link = a['href']
            except KeyError:
                continue

            # Filter invalid links and links pointing to Google itself.
            link = filter_result(link)
            if not link:
                continue

            # Discard repeated results.
            h = hash(link)
            if h in hashes:
                continue
            hashes.add(h)

            # Yield the result.
            yield link

            # Increase the results counter.
            # If we reached the limit, stop.
            count += 1
            if stop and count >= stop:
                return

        # End if there are no more results.
        # XXX TODO review this logic, not sure if this is still true!
        if last_count == count:
            break

        # Prepare the URL for the next request.
        start += num
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()


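# A usage sketch for search(), for illustration (not part of the original
# module). The query is an arbitrary example; expect a pause between requests,
# and be aware that Google may throttle or block aggressive scraping.
#
#     for url in search('python web scraping', tld='com', lang='en',
#                       num=10, stop=20, pause=2.0):
#         print(url)

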
# Shortcut to single-item search.
# Evaluates the iterator to return the single URL as a string.
def lucky(*args, **kwargs):
    """
    Shortcut to single-item search.

    Same arguments as the main search function, but the return value changes.

    :rtype: str
    :return: URL found by Google.
    """
    return next(search(*args, **kwargs))
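

# Minimal self-test sketch, added for illustration only (not in the original
# module). Running the file directly performs a small sample search; the query
# below is an arbitrary placeholder and network access is required.
if __name__ == '__main__':
    for result in search('site:python.org license', num=10, stop=10):
        print(result)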