venv/lib/python3.12/site-packages/googlesearch/__init__.py (new file, 376 lines)
@@ -0,0 +1,376 @@
#!/usr/bin/env python

# Copyright (c) 2009-2020, Mario Vilas
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the copyright holder nor the names of its
#       contributors may be used to endorse or promote products derived from
#       this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import os
import random
import sys
import time
import ssl

if sys.version_info[0] > 2:
    from http.cookiejar import LWPCookieJar
    from urllib.request import Request, urlopen
    from urllib.parse import quote_plus, urlparse, parse_qs
else:
    from cookielib import LWPCookieJar
    from urllib import quote_plus
    from urllib2 import Request, urlopen
    from urlparse import urlparse, parse_qs

try:
    from bs4 import BeautifulSoup
    is_bs4 = True
except ImportError:
    from BeautifulSoup import BeautifulSoup
    is_bs4 = False

__all__ = [

    # Main search function.
    'search',

    # Shortcut for "get lucky" search.
    'lucky',

    # Miscellaneous utility functions.
    'get_random_user_agent', 'get_tbs',
]

# URL templates to make Google searches.
url_home = "https://www.google.%(tld)s/"
url_search = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
             "btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
             "cr=%(country)s"
url_next_page = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
                "start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&" \
                "cr=%(country)s"
url_search_num = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
                 "num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
                 "cr=%(country)s"
url_next_page_num = "https://www.google.%(tld)s/search?hl=%(lang)s&" \
                    "q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
                    "safe=%(safe)s&cr=%(country)s"
url_parameters = (
    'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'cr')

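# For illustration only (not part of the original module): the templates above
# are expanded with old-style %-formatting against a dict of keyword values,
# which is what the `% vars()` calls further down do with local variables.
#
#     url_search % {'tld': 'com', 'lang': 'en', 'query': 'hello+world',
#                   'tbs': '0', 'safe': 'off', 'country': ''}
#     # -> 'https://www.google.com/search?hl=en&q=hello+world&'
#     #    'btnG=Google+Search&tbs=0&safe=off&cr='
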
# Cookie jar. Stored at the user's home folder.
# If the cookie jar is inaccessible, the errors are ignored.
home_folder = os.getenv('HOME')
if not home_folder:
    home_folder = os.getenv('USERHOME')
    if not home_folder:
        home_folder = '.'  # Use the current folder on error.
cookie_jar = LWPCookieJar(os.path.join(home_folder, '.google-cookie'))
try:
    cookie_jar.load()
except Exception:
    pass

# Default user agent, unless instructed by the user to change it.
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'

# Load the list of valid user agents from the install folder.
# The search order is:
#     * user_agents.txt.gz
#     * user_agents.txt
#     * default user agent
try:
    install_folder = os.path.abspath(os.path.split(__file__)[0])
    try:
        user_agents_file = os.path.join(install_folder, 'user_agents.txt.gz')
        import gzip
        fp = gzip.open(user_agents_file, 'rb')
        try:
            user_agents_list = [_.strip() for _ in fp.readlines()]
        finally:
            fp.close()
            del fp
    except Exception:
        user_agents_file = os.path.join(install_folder, 'user_agents.txt')
        with open(user_agents_file) as fp:
            user_agents_list = [_.strip() for _ in fp.readlines()]
except Exception:
    user_agents_list = [USER_AGENT]


# Get a random user agent.
def get_random_user_agent():
    """
    Get a random user agent string.

    :rtype: str
    :return: Random user agent string.
    """
    return random.choice(user_agents_list)


# Helper function to format the tbs parameter.
def get_tbs(from_date, to_date):
    """
    Helper function to format the tbs parameter.

    :param datetime.date from_date: Python date object.
    :param datetime.date to_date: Python date object.

    :rtype: str
    :return: Dates encoded in tbs format.
    """
    from_date = from_date.strftime('%m/%d/%Y')
    to_date = to_date.strftime('%m/%d/%Y')
    return 'cdr:1,cd_min:%(from_date)s,cd_max:%(to_date)s' % vars()

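# A quick sketch of how get_tbs() is meant to be used (illustrative, not part
# of the original module): the returned string can be passed as the `tbs`
# argument of search() to restrict results to a date range.
#
#     import datetime
#     get_tbs(datetime.date(2023, 1, 1), datetime.date(2023, 6, 30))
#     # -> 'cdr:1,cd_min:01/01/2023,cd_max:06/30/2023'

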
# Request the given URL and return the response page, using the cookie jar.
# If the cookie jar is inaccessible, the errors are ignored.
def get_page(url, user_agent=None, verify_ssl=True):
    """
    Request the given URL and return the response page, using the cookie jar.

    :param str url: URL to retrieve.
    :param str user_agent: User agent for the HTTP requests.
        Use None for the default.
    :param bool verify_ssl: Verify the SSL certificate to prevent
        traffic interception attacks. Defaults to True.

    :rtype: str
    :return: Web page retrieved for the given URL.

    :raises IOError: An exception is raised on error.
    :raises urllib2.URLError: An exception is raised on error.
    :raises urllib2.HTTPError: An exception is raised on error.
    """
    if user_agent is None:
        user_agent = USER_AGENT
    request = Request(url)
    request.add_header('User-Agent', user_agent)
    cookie_jar.add_cookie_header(request)
    if verify_ssl:
        response = urlopen(request)
    else:
        context = ssl._create_unverified_context()
        response = urlopen(request, context=context)
    cookie_jar.extract_cookies(response, request)
    html = response.read()
    response.close()
    try:
        cookie_jar.save()
    except Exception:
        pass
    return html


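# Illustrative usage sketch (not part of the original module): fetch a page
# with a randomly chosen user agent. verify_ssl=False should only be needed
# when an intercepting proxy makes certificate verification impossible.
#
#     html = get_page('https://www.google.com/',
#                     user_agent=get_random_user_agent(),
#                     verify_ssl=True)

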
# Filter links found in the Google result pages HTML code.
# Returns None if the link doesn't yield a valid result.
def filter_result(link):
    try:

        # Decode hidden URLs.
        if link.startswith('/url?'):
            o = urlparse(link, 'http')
            link = parse_qs(o.query)['q'][0]

        # Valid results are absolute URLs not pointing to a Google domain,
        # like images.google.com or googleusercontent.com for example.
        # TODO this could be improved!
        o = urlparse(link, 'http')
        if o.netloc and 'google' not in o.netloc:
            return link

    # On error, return None.
    except Exception:
        pass


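# How filter_result() behaves, for illustration (not part of the original
# module): Google wraps result URLs in '/url?q=...' redirects, which are
# unwrapped before filtering out relative links and Google's own domains.
#
#     filter_result('/url?q=https://example.com/page&sa=U')
#     # -> 'https://example.com/page'
#     filter_result('https://maps.google.com/x')  # -> None (Google domain)
#     filter_result('/search?q=foo')              # -> None (relative URL)

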
# Returns a generator that yields URLs.
def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
           stop=None, pause=2.0, country='', extra_params=None,
           user_agent=None, verify_ssl=True):
    """
    Search the given query string using Google.

    :param str query: Query string. Must NOT be url-encoded.
    :param str tld: Top level domain.
    :param str lang: Language.
    :param str tbs: Time limits (e.g. "qdr:h" => last hour,
        "qdr:d" => last 24 hours, "qdr:m" => last month).
    :param str safe: Safe search.
    :param int num: Number of results per page.
    :param int start: First result to retrieve.
    :param int stop: Last result to retrieve.
        Use None to keep searching forever.
    :param float pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!
    :param str country: Country or region to focus the search on. Similar to
        changing the TLD, but does not yield exactly the same results.
        Only Google knows why...
    :param dict extra_params: A dictionary of extra HTTP GET
        parameters, which must be URL encoded. For example, if you don't want
        Google to filter similar results you can set extra_params to
        {'filter': '0'}, which will append '&filter=0' to every query.
    :param str user_agent: User agent for the HTTP requests.
        Use None for the default.
    :param bool verify_ssl: Verify the SSL certificate to prevent
        traffic interception attacks. Defaults to True.

    :rtype: generator of str
    :return: Generator (iterator) that yields found URLs.
        If the stop parameter is None the iterator will loop forever.
    """
    # Set of hashes for the results found.
    # This is used to avoid repeated results.
    hashes = set()

    # Count the number of links yielded.
    count = 0

    # Prepare the search string.
    query = quote_plus(query)

    # If no extra_params is given, create an empty dictionary.
    # We should avoid using an empty dictionary as a default value
    # in a function parameter in Python.
    if not extra_params:
        extra_params = {}

    # Check extra_params for overlap with the built-in GET parameters.
    for builtin_param in url_parameters:
        if builtin_param in extra_params.keys():
            raise ValueError(
                'GET parameter "%s" is overlapping with '
                'the built-in GET parameter' % builtin_param
            )

    # Grab the cookie from the home page.
    get_page(url_home % vars(), user_agent, verify_ssl)

    # Prepare the URL of the first request.
    if start:
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
    else:
        if num == 10:
            url = url_search % vars()
        else:
            url = url_search_num % vars()

    # Loop until we reach the maximum result, if any (otherwise, loop forever).
    while not stop or count < stop:

        # Remember the last count to detect the end of results.
        last_count = count

        # Append extra GET parameters to the URL.
        # This is done on every iteration because we're
        # rebuilding the entire URL at the end of this loop.
        for k, v in extra_params.items():
            k = quote_plus(k)
            v = quote_plus(v)
            url = url + ('&%s=%s' % (k, v))

        # Sleep between requests.
        # Keeps Google from banning you for making too many requests.
        time.sleep(pause)

        # Request the Google Search results page.
        html = get_page(url, user_agent, verify_ssl)

        # Parse the response and get every anchored URL.
        if is_bs4:
            soup = BeautifulSoup(html, 'html.parser')
        else:
            soup = BeautifulSoup(html)
        try:
            anchors = soup.find(id='search').findAll('a')
            # Sometimes (depending on the User-Agent) there is
            # no id "search" in the html response...
        except AttributeError:
            # Remove links of the top bar.
            gbar = soup.find(id='gbar')
            if gbar:
                gbar.clear()
            anchors = soup.findAll('a')

        # Process every anchored URL.
        for a in anchors:

            # Get the URL from the anchor tag.
            try:
                link = a['href']
            except KeyError:
                continue

            # Filter invalid links and links pointing to Google itself.
            link = filter_result(link)
            if not link:
                continue

            # Discard repeated results.
            h = hash(link)
            if h in hashes:
                continue
            hashes.add(h)

            # Yield the result.
            yield link

            # Increase the results counter.
            # If we reached the limit, stop.
            count += 1
            if stop and count >= stop:
                return

        # End if there are no more results.
        # XXX TODO review this logic, not sure if this is still true!
        if last_count == count:
            break

        # Prepare the URL for the next request.
        start += num
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()


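# A usage sketch for search(), for illustration (not part of the original
# module). The query is an arbitrary example; expect a pause between requests,
# and be aware that Google may throttle or block aggressive scraping.
#
#     for url in search('python web scraping', tld='com', lang='en',
#                       num=10, stop=20, pause=2.0):
#         print(url)

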
# Shortcut to single-item search.
# Evaluates the iterator to return the single URL as a string.
def lucky(*args, **kwargs):
    """
    Shortcut to single-item search.

    Same arguments as the main search function, but the return value changes.

    :rtype: str
    :return: URL found by Google.
    """
    return next(search(*args, **kwargs))
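

# Minimal self-test sketch, added for illustration only (not in the original
# module). Running the file directly performs a small sample search; the query
# below is an arbitrary placeholder and network access is required.
if __name__ == '__main__':
    for result in search('site:python.org license', num=10, stop=10):
        print(result)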