
from time import sleep  # sleep() pauses execution for a given number of seconds
from selenium.webdriver import Edge, EdgeOptions  # Edge driver and its options class
from selenium.webdriver.common.by import By  # locator strategies for find_element
from lxml import etree  # etree parses the HTML source


def huya():  # scrape the Huya LOL live-stream listing
    options = EdgeOptions()  # create an EdgeOptions object
    edge = Edge(options=options)  # start an Edge browser instance with those options
    # Inject a script into every new document to hide the webdriver flag
    edge.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
         Object.defineProperty(navigator, 'webdriver', {
         get: () => false
         })
       """
    })
    edge.get("https://www.huya.com/g/lol")  # open the target page
    edge.implicitly_wait(10)  # implicit wait: poll up to 10 s when locating elements
    edge.switch_to.frame('UDBSdkLgn_iframe')  # enter the login-popup iframe
    # Locate the element with id 'close-udbLogin' and click it to dismiss the popup
    edge.find_element(By.ID, 'close-udbLogin').click()
    edge.switch_to.default_content()  # leave the iframe, otherwise page_source below would return the iframe's HTML
    count = 1  # page counter
    while True:  # loop until there is no "next page" link left
        sleep(1)  # pause 1 s so the new page can render
        html = etree.HTML(edge.page_source)  # parse the current page source into an HTML tree
        names = html.xpath('//a[@class="title"]/text()')  # room titles: text of <a class="title">
        persons = html.xpath('//i[@class="nick"]/text()')  # streamer names: text of <i class="nick">
        hots = html.xpath('//i[@class="js-num"]/text()')  # popularity counts: text of <i class="js-num">
        for name, person, hot in zip(names, persons, hots):  # iterate over the three lists in parallel
            print(f'Room: {name}, Streamer: {person}, Popularity: {hot}')
        print(f'Page {count}')  # report the current page number
        count += 1
        try:
            next_btn = edge.find_element(By.XPATH, '//a[@class="laypage_next"]')  # locate the "next page" link
            next_btn.click()
        except Exception:  # no next-page link on the last page: leave the loop
            break

    edge.quit()  # close the browser


if __name__ == '__main__':  # run only when executed as a script
    huya()

If the sleep at the top of the while True loop is left out, the loop runs too fast and the next-page button never gets clicked. Shouldn't the implicit wait already be handling that?
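
One plausible explanation, going by how Selenium's two wait modes work: an implicit wait only kicks in while find_element is locating an element that is not in the DOM yet. Here the pager link already exists when the next iteration starts, so find_element returns at once with the old element, the click can fire before the list re-renders (or hit a stale element), and the bare except then treats that error as "no more pages" and breaks out early. The usual replacement for the fixed sleep is an explicit wait on a concrete condition. A minimal sketch, assuming the same driver and locator as the code above (WebDriverWait and expected_conditions are standard Selenium modules):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Inside the loop, instead of sleep(1): block (up to 10 s) until the
# next-page link is present, visible and enabled, then click it.
next_btn = WebDriverWait(edge, 10).until(
    EC.element_to_be_clickable((By.XPATH, '//a[@class="laypage_next"]'))
)
next_btn.click()

Even this only guarantees the link is clickable, not that the new page has rendered; a stricter check is to keep a reference to the first room title before clicking and wait for expected_conditions.staleness_of on it afterwards.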


Teacher, what is going on here?

PS D:\vscodeproject2\爬虫\Scarpy\scarpy05> & D:/python_env/spider2_env_/Scripts/Activate.ps1
(spider2_env_) PS D:\vscodeproject2\爬虫\Scarpy\scarpy05> & D:/python_env/spider2_env_/Scripts/python.exe d:/vscodeproject2/爬虫/Scarpy/scarpy05/scarpy05/begin.py
2023-12-26 22:01:30 [scrapy.utils.log] INFO: Scrapy 2.6.1 started (bot: scarpy05)
2023-12-26 22:01:30 [scrapy.utils.log] INFO: Versions: lxml 4.8.0.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 22.4.0, Python 3.10.0 (tags/v3.10.0:b494f59, Oct  4 2021, 19:00:18) [MSC v.1929 64 bit (AMD64)], pyOpenSSL 22.0.0 (OpenSSL 1.1.1n  15 Mar 2022), cryptography 36.0.2, Platform Windows-10-10.0.19045-SP0
2023-12-26 22:01:30 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'scarpy05',
 'NEWSPIDER_MODULE': 'scarpy05.spiders',
 'SPIDER_MODULES': ['scarpy05.spiders']}
2023-12-26 22:01:30 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2023-12-26 22:01:30 [scrapy.extensions.telnet] INFO: Telnet Password: 2e9c559873783f27
2023-12-26 22:01:30 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2023-12-26 22:01:30 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2023-12-26 22:01:30 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
Unhandled error in Deferred:
2023-12-26 22:01:30 [twisted] CRITICAL: Unhandled error in Deferred:

Traceback (most recent call last):
  File "D:\python_env\spider2_env_\lib\site-packages\scrapy\crawler.py", line 206, in crawl
    return self._crawl(crawler, *args, **kwargs)
  File "D:\python_env\spider2_env_\lib\site-packages\scrapy\crawler.py", line 210, in _crawl
    d = crawler.crawl(*args, **kwargs)
  File "D:\python_env\spider2_env_\lib\site-packages\twisted\internet\defer.py", line 1905, in unwindGenerator
    return _cancellableInlineCallbacks(gen)
  File "D:\python_env\spider2_env_\lib\site-packages\twisted\internet\defer.py", line 1815, in _cancellableInlineCallbacks
    _inlineCallbacks(None, gen, status)
--- <exception caught here> ---
  File "D:\python_env\spider2_env_\lib\site-packages\twisted\internet\defer.py", line 1660, in _inlineCallbacks
    result = current_context.run(gen.send, result)
  File "D:\python_env\spider2_env_\lib\site-packages\scrapy\crawler.py", line 102, in crawl
    self.engine = self._create_engine()
  File "D:\python_env\spider2_env_\lib\site-packages\scrapy\crawler.py", line 116, in _create_engine
    return ExecutionEngine(self, lambda _: self.stop())
  File "D:\python_env\spider2_env_\lib\site-packages\scrapy\core\engine.py", line 84, in __init__
    self.scraper = Scraper(crawler)
  File "D:\python_env\spider2_env_\lib\site-packages\scrapy\core\scraper.py", line 75, in __init__
    self.itemproc = itemproc_cls.from_crawler(crawler)
  File "D:\python_env\spider2_env_\lib\site-packages\scrapy\middleware.py", line 59, in from_crawler
    return cls.from_settings(crawler.settings, crawler)
  File "D:\python_env\spider2_env_\lib\site-packages\scrapy\middleware.py", line 41, in from_settings
    mw = create_instance(mwcls, settings, crawler)
  File "D:\python_env\spider2_env_\lib\site-packages\scrapy\utils\misc.py", line 166, in create_instance
    instance = objcls.from_crawler(crawler, *args, **kwargs)
  File "D:\python_env\spider2_env_\lib\site-packages\scrapy\pipelines\media.py", line 76, in from_crawler
    pipe = cls.from_settings(crawler.settings)
  File "D:\python_env\spider2_env_\lib\site-packages\scrapy\pipelines\images.py", line 112, in from_settings
    return cls(store_uri, settings=settings)
  File "D:\python_env\spider2_env_\lib\site-packages\scrapy\pipelines\images.py", line 55, in __init__
    super().__init__(store_uri, settings=settings, download_func=download_func)
  File "D:\python_env\spider2_env_\lib\site-packages\scrapy\pipelines\files.py", line 329, in __init__
    self.store = self._get_store(store_uri)
  File "D:\python_env\spider2_env_\lib\site-packages\scrapy\pipelines\files.py", line 378, in _get_store
    store_cls = self.STORE_SCHEMES[scheme]
builtins.KeyError: 'd'


(spider2_env_) PS D:\vscodeproject2\爬虫\Scarpy\scarpy05>
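
The last two frames give the cause away: _get_store takes the storage setting, extracts a URI scheme from it, and looks that scheme up in STORE_SCHEMES; the scheme it found here is the single letter 'd'. That happens when IMAGES_STORE (or FILES_STORE) begins with a bare drive letter that urlparse reads as a scheme, typically a Windows path written like 'd:images' with no slash after the colon. A sketch of the usual fix in settings.py, with the target directory itself hypothetical:

# settings.py
# Use a genuinely absolute Windows path (drive letter followed by \ or /),
# or a project-relative path; either way 'd' is no longer parsed as a scheme.
IMAGES_STORE = r'D:\vscodeproject2\images'   # hypothetical directory
# IMAGES_STORE = './images'                  # project-relative form also works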


