Contents
Python crawler: scraping actor photos for The Fate of the Furious from Douban Movies
Python crawler: saving Douyu danmu (live-chat) data to MongoDB
Python crawler: downloading Ximalaya radio audio
Python crawler: scraping all Shixiseng internship listings via packet-capture analysis
Python crawler: batch-downloading and saving high-resolution images from Huaban
Python crawler: scraping v2ex data and saving it as CSV
Python crawler: comparing the speed of three ways to scrape the Wandoujia Design Awards
Python crawler: parsing HTML with lxml and printing the extracted values
Python crawler: scraping dynamic data from Yidian Zixun with Selenium
Python crawler: scraping Amazon data with Selenium + XPath + bs4 and saving it to MongoDB
Python crawler: fetching the Heilongjiang University (HLJU) captcha and logging in
Python crawler: scraping actor photos for The Fate of the Furious from Douban Movies
- import urllib.request
- import os
- import re
-
-
- def douban(url):
-     # Fetch the cast page and pull out portrait URLs and actor names with regexes
-     r = urllib.request.urlopen(url)
-     html = r.read().decode('utf-8')
-     result = re.findall(r'https://img\d\.doubanio\.com/img/celebrity/medium/.*?\.jpg', html)
-     result2 = re.findall(r'(?<=title=").\S+', html)
-     result2.pop()
-     # Deduplicate the names while keeping their original order, then drop a stray non-name entry
-     result3 = sorted(set(result2), key=result2.index)
-     result3.pop(-3)
-     if not os.path.exists('douban'):
-         os.makedirs('douban')
-     for link, name in zip(result, result3):
-         filename = os.path.join('douban', str(name) + '.jpg')
-         # urlretrieve creates the file itself; opening it beforehand is unnecessary
-         urllib.request.urlretrieve(link, filename)
-
-
- url = 'https://movie.douban.com/subject/26260853/celebrities'
- if __name__ == '__main__':
-     douban(url)
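The script above relies on urllib and fairly brittle regexes. As a hedged alternative sketch (not part of the original post), the download step could use requests instead, which makes it easier to set a User-Agent and check the status code; the folder name and header value below are illustrative assumptions.
- import os
- import requests
- 
- def download_image(link, name, folder='douban'):
-     # Sketch only: fetch one portrait with requests and write the raw bytes to disk
-     os.makedirs(folder, exist_ok=True)
-     headers = {'User-Agent': 'Mozilla/5.0'}  # Douban may reject requests without a UA
-     resp = requests.get(link, headers=headers, timeout=10)
-     resp.raise_for_status()
-     with open(os.path.join(folder, name + '.jpg'), 'wb') as f:
-         f.write(resp.content)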
Python crawler: saving Douyu danmu (live-chat) data to MongoDB
- # Capture danmu messages and save each sender's uid, nickname, level and message text to MongoDB
- __author__ = '布咯咯_rieuse'
- __time__ = '2017.6.2'
- __github__ = 'https://github.com/rieuse'
-
- import multiprocessing
- import re
- import socket
- import time
-
- import pymongo
- import requests
- from bs4 import BeautifulSoup
-
- clients = pymongo.MongoClient('localhost')
- db = clients["DouyuTV_danmu"]
- col = db["info"]
-
- client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- host = socket.gethostbyname("openbarrage.douyutv.com")
- port = 8601
- client.connect((host, port))
-
- danmu_path = re.compile(b'txt@=(.+?)/cid@')
- uid_path = re.compile(b'uid@=(.+?)/nn@')
- nickname_path = re.compile(b'nn@=(.+?)/txt@')
- level_path = re.compile(b'level@=([1-9][0-9]?)/sahf')
-
-
- def sendmsg(msgstr):
-     # Frame a message for Douyu's barrage server: two little-endian length fields,
-     # then the client-to-server message type code (689), then the body
-     msg = msgstr.encode('utf-8')
-     data_length = len(msg) + 8
-     code = 689
-     msgHead = int.to_bytes(data_length, 4, 'little') \
-               + int.to_bytes(data_length, 4, 'little') + int.to_bytes(code, 4, 'little')
-     client.send(msgHead)
-     # send() may not write everything at once, so loop until the whole body has gone out
-     sent = 0
-     while sent < len(msg):
-         tn = client.send(msg[sent:])
-         sent = sent + tn
-
-
- def start(roomid):
-     # Log in to the room and join the danmu group (-9999) so the server starts pushing messages
-     msg = 'type@=loginreq/username@=rieuse/password@=douyu/roomid@={}/\0'.format(roomid)
-     sendmsg(msg)
-     msg_more = 'type@=joingroup/rid@={}/gid@=-9999/\0'.format(roomid)
-     sendmsg(msg_more)
-
-     print("--------------- Connected to {}'s live room ---------------".format(get_name(roomid)))
-     while True:
-         data = client.recv(1024)
-         if not data:
-             break
-         uid_more = uid_path.findall(data)
-         nickname_more = nickname_path.findall(data)
-         level_more = level_path.findall(data)
-         danmu_more = danmu_path.findall(data)
-         if not level_more:
-             level_more = [b'0']  # default level when the packet carries none
-         for i in range(0, len(danmu_more)):
-             try:
-                 product = {
-                     'uid': uid_more[i].decode(encoding='utf-8'),
-                     'nickname': nickname_more[i].decode(encoding='utf-8'),
-                     'level': level_more[i].decode(encoding='utf-8') if i < len(level_more) else '0',
-                     'danmu': danmu_more[i].decode(encoding='utf-8')
-                 }
-                 print(product)
-                 col.insert_one(product)  # insert() is deprecated in PyMongo 3 and removed in 4
-                 print('Saved to MongoDB')
-             except Exception as e:
-                 print(e)
-
-
- def keeplive():
-     while True:
-         msg = 'type@=keeplive/tick@=' + str(int(time.time())) + '/\0'
-         sendmsg(msg)
-         time.sleep(15)
-
-
- def get_name(roomid):
-     r = requests.get("http://www.douyu.com/" + roomid)
-     soup = BeautifulSoup(r.text, 'lxml')
-     # find() takes a dict of attributes, not a set
-     return soup.find('a', {'class': 'zb-name'}).string
-
-
- if __name__ == '__main__':
-     room_id = input('Enter the room ID: ')
-     p1 = multiprocessing.Process(target=start, args=(room_id,))
-     p2 = multiprocessing.Process(target=keeplive)
-     p1.start()
-     p2.start()
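One caveat with this layout: start and keeplive run in separate processes, so on platforms that spawn rather than fork (for example Windows) each process re-imports the module and opens its own socket, and the keepalives never reach the connection that is actually receiving danmu. A minimal sketch of the same structure with threads, which do share the single socket (illustrative only, reusing the functions defined above):
- import threading
- 
- if __name__ == '__main__':
-     room_id = input('Enter the room ID: ')
-     # Daemon thread: the keepalive loop dies together with the main program
-     threading.Thread(target=keeplive, daemon=True).start()
-     start(room_id)  # receive and store danmu in the main thread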
Python crawler: downloading Ximalaya radio audio
- __author__ = '布咯咯_rieuse'
-
- import json
- import random
- import time
- import pymongo
- import requests
- from bs4 import BeautifulSoup
- from lxml import etree
-
- clients = pymongo.MongoClient('localhost')
- db = clients["XiMaLaYa"]
- col1 = db["album2"]
- col2 = db["detaile2"]
-
- UA_LIST = [
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
- "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
- "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
- "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
- "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
- "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
- "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
- "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
- "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
- "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
- "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
- "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
- "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
- "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
- "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
- ]
- headers1 = {
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
- 'Accept-Encoding': 'gzip, deflate, sdch',
- 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
- 'Cache-Control': 'max-age=0',
- 'Proxy-Connection': 'keep-alive',
- 'Upgrade-Insecure-Requests': '1',
- 'User-Agent': random.choice(UA_LIST)
- }
- headers2 = {
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
- 'Accept-Encoding': 'gzip, deflate, sdch',
- 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
- 'Cache-Control': 'max-age=0',
- 'Proxy-Connection': 'keep-alive',
- 'Referer': 'http://www.ximalaya.com/dq/all/2',
- 'Upgrade-Insecure-Requests': '1',
- 'User-Agent': random.choice(UA_LIST)
- }
-
-
- def get_url():
-     # Walk the 84 category pages and record every album's link, title and cover image
-     start_urls = ['http://www.ximalaya.com/dq/all/{}'.format(num) for num in range(1, 85)]
-     for start_url in start_urls:
-         html = requests.get(start_url, headers=headers1).text
-         soup = BeautifulSoup(html, 'lxml')
-         for item in soup.find_all(class_="albumfaceOutter"):
-             content = {
-                 'href': item.a['href'],
-                 'title': item.img['alt'],
-                 'img_url': item.img['src']
-             }
-             col1.insert_one(content)
-             print('Saved one album: ' + item.a['href'])
-             print(content)
-             another(item.a['href'])
-         time.sleep(1)
-
-
- def another(url):
-     # Check whether the album is paginated; if so, crawl every extra page
-     html = requests.get(url, headers=headers2).text
-     ifanother = etree.HTML(html).xpath('//div[@class="pagingBar_wrapper"]/a[last()-1]/@data-page')
-     if len(ifanother):
-         num = ifanother[0]
-         print('This album has ' + num + ' pages')
-         # page 1 is covered by the get_m4a(url) call below, so start from page 2
-         for n in range(2, int(num) + 1):
-             print('Parsing page {} of {}'.format(n, num))
-             url2 = url + '?page={}'.format(n)
-             get_m4a(url2)
-     get_m4a(url)
-
-
- def get_m4a(url):
-     # Each track id has a JSON endpoint that contains the audio URL and metadata
-     time.sleep(1)
-     html = requests.get(url, headers=headers2).text
-     numlist = etree.HTML(html).xpath('//div[@class="personal_body"]/@sound_ids')[0].split(',')
-     for i in numlist:
-         murl = 'http://www.ximalaya.com/tracks/{}.json'.format(i)
-         html = requests.get(murl, headers=headers1).text
-         dic = json.loads(html)
-         col2.insert_one(dic)
-         print('Data from ' + murl + ' inserted into MongoDB')
-
-
- if __name__ == '__main__':
-     get_url()
Python crawler: scraping all Shixiseng internship listings via packet-capture analysis
- import json
- import requests
- import pymongo
- import time
-
- clients = pymongo.MongoClient('localhost')
- db = clients["Shixiseng"]
- col = db["detail_info"]
-
- # Paged JSON endpoint found by packet capture; c=%E5%85%A8%E5%9B%BD is the URL-encoded "nationwide" filter
- urls = ['http://www.shixiseng.com/app/internsvt?c=%E5%85%A8%E5%9B%BD&p={}&t=hot'.format(n) for n in range(1, 3487)]
- for url in urls:
-     print(url)
-     r = requests.get(url)
-     html = r.content.decode('utf-8')
-     content = json.loads(html)['msg']['b']
-     for i in content:
-         print('Inserting one record:')
-         print(i)
-         col.insert_one(i)
-     time.sleep(0.01)
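Hitting close to 3,500 pages back to back with a bare requests.get is fragile: one network hiccup aborts the whole run. A hedged sketch of the same request with a session, a User-Agent, and simple retry/backoff (the retry counts and delays are arbitrary choices, not from the original post):
- import time
- import requests
- 
- session = requests.Session()
- session.headers['User-Agent'] = 'Mozilla/5.0'
- 
- def fetch_json(url, retries=3, backoff=2):
-     # Try the request a few times, sleeping a little longer after each failure
-     for attempt in range(retries):
-         try:
-             r = session.get(url, timeout=10)
-             r.raise_for_status()
-             return r.json()
-         except requests.RequestException as e:
-             print('retry {} for {}: {}'.format(attempt + 1, url, e))
-             time.sleep(backoff * (attempt + 1))
-     return None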
Python crawler: batch-downloading and saving high-resolution images from Huaban
- __author__ = '布咯咯_rieuse'
-
- import os
- import lxml.html
- import requests
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.support.ui import WebDriverWait
-
- SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
- browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
- # browser = webdriver.Firefox()
- wait = WebDriverWait(browser, 5)
- browser.set_window_size(1400, 900)
-
-
- def parser(url, param):
-     # Parsing helper: load the page, wait for the given CSS selector, return an lxml tree
-     browser.get(url)
-     wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, param)))
-     html = browser.page_source
-     doc = lxml.html.fromstring(html)
-     return doc
-
-
- def get_main_url():
-     print('Opening the home page and collecting board links...')
-     try:
-         doc = parser('http://huaban.com/boards/favorite/beauty/', '#waterfall')
-         name = doc.xpath('//*[@id="waterfall"]/div/a[1]/div[2]/h3/text()')
-         u = doc.xpath('//*[@id="waterfall"]/div/a[1]/@href')
-         for item, fileName in zip(u, name):
-             main_url = 'http://huaban.com' + item
-             print('Board link found: ' + main_url)
-             # '*' is not allowed in Windows folder names, so strip it from the board title
-             if '*' in fileName:
-                 fileName = fileName.replace('*', '')
-             download(main_url, fileName)
-     except Exception as e:
-         print(e)
-
-
- def download(main_url, fileName):
-     print('------- Preparing to download -------')
-     try:
-         doc = parser(main_url, '#waterfall')
-         if not os.path.exists('image\\' + fileName):
-             print('Creating folder...')
-             os.makedirs('image\\' + fileName)
-         link = doc.xpath('//*[@id="waterfall"]/div/a/@href')
-         # print(link)
-         i = 0
-         for item in link:
-             i += 1
-             minor_url = 'http://huaban.com' + item
-             doc = parser(minor_url, '#pin_view_page')
-             # The full-size image may sit under an <a> or directly under the holder div
-             img_url = doc.xpath('//*[@id="baidu_image_holder"]/a/img/@src')
-             img_url2 = doc.xpath('//*[@id="baidu_image_holder"]/img/@src')
-             img_url += img_url2
-             try:
-                 url = 'http:' + str(img_url[0])
-                 print('Downloading image ' + str(i) + ' from ' + url)
-                 r = requests.get(url)
-                 filename = 'image\\{}\\'.format(fileName) + str(i) + '.jpg'
-                 with open(filename, 'wb') as fo:
-                     fo.write(r.content)
-             except Exception:
-                 print('Failed to download this image!')
-     except Exception:
-         print('Something went wrong!')
-
-
- if __name__ == '__main__':
-     get_main_url()
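PhantomJS has been discontinued and newer Selenium releases no longer ship a driver for it. As a hedged sketch (assuming Chrome and a matching chromedriver are installed), the browser setup above could be swapped for headless Chrome while the rest of the script stays the same:
- from selenium import webdriver
- 
- options = webdriver.ChromeOptions()
- options.add_argument('--headless')            # run without opening a window
- options.add_argument('--window-size=1400,900')
- browser = webdriver.Chrome(options=options)   # used exactly like the PhantomJS driver above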
Python crawler: scraping v2ex data and saving it as CSV
- import csv, requests, re
- from bs4 import BeautifulSoup
-
- url = 'https://www.v2ex.com/?tab=all'
- html = requests.get(url).text
- soup = BeautifulSoup(html, 'html.parser')
- articles = []
- for article in soup.find_all(class_='cell item'):
-     title = article.find(class_='item_title').get_text()
-     category = article.find(class_='node').get_text()
-     author = re.findall(r'(?<=<a href="/member/).+(?="><img)', str(article))[0]
-     u = article.select('.item_title > a')
-     # non-greedy match so only the href value is captured, not everything up to the last quote
-     link = 'https://www.v2ex.com' + re.findall(r'(?<=href=").+?(?=")', str(u))[0]
-     articles.append([title, category, author, link])
-
- # newline='' avoids blank rows on Windows; utf-8 keeps non-ASCII titles intact
- with open(r'document\v2ex.csv', 'w', newline='', encoding='utf-8') as f:
-     writer = csv.writer(f)
-     writer.writerow(['Title', 'Category', 'Author', 'Link'])
-     for row in articles:
-         writer.writerow(row)
Python crawler: comparing the speed of three ways to scrape the Wandoujia Design Awards
- __author__ = '布咯咯_rieuse'
-
- import asyncio
- import random
- import time
- import aiohttp
- import pymongo
- import requests
- import multiprocessing
- from bs4 import BeautifulSoup
-
- # Shared setup for all three methods
- clients = pymongo.MongoClient('localhost')
- db = clients["wandoujia"]
- col = db["info"]
-
- urls = ['http://www.wandoujia.com/award?page={}'.format(num) for num in range(1, 46)]
- UA_LIST = [
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
- "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
- "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
- "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
- "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
- "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
- "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
- "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
- "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
- "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
- "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
- "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
- "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
- "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
- "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
- "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
- ]
- headers = {
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
- 'Accept-Encoding': 'gzip, deflate, sdch',
- 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
- 'Connection': 'keep-alive',
- 'Host': 'www.wandoujia.com',
- 'User-Agent': random.choice(UA_LIST)
- }
-
- proxies = {
- 'http': 'http://123.206.6.17:3128',
- 'https': 'http://123.206.6.17:3128'
- }
-
-
- # Method 1: plain synchronous requests
- def method_1():
-     start = time.time()
-     for url in urls:
-         html = requests.get(url, headers=headers, proxies=proxies).text
-         soup = BeautifulSoup(html, 'lxml')
-         title = soup.find_all(class_='title')
-         app_title = soup.find_all(class_='app-title')
-         item_cover = soup.find_all(class_='item-cover')
-         icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
-         for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
-             content = {
-                 'title': title_i.get_text(),
-                 'app_title': app_title_i.get_text(),
-                 'item_cover': item_cover_i['data-original'],
-                 'icon_cover': icon_cover_i['data-original']
-             }
-             col.insert_one(content)
-             print('Inserted one record: ' + str(content))
-     print('Total time: ' + str(time.time() - start))
-
-
- # if __name__ == '__main__':
- # method_1()
-
-
-
-
-
- # Method 2: requests + a multiprocessing Pool
- def method_2(url):
-     html = requests.get(url, headers=headers, proxies=proxies).text
-     soup = BeautifulSoup(html, 'lxml')
-     title = soup.find_all(class_='title')
-     app_title = soup.find_all(class_='app-title')
-     item_cover = soup.find_all(class_='item-cover')
-     icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
-     for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
-         content = {
-             'title': title_i.get_text(),
-             'app_title': app_title_i.get_text(),
-             'item_cover': item_cover_i['data-original'],
-             'icon_cover': icon_cover_i['data-original']
-         }
-         # time.sleep(1)
-         col.insert_one(content)
-         print('Inserted one record: ' + str(content))
-
-
- # if __name__ == '__main__':
- # start = time.time()
- # pool = multiprocessing.Pool(4)
- # pool.map(method_2, urls)
- # pool.close()
- # pool.join()
- # print('一共用时:' + str(time.time() - start))
-
-
- # Method 3: asyncio + aiohttp, the asynchronous IO stack available since Python 3.4
-
- def method_3():
-     async def get_url(url):
-         # "async def" declares a coroutine function; calling it returns a coroutine object
-         async with aiohttp.ClientSession() as session:
-             async with session.get(url) as html:
-                 # "await" suspends this coroutine until the asynchronous IO completes
-                 response = await html.text(encoding="utf-8")
-                 return response
-
-     async def parser(url):
-         html = await get_url(url)
-         soup = BeautifulSoup(html, 'lxml')
-         title = soup.find_all(class_='title')
-         app_title = soup.find_all(class_='app-title')
-         item_cover = soup.find_all(class_='item-cover')
-         icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
-         for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
-             content = {
-                 'title': title_i.get_text(),
-                 'app_title': app_title_i.get_text(),
-                 'item_cover': item_cover_i['data-original'],
-                 'icon_cover': icon_cover_i['data-original']
-             }
-             col.insert_one(content)
-             print('Inserted one record: ' + str(content))
-
-     start = time.time()
-     loop = asyncio.get_event_loop()
-     tasks = [parser(url) for url in urls]
-     loop.run_until_complete(asyncio.gather(*tasks))
-     print(time.time() - start)
-
- if __name__ == '__main__':
-     method_3()
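On Python 3.7+ the event-loop bookkeeping at the end of method_3 is usually replaced with asyncio.run, which creates and closes the loop itself. A hedged sketch of how the tail of method_3 could look (the parser coroutine and urls are the ones defined above):
-     async def crawl_all():
-         # one parser() task per award page, run concurrently
-         await asyncio.gather(*[parser(url) for url in urls])
- 
-     start = time.time()
-     asyncio.run(crawl_all())  # replaces get_event_loop()/run_until_complete on Python 3.7+
-     print(time.time() - start)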
Python crawler: parsing HTML with lxml and printing the extracted values
- import requests
- import lxml.html
-
- url = 'http://news.ifeng.com/listpage/11502/0/1/rtlist.shtml'
- html = requests.get(url).text
- doc = lxml.html.fromstring(html)
- titles = doc.xpath('//div[@class="newsList"]/ul/li/a/text()')
- href = doc.xpath('//div[@class="newsList"]/ul/li/a/@href')
- # Pair each title with its link and print them
- for title, link in zip(titles, href):
-     results = {
-         'title': title,
-         'link': link
-     }
-     print(results)
Python crawler: scraping dynamic data from Yidian Zixun with Selenium
- from selenium.webdriver.common.keys import Keys
- from selenium import webdriver
- from bs4 import BeautifulSoup
- import csv
-
- driver = webdriver.Firefox()
- driver.implicitly_wait(3)
- first_url = 'http://www.yidianzixun.com/channel/c6'
- driver.get(first_url)
- driver.find_element_by_class_name('icon-refresh').click()
- # Send DOWN repeatedly so the page keeps loading more articles
- for i in range(1, 90):
-     driver.find_element_by_class_name('icon-refresh').send_keys(Keys.DOWN)
- soup = BeautifulSoup(driver.page_source, 'lxml')
- articles = []
- for article in soup.find_all(class_='item doc style-small-image style-content-middle'):
-     title = article.find(class_='doc-title').get_text()
-     source = article.find(class_='source').get_text()
-     comment = article.find(class_='comment-count').get_text()
-     link = 'http://www.yidianzixun.com' + article.get('href')
-     articles.append([title, source, comment, link])
- driver.quit()
- # newline='' avoids blank rows on Windows; utf-8 keeps Chinese text intact
- with open(r'document\yidian.csv', 'w', newline='', encoding='utf-8') as f:
-     writer = csv.writer(f)
-     writer.writerow(['Title', 'Source', 'Comments', 'Link'])
-     for row in articles:
-         writer.writerow(row)
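find_element_by_class_name and the other find_element_by_* helpers were removed in Selenium 4. If this script is run against a current Selenium, a hedged sketch of the equivalent calls (same driver, Keys import and class names as above) would be:
- from selenium.webdriver.common.by import By
- 
- # Selenium 4 style: a single find_element() that takes a locator strategy
- driver.find_element(By.CLASS_NAME, 'icon-refresh').click()
- for i in range(1, 90):
-     driver.find_element(By.CLASS_NAME, 'icon-refresh').send_keys(Keys.DOWN)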
Python crawler: scraping Amazon data with Selenium + XPath + bs4 and saving it to MongoDB
- from selenium.common.exceptions import TimeoutException
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium import webdriver
- from bs4 import BeautifulSoup
- import lxml.html
- import pymongo
- import re
-
- MONGO_URL = 'localhost'
- MONGO_DB = 'amazon'
- MONGO_TABLE = 'amazon-python'
- SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
- KEYWORD = 'python'
- client = pymongo.MongoClient(MONGO_URL)
- db = client[MONGO_DB]
-
- browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
- # browser = webdriver.Firefox()
- wait = WebDriverWait(browser, 10)
- browser.set_window_size(1400, 900)
-
-
- def search():
-     print('Searching...')
-     try:
-         browser.get('https://www.amazon.cn/')
-         # Wait for the search box and button, then submit the keyword
-         input = wait.until(
-             EC.presence_of_element_located((By.CSS_SELECTOR, '#twotabsearchtextbox'))
-         )
-         submit = wait.until(
-             EC.element_to_be_clickable((By.CSS_SELECTOR, '#nav-search > form > div.nav-right > div > input')))
-         input.send_keys(KEYWORD)
-         submit.click()
-         # The disabled pager entry holds the total number of result pages
-         total = wait.until(
-             EC.presence_of_element_located((By.CSS_SELECTOR, '#pagn > span.pagnDisabled')))
-         get_products()
-         print(total.text + ' pages in total')
-         return total.text
-     except TimeoutException:
-         return search()
-
-
- def next_page(number):
-     print('Turning to page', number)
-     try:
-         # '下一页' ("next page") is the label Amazon.cn renders on the next-page link, so keep it in Chinese
-         wait.until(EC.text_to_be_present_in_element(
-             (By.CSS_SELECTOR, '#pagnNextString'), '下一页'))
-         submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pagnNextString')))
-         submit.click()
-         wait.until(EC.text_to_be_present_in_element(
-             (By.CSS_SELECTOR, '.pagnCur'), str(number)))
-         get_products()
-     except TimeoutException:
-         next_page(number)
-
-
- def get_products():
-     try:
-         wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#s-results-list-atf')))
-         html = browser.page_source
-         # Use lxml/XPath for the publication dates and BeautifulSoup for the result cards
-         soup = BeautifulSoup(html, 'lxml')
-         doc = lxml.html.fromstring(html)
-         date = doc.xpath('//*[@class="s-result-item celwidget "]/div/div[2]/div[1]/span[2]/text()')
-         content = soup.find_all(attrs={"id": re.compile(r'result_\d+')})
-         for item, pub_date in zip(content, date):
-             product = {
-                 'title': item.find(class_='s-access-title').get_text(),
-                 'image': item.find(class_='s-access-image cfMarker').get('src'),
-                 'price': item.find(class_='a-size-base a-color-price s-price a-text-bold').get_text(),
-                 'date': pub_date
-             }
-             # save_to_mongo(product)
-             print(product)
-     except Exception as e:
-         print(e)
-
-
- def save_to_mongo(result):
-     try:
-         if db[MONGO_TABLE].insert_one(result):
-             print('Saved to MongoDB:', result)
-     except Exception:
-         print('Failed to save to MongoDB:', result)
-
-
- def main():
-     try:
-         total = int(search())
-         for i in range(2, total + 1):
-             next_page(i)
-     except Exception as e:
-         print('Something went wrong:', e)
-     finally:
-         browser.close()
-
-
- if __name__ == '__main__':
-     main()
Python crawler: fetching the Heilongjiang University (HLJU) captcha and logging in
- import requests
- from PIL import Image
-
- url1 = 'http://my.hlju.edu.cn/captchaGenerate.portal?'
- url2 = 'http://my.hlju.edu.cn/userPasswordValidate.portal'
- url3 = 'http://my.hlju.edu.cn/index.portal'
- headers = {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
- }
- s = requests.session()
- # url1 returns the captcha image itself, so just save the raw bytes and show them
- response = s.get(url1, headers=headers)
- with open('img/code.jpg', 'wb') as f:  # the img folder must already exist
-     f.write(response.content)
- img = Image.open('img/code.jpg')
- img.show()
- data = {}
- data['Login.Token1'] = '20154433'  # username (student ID)
- data['Login.Token2'] = ''          # password (left blank here)
- data['captcha'] = input('Enter the captcha: ')
- data['goto'] = 'http://my.hlju.edu.cn/loginSuccess.portal'
- data['gotoOnFail'] = 'http://my.hlju.edu.cn/loginFailure.portal'
- response2 = s.post(url=url2, data=data, headers=headers)
- response3 = s.get(url3, headers=headers)
- print(response3.text)