本文共 3346 字,大约阅读时间需要 11 分钟。
# -*- coding: utf-8 -*-
"""Scrape a JD.com search-results page (product name, shop, price).

Flow: open jd.com, type the keyword into the search box, press ENTER,
then load the search URL directly and parse the rendered HTML with bs4.
"""
from selenium import webdriver
# ActionChains: simulate mouse operations (kept from the original script)
from selenium.webdriver import ActionChains
# Keys: keyboard key constants (ENTER etc.)
from selenium.webdriver.common.keys import Keys
import time

from bs4 import BeautifulSoup

driver = webdriver.Chrome()
# Open the home page first so the search box element exists.
driver.get('https://www.jd.com/')

keyword = input('请输入你要查询的产品:')
# BUG FIX: the original built 'Search?' + keyword, dropping the 'keyword='
# query-parameter name, so driver.get(url) below loaded an empty search page.
url = 'https://search.jd.com/Search?keyword=' + keyword

# Locate the search box, type the keyword and press ENTER (simulated user input).
search = driver.find_element_by_xpath('//*[@id="key"]')
search.send_keys(keyword)
search.send_keys(Keys.ENTER)

# Also navigate to the (now correctly built) search URL, as the original did.
driver.get(url)

# JD renders results via ajax — give the page time to load before parsing.
time.sleep(2)
soup = BeautifulSoup(driver.page_source, 'lxml')
items = soup.find_all('li', class_='gl-item')

for item in items:
    # Product name.
    name = item.find('div', class_='p-name').get_text()
    print(name)
    # Shop name; guard against items with no shop tag — the original
    # crashed with AttributeError (NoneType.get_text) on those.
    shop_tag = item.find('span', class_='J_im_icon')
    shop = shop_tag.get_text() if shop_tag is not None else ''
    # Price: strip the currency sign and thousands separators.
    price = item.find('div', class_='p-price').text.replace('¥', '').replace(',', '')
    print(price)
打开selenium浏览器驱动进行抓取
1. 实例化一个谷歌浏览器驱动 2. 打开网址 3. 关键字 4. 分析网页 url 地址进行拼接 5. 使用 selenium 进行搜索框定位 6. 使用 selenium 库模拟用户键盘鼠标操作 7. 解析网页:使用 selenium 配合 bs4 进行解析 8. 分析网页标签 9. 进行数据提取 10. 保存数据。运行结果见下。# 函数代码
# -*- coding: utf-8 -*-
"""Function-wrapped version of the JD.com scraper (same flow as above):
fetch -> parse -> output, appending (name, price) rows to jingdong.csv.
"""
# BUG FIX: the original did `import selenium` but then used the bare name
# `webdriver`, which raised NameError at the first call.
from selenium import webdriver
import csv
import time

from bs4 import BeautifulSoup


def fetch(url):
    """Load *url* in a fresh Chrome driver and hand the rendered HTML to parse()."""
    driver = webdriver.Chrome()
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    parse(soup)


def parse(soup):
    """Select the name and "price" tags and forward them to output().

    NOTE(review): the original selected span.J_im_icon (the shop-icon span)
    as the price column; kept as-is to preserve the original output —
    confirm whether div.p-price was intended (the later version uses it).
    """
    names = soup.find_all('div', {'class': 'p-name'})
    prices = soup.find_all('span', {'class': 'J_im_icon'})
    output(names, prices)


def output(names, prices):
    """Print each (name, price) pair and append it as a CSV row.

    BUG FIX: the original opened jingdong.csv and never closed it; `with`
    guarantees the handle is flushed and released. newline='' stops the
    csv module writing blank rows on Windows.
    """
    with open('jingdong.csv', 'a', encoding='utf-8', newline='') as out:
        writer = csv.writer(out, dialect='excel')
        for name, price in zip(names, prices):
            print("商品名称:" + name.get_text().strip(),
                  u"\n商品价格: " + price.get_text().strip())
            writer.writerow([name.get_text().strip(), price.get_text().strip()])
            print('------' * 10)


if __name__ == '__main__':
    keyword = input('请输入需要查找的商品:')
    # JD shows result pages on odd `page` values: page=1,3,5 are pages 1-3.
    for i in range(3):
        url = ('https://search.jd.com/Search?keyword=' + keyword
               + '&enc=utf-8&page=' + str(i * 2 + 1))
        time.sleep(1)  # throttle requests between pages
        fetch(url)
    print('加载完成。')
# BUG FIX: the original did `import selenium` but then referenced the bare
# name `webdriver`, which raised NameError; import it explicitly.
from selenium import webdriver
import csv
import time

from bs4 import BeautifulSoup


def fetch(url):
    """Open *url* in a fresh Chrome driver and pass the rendered page to the parser.

    A new Chrome instance is started per call, driven to *url*, and the
    ajax-rendered page source is parsed with lxml before being handed on.
    NOTE(review): the parser below is defined as `prase` (typo) while this
    calls `parse` — the two names must agree for the script to run.
    """
    driver = webdriver.Chrome()
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    parse(soup)
2.解析
def parse(soup):
    """Extract product-name and price tags from the rendered search page.

    BUG FIX: the original defined this function as `prase`, but fetch()
    calls `parse(soup)`, so the script died with NameError. Renamed to
    `parse`; a `prase` alias is kept for any caller using the old name.
    """
    names = soup.find_all('div', {'class': 'p-name p-name-type-2'})
    prices = soup.find_all('div', {'class': 'p-price'})
    output(names, prices)


# Backward-compatible alias for the original misspelled name.
prase = parse
3. 列表形式输出
#列表形式输出def output(names, prices): out = open('jingdong.csv','a', encoding='utf-8') csv_write = csv.writer(out,dialect='excel') for name,price in zip(names, prices): print("商品名称:" + name.get_text().strip(), u"\n商品价格: " + price.get_text().strip()) text=[name.get_text().strip(),price.get_text().strip()] csv_write.writerow(text) print('------'*10)
4.使用调用入口
if __name__ == '__main__':
    # Local import: only needed when run as a script.
    from urllib.parse import quote_plus

    keyword = input('请输入需要查找的商品:')
    # JD shows result pages on odd `page` values: page=1,3,5 are pages 1-3.
    for i in range(3):
        # BUG FIX: the keyword was concatenated raw into the URL; a Chinese
        # keyword must be percent-encoded (the URL declares enc=utf-8).
        # quote_plus leaves plain-ASCII keywords unchanged.
        url = ('https://search.jd.com/Search?keyword=' + quote_plus(keyword)
               + '&enc=utf-8&page=' + str(i * 2 + 1))
        time.sleep(1)  # throttle requests between pages
        fetch(url)
    print('加载完成。')
总结
京东是动态页面,通过 ajax 请求异步渲染加载,普通的请求解析不了网页、获取不了信息,所以我们这里用 selenium 爬取——它是针对动态网页加载的框架,直接解析就好了,很方便;缺点就是有点慢,毕竟是模拟浏览器进行操作的。下一篇介绍用 fiddler 进行抓包分析,提取 ajax 里面的参数进行爬取。转载地址:http://moywi.baihongyu.com/