Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Qin-Xy-Auction/spider.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
139 lines (120 sloc)
4.17 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# coding:utf-8 | |
import urllib | |
import re | |
import time | |
from bs4 import BeautifulSoup | |
import requests | |
import json | |
# Shared HTTP session so cookies persist across all requests in this script.
s = requests.session()
# Default request headers: a desktop-Chrome User-Agent plus a GBK content
# type (the scraped sites serve GBK/GB2312-encoded pages).
headers2 = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Content-Type': 'text/html; charset=gbk'
}
def genersDicByTxt(name):
    """Load ``<name>.txt`` (lines of ``key: value``) into a dict.

    Each line is split on the FIRST ``": "`` only, so values may themselves
    contain ``": "`` (the original split on every occurrence, silently
    truncating such values).  The trailing newline is stripped from values.

    Args:
        name: file base name; ``".txt"`` is appended.

    Returns:
        dict mapping key -> value.  Malformed lines (no ``": "``) are skipped
        (the original raised IndexError on them).
    """
    dic = {}
    # with-statement guarantees the handle is closed even if parsing fails
    # (the original relied on an explicit close that a raise would skip).
    with open(name + ".txt") as inf:
        for line in inf:
            parts = line.split(': ', 1)
            if len(parts) == 2:
                dic[parts[0]] = parts[1].replace('\n', '')
    return dic
# Fetch the raw HTML content of a page by URL.
def getHtmlContent(url):
    """GET *url* with the shared session/headers and return the raw body.

    Returns bytes (requests' ``Response.content``); callers handle any
    GBK/GB2312 decoding themselves.
    """
    # Renamed local: the original bound the response body to ``str``,
    # shadowing the builtin.
    resp = s.get(url, headers=headers2)
    # NOTE(review): no status-code check -- a 4xx/5xx error page body is
    # returned as-is.
    return resp.content
def getA(html):
    """Find anchor URLs containing ".com" in *html* and download each one.

    Each matched target is saved as ``./<timestamp> <n>.html`` in the
    current directory.  Returns the list of matched URL fragments.

    NOTE(review): ``urllib.urlretrieve`` is the Python 2 API (this file also
    uses ``xrange``); Python 3 would need ``urllib.request.urlretrieve``.
    """
    ISOTIMEFORMAT = '%Y-%m-%d %X'
    # The '.' before "com" is now escaped so a literal ".com" is required;
    # the original unescaped '.' matched any character (e.g. "xcom").
    aReg = re.compile(r'<a .+? href="(.+?\.com.+?)>')
    aArr = re.findall(aReg, html)
    acount = 1
    for aurl in aArr:
        print(aurl)
        # Timestamp is computed per file, as in the original.
        stamp = time.strftime(ISOTIMEFORMAT, time.localtime(time.time()))
        urllib.urlretrieve(aurl, filename='./%s %d.html' % (stamp, acount),
                           reporthook=None, data=None)
        acount = acount + 1
    return aArr
# Extract all lazy-loaded jpeg image URLs from an HTML page.
# (Pattern targets <img class="lazy" src="....jpeg"> tags.)
def getJPGs(html):
    """Return every lazy-loaded .jpeg URL found in *html*, in page order."""
    pattern = re.compile(r'<img class="lazy" src="(.+?\.jpeg)"')
    return [match.group(1) for match in pattern.finditer(html)]
# Download the image at imgUrl and save it under the given file name.
def downloadJPG(imgUrl, fileName):
    # NOTE(review): urllib.urlretrieve is the Python 2 API; on Python 3 this
    # would need urllib.request.urlretrieve.
    urllib.urlretrieve(imgUrl, fileName)
# Batch-download images, saving to `path` (default: current directory)
# as 1.jpg, 2.jpg, ...
def batchDownloadJPGs(imgUrls, path='./'):
    """Download every URL in *imgUrls*, numbering the files sequentially."""
    for num, img_url in enumerate(imgUrls, 1):
        downloadJPG(img_url, path + '{0}.jpg'.format(num))
# Normalize scraped text: drop surrounding whitespace and embedded newlines.
def clearSpace(astring):
    """Return *astring* stripped of outer whitespace with all newlines removed."""
    return "".join(astring.strip().split("\n"))
# Wrapper: fetch one page, crawl its anchors, and report the jpegs it lists.
def download(url):
    """Scrape *url*: download its linked pages and print any .jpeg URLs found."""
    page = getHtmlContent(url)
    getA(page)
    found = getJPGs(page)
    print(found)
    # batchDownloadJPGs(found)
def main():
    """Download the gallery page three times, pausing 2s between passes."""
    # Loop-invariant URL hoisted out of the loop; range() replaces xrange(),
    # which does not exist on Python 3 (range is identical here on Python 2).
    url = 'https://www.woyaogexing.com/tupian/z/zyx/'
    for _ in range(1, 4):
        download(url)
        time.sleep(2)
# News ("资讯") crawler.
def getzixun():
    """Crawl 45 pages of cat-news listings; append one JSON object per
    article to ``zixun.txt``.

    Each record holds the image URL, article href, title, summary text and
    timestamp, with whitespace/newlines stripped via ``clearSpace``.
    Sleeps 4s between pages to be polite to the server.
    """
    for page in range(1, 46):
        soup = BeautifulSoup(
            getHtmlContent('http://maomijiaoyi.com/index.php?/xinwenliebiao_3_' + str(page) + '--15.html'),
            features="lxml")
        div = soup.find('div', attrs={"class": 'zixun_left'})
        zixun_items = div.findAll('a', attrs={"class": 'zixun_item'})
        print(page)
        # with-statement fixes the original's leaked file handle (opened
        # once per page in append mode and never closed).  The inner loop
        # variable is renamed: the original reused `x`, shadowing the page
        # counter.
        with open("zixun" + '.txt', 'a') as f1:
            for item in zixun_items:
                adic = {}
                adic["img"] = 'http://maomijiaoyi.com' + item.find('img').get('src')
                adic["href"] = 'http://maomijiaoyi.com' + item.get('href')
                adic["zixun_title"] = clearSpace(item.find('div', attrs={"class": 'zixun_title'}).get_text())
                adic["zixun_text"] = clearSpace(item.find('div', attrs={"class": 'zixun_text'}).get_text())
                adic["zixun_time"] = clearSpace(item.find('div', attrs={"class": 'zixun_time'}).get_text())
                f1.write(json.dumps(adic) + '\n')
        time.sleep(4)
# "Scrape scores" -- actually queries the Douban search API for 动漫 (anime).
def getbifen():
    """Fetch one Douban search page and print its first <h3> and rating span.

    NOTE(review): the unconditional return at the end of the first pass means
    only x == 1 (start=20) is ever fetched, despite the range(1, 5) loop.
    Preserved as-is.
    """
    for x in range(1, 5):
        start = 20 * x
        url = ('https://www.douban.com/j/search?q=%E5%8A%A8%E6%BC%AB&start='
               + str(start) + '&subtype=item')
        soup = BeautifulSoup(getHtmlContent(url), features="lxml")
        first_h3 = soup.find('h3')
        first_rating = soup.find('span', attrs={"class": 'rating_nums'})
        print('----------')
        print(first_h3)
        print('----------')
        print(first_rating)
        return
# print()
if __name__ == '__main__':
    # Entry point: currently runs only the Douban search scraper.
    getbifen()
    # print('')
    # headers2 = genersDicByTxt('header')
    # r=requests.get('http://125.64.90.53:8888/credit/api/v1.0/commitment/back/book/delete?_source=manage',headers=headers2,data = {'id': 18467,'fileId': 25495}).json()
    # print(r)