Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/bin/python
# coding:utf-8
import urllib
import re
import time
from bs4 import BeautifulSoup
import requests
import json
# Shared HTTP session: reuses one connection/cookie jar for every request below.
s = requests.session()
# Default request headers for all scraping calls.
# NOTE(review): 'Content-Type' on a GET request normally describes a request
# body; it looks like 'Accept-Charset'/response charset was intended for the
# gbk-encoded target sites — confirm before changing.
headers2 = {
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'Content-Type': 'text/html; charset=gbk'
}
def genersDicByTxt(name):
    """Read ``<name>.txt`` ("Key: value" per line) into a dict.

    Fixes vs. the original:
    - the value keeps everything after the FIRST ': ' (``split(': ')``
      silently truncated values that themselves contained ': ');
    - the file is closed via a context manager even on error;
    - lines without a ': ' separator are skipped instead of raising
      IndexError.

    :param name: file basename without the ``.txt`` extension
    :return: dict mapping header names to values (newlines stripped)
    """
    dic = {}
    with open(name + ".txt") as inf:
        for line in inf:
            key, sep, value = line.partition(': ')
            if sep:  # skip malformed lines
                dic[key] = value.replace('\n', '')
    return dic
# Fetch the raw HTML content of a page by URL.
def getHtmlContent(url):
    """GET *url* through the shared session and return the raw body.

    Returns bytes (``response.content``); callers needing text must decode
    themselves (the scraped sites serve gbk/gb2312 pages).

    Fix: the original bound the body to a local named ``str``, shadowing
    the builtin.
    """
    response = s.get(url, headers=headers2)
    return response.content
def getA(html):
    """Find anchor hrefs containing ".com" in *html*, download each target
    to a timestamped local ``.html`` file, and return the matched URLs.

    Fix: the original pattern used an unescaped dot (``.+?.com``), which
    matched ANY character followed by "com"; it is now escaped (``\\.com``).

    NOTE: ``urllib.urlretrieve`` is Python 2 only (Python 3 moved it to
    ``urllib.request.urlretrieve``).
    """
    ISOTIMEFORMAT = '%Y-%m-%d %X'
    aReg = re.compile(r'<a .+? href="(.+?\.com.+?)>')
    aArr = re.findall(aReg, html)
    # enumerate replaces the original hand-maintained counter
    for acount, aurl in enumerate(aArr, 1):
        print(aurl)
        stamp = time.strftime(ISOTIMEFORMAT, time.localtime(time.time()))
        urllib.urlretrieve(aurl, filename='./%s %d.html' % (stamp, acount),
                           reporthook=None, data=None)
    return aArr
# Parse all lazy-loaded image URLs out of an HTML document.
# (The tags look like: <img class="lazy" src="XXX.jpeg" ...>)
def getJPGs(html):
    """Return every ``src`` URL of ``<img class="lazy" ...>`` tags whose
    source ends in ``.jpeg``, in document order."""
    lazy_img_pattern = re.compile(r'<img class="lazy" src="(.+?\.jpeg)"')
    return lazy_img_pattern.findall(html)
# Download a single image from imgUrl and save it under the given file name.
def downloadJPG(imgUrl,fileName):
    # NOTE: Python 2 API — under Python 3 this is urllib.request.urlretrieve
    urllib.urlretrieve(imgUrl,fileName)
# Batch-download images; saves to the current directory by default.
def batchDownloadJPGs(imgUrls, path='./'):
    """Download every URL in *imgUrls* via downloadJPG, naming the files
    with a running counter: ``<path>1.jpg``, ``<path>2.jpg``, ..."""
    for index, img_url in enumerate(imgUrls, start=1):
        downloadJPG(img_url, '{0}{1}.jpg'.format(path, index))
# Remove surrounding whitespace and all embedded newlines.
def clearSpace(astring):
    """Return *astring* stripped of leading/trailing whitespace, with
    every remaining newline character removed."""
    stripped = astring.strip()
    return stripped.replace("\n", "")
# Wrapper: fetch a page, archive its ".com" links, and print its image URLs.
def download(url):
    """Fetch *url*, save the pages its anchors point at via getA, then
    print the list of lazy-loaded .jpeg URLs found on the page.
    (Actually downloading the images is currently disabled.)"""
    page = getHtmlContent(url)
    getA(page)
    found_jpgs = getJPGs(page)
    print(found_jpgs)
    # batchDownloadJPGs(found_jpgs)
def main():
    """Download the gallery page three times, pausing 2s between fetches.

    Fixes: ``xrange`` (Python 2 only) replaced with ``range``, which the
    file's print() syntax already assumes; dead ``pass`` removed.

    NOTE(review): the loop index was unused and the URL never changes, so
    every iteration fetches the SAME page — presumably pagination was
    intended; confirm before relying on this.
    """
    for _ in range(1, 4):
        url = 'https://www.woyaogexing.com/tupian/z/zyx/'
        download(url)
        time.sleep(2)
# News/article ("zixun") listing scraper.
def getzixun():
    """Scrape article listing pages 1..45 from maomijiaoyi.com and append
    one JSON object per article to ``zixun.txt``.

    Each record carries: thumbnail URL, absolute article link, title,
    summary text, and publish time (all whitespace-normalized).

    Fixes vs. the original:
    - the output file was opened in append mode once per page and never
      closed (handle leak) — now a ``with`` block;
    - the inner loop variable shadowed the outer page index ``x``;
    - unused ``lis = soup.findAll('li')`` removed.
    """
    for page in range(1, 46):
        page_url = ('http://maomijiaoyi.com/index.php?/xinwenliebiao_3_'
                    + str(page) + '--15.html')
        soup = BeautifulSoup(getHtmlContent(page_url), features="lxml")
        div = soup.find('div', attrs={"class": 'zixun_left'})
        zixun_items = div.findAll('a', attrs={"class": 'zixun_item'})
        print(page)
        with open("zixun" + '.txt', 'a') as f1:
            for item in zixun_items:
                adic = {}
                adic["img"] = 'http://maomijiaoyi.com' + item.find('img').get('src')
                adic["href"] = 'http://maomijiaoyi.com' + item.get('href')
                adic["zixun_title"] = clearSpace(item.find('div', attrs={"class": 'zixun_title'}).get_text())
                adic["zixun_text"] = clearSpace(item.find('div', attrs={"class": 'zixun_text'}).get_text())
                adic["zixun_time"] = clearSpace(item.find('div', attrs={"class": 'zixun_time'}).get_text())
                f1.write(json.dumps(adic) + '\n')
        time.sleep(4)
# "Scrape scores": queries Douban's JSON search endpoint for 动漫 (anime)
# results and prints the first <h3> title and first rating span per page.
def getbifen():
    # Pages are paginated 20 results apart via the `start` query parameter.
    for x in range(1,5):
        start = 20*x
        url = 'https://www.douban.com/j/search?q=%E5%8A%A8%E6%BC%AB&start='+str(start)+'&subtype=item'
        soup =BeautifulSoup(getHtmlContent(url),features="lxml")
        # First result title and its rating (may be None if not present).
        live1 = soup.find('h3')
        live2 = soup.find('span',attrs={"class":'rating_nums'})
        print('----------')
        print(live1)
        print('----------')
        print(live2)
        # NOTE(review): this return exits on the FIRST iteration, so only
        # one page is ever fetched — looks like a debugging early-exit;
        # confirm whether the loop should run to completion.
        return
    # print()
# Script entry point: currently runs only the Douban scraper; the lines
# below it are earlier commented-out experiments kept for reference.
if __name__ == '__main__':
    getbifen()
    # print('')
    # headers2 = genersDicByTxt('header')
    # r=requests.get('http://125.64.90.53:8888/credit/api/v1.0/commitment/back/book/delete?_source=manage',headers=headers2,data = {'id': 18467,'fileId': 25495}).json()
    # print(r)