求
import requests

# Fetch a page and keep its HTML text.
# Bug fix: the URL must carry a scheme -- requests.get("www.baidu.com")
# raises requests.exceptions.MissingSchema.
url = "http://www.baidu.com"
resp = requests.get(url)
htmls = resp.text
beautifulsoup系列
from bs4 import BeautifulSoup

# Parse the fetched HTML with the lxml backend.
soup = BeautifulSoup(htmls, "lxml")
# Find an <a> tag by class, id and an arbitrary attribute.
# Bug fix: the original line had an unbalanced trailing ")".
soup.find("a", class_="title", id="t1", attrs={"alog-action": "qb-ask-uname"})
# Text content of the first <div>.
soup.find("div").get_text()
# Same, with surrounding whitespace removed.
str(soup.find("div").get_text()).strip()
# Iterate over at most 5 <div> tags.
# Bug fix: the original for-statement was missing its colon and
# the loop body had lost its indentation.
for i in soup.find_all("div", limit=5):
    print(i.get_text())
正則系列
rollback({ "response": { "code": "0", "msg": "Success", "dext": "" }, "data": { "count": 3, "page": 1, "article_info": [{ "title": "“小庫里”:適應比賽是首要任務 投籃終會找到節(jié)奏", "url": "http:\/\/sports.qq.com\/a\/20180704\/035378.htm", "time": "2018-07-04 16:58:36", "column": "NBA", "img": "", "desc": "" }, { "title": "首鋼體育助力國家冰球集訓隊 中國冰球聯(lián)賽年底啟動", "url": "http:\/\/sports.qq.com\/a\/20180704\/034698.htm", "time": "2018-07-04 16:34:44", "column": "綜合體育", "img": "", "desc": "" }...] } }) import re # 提取這個json中的每條新聞的title、url #(.*?)為要提取的內(nèi)容,可以在正則字符串中加入.*?表示中間省略若干字符 reg_str = r'"title":"(.*?)",.*?"url":"(.*?)"' pattern = re.compile(reg_str,re.DOTALL) items = re.findall(pattern,htmls) for i in items: tilte = i[0] url = i[1]
過濾html標簽,保留標簽里的內(nèi)容
import re

# Remove HTML tags but keep the text they wrap.
htmls = "<p>abc</p>"
tag_pattern = re.compile(r'<[^>]+>', re.S)
htmls2 = tag_pattern.sub('', htmls)
print(htmls2)  # -> abc
過濾script和style標簽,標簽里的內(nèi)容也需過濾掉
import requests
from bs4 import BeautifulSoup

# Download a news page and strip every <script>/<style> element,
# contents included, before printing the cleaned document.
url = "http://new.qq.com/omn/20180705/20180705A0920X.html"
r = requests.get(url)
htmls = r.text
soup = BeautifulSoup(htmls, "lxml")
for unwanted in soup(["script", "style"]):
    unwanted.extract()
print(soup)
日期、時間的處理
import datetime
import time

# --- Current date ---
today = datetime.date.today()
print(today)  # e.g. 2018-07-05

# --- Current time, formatted ---
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
print(time_now)  # e.g. 2018-07-05 14:20:55

# --- Format a specific unix timestamp (local time) ---
a = 1502691655
time_a = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(a)))
print(time_a)  # 2017-08-14 14:20:55 (value depends on the local timezone)

# --- Date arithmetic ---
# Tomorrow's date
today = datetime.date.today()
tomorrow = today + datetime.timedelta(days=1)
print(tomorrow)  # e.g. 2018-07-06

# Three days ago (datetime, so it keeps the time-of-day part)
today = datetime.datetime.today()
tomorrow = today + datetime.timedelta(days=-3)
print(tomorrow)  # e.g. 2018-07-02 13:37:00.107703

# --- Elapsed time between two datetimes ---
start = "2018-07-03 00:00:00"
time_now = datetime.datetime.now()
b = datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S')
# NOTE: timedelta.seconds is only the in-day remainder (0..86399);
# it ignores whole days, so this is "minutes past the whole days".
minutes = (time_now - b).seconds / 60
days = (time_now - b).days
# Bug fix: use total_seconds() for the full difference -- unlike
# days*24*60 + minutes it is also correct for negative deltas.
all_minutes = (time_now - b).total_seconds() / 60
print(minutes)      # e.g. 821.7666666666667
print(days)         # e.g. 2
print(all_minutes)  # e.g. 3701.7666666666664
base64編碼與解碼
import base64 content = "abc124我是" contents_base64 = base64.b64encode(content.encode('utf-8','ignore')).decode("utf-8") contents = base64.b64decode(contents_base64) url中的中文解碼 import urllib url = "www.baidu.com?wb =%e8%85" result_url = urllib.parse.unquote(soup3)
今天小編就為大家分享一篇關於Python常用爬蟲代碼總結方便查詢的文章,覺得內容挺不錯的,現在分享給大家,具有很好的參考價值,需要的朋友一起跟隨小編來看看吧。
1、beautifulsoup解析頁面
from bs4 import BeautifulSoup

soup = BeautifulSoup(htmltxt, "lxml")
# Three parsers and how each repairs broken markup:
soup = BeautifulSoup("<a></p>", "html.parser")
### A lone start tag is auto-closed; a lone end tag is dropped.
### Result: <a></a>
soup = BeautifulSoup("<a></p>", "lxml")
### Result: <html><body><a></a></body></html>
soup = BeautifulSoup("<a></p>", "html5lib")
### html5lib completes the full document structure.
### Result: <html><head></head><body><a><p></p></a></body></html>
# Look up tags by name, id, class and other attributes.
### class + id + custom attribute alog-action + tag name.
### Bug fix: the original line had an unbalanced trailing ")".
soup.find("a", class_="title", id="t1", attrs={"alog-action": "qb-ask-uname"})
### Read one attribute of a found tag:
pubtime = soup.find("meta", attrs={"itemprop": "datePublished"}).attrs['content']
### All tags whose class is "title" (loop indentation restored):
for i in soup.find_all(class_="title"):
    print(i.get_text())
### At most two tags whose class is "title":
for i in soup.find_all(class_="title", limit=2):
    print(i.get_text())
### get_text() can join nested texts with a separator and strip whitespace.
soup = BeautifulSoup('<p class="title" id="p1"><b> The Dormouses story </b></p><p class="title" id="p1"><b>The Dormouses story</b></p>', "html5lib")
soup.find(class_="title").get_text("|", strip=True)
# Result: The Dormouses story|The Dormouses story
### id of the first tag with class "title":
soup.find(class_="title").get("id")
### Match class names with a regex (relies on the module-level `import re`):
soup.find_all(class_=re.compile("tit"))
### recursive=False searches only the direct children of the tag:
soup = BeautifulSoup('<html><head><title>abc', 'lxml')
soup.html.find_all("title", recursive=False)
2、unicode編碼轉(zhuǎn)中文
# Turn literal "\uXXXX" escape sequences (e.g. from scraped JSON) into
# the characters they encode.
# Bug fix: the original used a plain string literal, which Python had
# already decoded to Chinese characters -- encoding that back to utf-8
# and decoding with unicode_escape mangles the text. The input must
# contain real backslashes, hence the raw string.
content = r"\u65f6\u75c7\u5b85"
content = content.encode("utf8", "ignore").decode('unicode_escape')
3、url encode的解碼與解碼
from urllib import parse

# Percent-encode a Chinese string for use in a URL, then decode it back.
x = "中国你好"
y = parse.quote(x)      # encode
print(y)
x = parse.unquote(y)    # decode: round-trips to the original text
print(x)
4、html轉(zhuǎn)義字符的解碼
from html import unescape

# Decode HTML entities (e.g. "&lt;" -> "<").
# Bug fix: HTMLParser().unescape() was deprecated since Python 3.4 and
# removed in 3.9; html.unescape() is the supported replacement.
# NOTE(review): the example input most likely contained entities such as
# "&lt;div&gt;&lt;p&gt;" before the article was scraped; unescape()
# returns entity-free text unchanged, so the printed result is the same.
htmls = "<div><p>"
txt = unescape(htmls)
print(txt)  # prints <div><p>
5、base64的編碼與解碼
import base64

# Encode: str -> utf-8 bytes -> base64 -> ascii str.
content = "測試轉(zhuǎn)碼文本123"
raw_bytes = content.encode('utf-8', 'ignore')
contents_base64 = base64.b64encode(raw_bytes).decode("utf-8")
# Decode: back to the original utf-8 bytes.
contents = base64.b64decode(contents_base64)
6、過濾emoji表情
def filter_emoji(desstr, restr=''):
    """Replace emoji / astral-plane characters in *desstr* with *restr*.

    Relies on the module-level ``import re``.
    """
    try:
        # Wide (UCS-4) builds: match code points above the BMP directly.
        # Bug fix: the original pattern was '[U00010000-U0010ffff]' --
        # without the backslashes it is just a literal character class,
        # not the \U0001.. escapes, so no emoji were ever matched.
        co = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        # Narrow (UCS-2) builds (pre-3.3): match surrogate pairs instead.
        co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    return co.sub(restr, desstr)
7、完全過濾script和style標簽
import requests
from bs4 import BeautifulSoup

# Remove <script> and <style> elements together with everything inside
# them, then print the cleaned document.
soup = BeautifulSoup(htmls, "lxml")
for unwanted in soup(["script", "style"]):
    unwanted.extract()
print(soup)
8、過濾html的標簽,但保留標簽里的內(nèi)容
import re

# Drop the markup, keep the text between the tags.
htmls = "<p>abc</p>"
dr = re.compile(r'<[^>]+>', re.S)
htmls2 = dr.sub('', htmls)
print(htmls2)  # prints: abc
正則提取內(nèi)容(一般處理json)
rollback({
"response": {
"code": "0",
"msg": "Success",
"dext": ""
},
"data": {
"count": 3,
"page": 1,
"article_info": [{
"title": "“小庫里”:適應比賽是首要任務 投籃終會找到節(jié)奏",
"url": "http://sports.qq.com/a/20180704/035378.htm",
"time": "2018-07-04 16:58:36",
"column": "NBA",
"img": "",
"desc": ""
}, {
"title": "首鋼體育助力國家冰球集訓隊 中國冰球聯(lián)賽年底啟動",
"url": "http://sports.qq.com/a/20180704/034698.htm",
"time": "2018-07-04 16:34:44",
"column": "綜合體育",
"img": "",
"desc": ""
}...]
}
})
import re

# Pull every news title/url pair out of the JSON-like payload shown above.
# (.*?) is a non-greedy capture group; with re.DOTALL the .*? between
# the two groups may also span newlines.
reg_str = r'"title":"(.*?)",.*?"url":"(.*?)"'
pattern = re.compile(reg_str, re.DOTALL)
items = re.findall(pattern, htmls)
# Loop indentation restored; bug fix: "tilte" typo -> "title".
for i in items:
    title = i[0]
    url = i[1]
9、時間操作
# Current date (relies on the module-level datetime/time imports)
today = datetime.date.today()
print(today)  # e.g. 2018-07-05
# Current time, formatted
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
print(time_now)  # e.g. 2018-07-05 14:20:55
# Format a unix timestamp (local time)
a = 1502691655
time_a = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(a)))
print(time_a)  # 2017-08-14 14:20:55 (depends on the local timezone)
# Parse a string into a datetime.
# Bug fix: the original assigned the string to `str` (shadowing the
# builtin, which breaks str(id) further below) and then passed an
# undefined name `st` to strptime.
date_str = "2018-07-01 00:00:00"
parsed = datetime.datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
# Convert a formatted time string to a unix timestamp
time_line = "2018-07-16 10:38:50"
time_tuple = time.strptime(time_line, "%Y-%m-%d %H:%M:%S")
time_line2 = int(time.mktime(time_tuple))
# Tomorrow's date
today = datetime.date.today()
tomorrow = today + datetime.timedelta(days=1)
print(tomorrow)  # e.g. 2018-07-06
# Three days ago
today = datetime.datetime.today()
tomorrow = today + datetime.timedelta(days=-3)
print(tomorrow)  # e.g. 2018-07-02 13:37:00.107703
# Difference between two datetimes
start = "2018-07-03 00:00:00"
time_now = datetime.datetime.now()
b = datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S')
# NOTE: timedelta.seconds is only the in-day remainder (0..86399).
minutes = (time_now - b).seconds / 60
days = (time_now - b).days
# Bug fix: total_seconds() covers the whole delta, negative ones too.
all_minutes = (time_now - b).total_seconds() / 60
print(minutes)      # e.g. 821.7666666666667
print(days)         # e.g. 2
print(all_minutes)  # e.g. 3701.7666666666664
10、數(shù)據(jù)庫操作
import pymysql

# --- Connect ---
conn = pymysql.connect(host='10.0.8.81', port=3306, user='root', passwd='root', db='xxx', charset='utf8')
cur = conn.cursor()
# Bug fix: the insert statement was missing its closing quote.
insert_sql = "insert into tbl_name(id,name,age) values(%s,%s,%s)"
id = 1
name = "like"
age = 26
data_list = []
data = (id, name, age)
# Single-row insert (parameterized, so pymysql escapes the values).
cur.execute(insert_sql, data)
conn.commit()
# Batch insert
data_list.append(data)
cur.executemany(insert_sql, data_list)
conn.commit()
# Escaping a value that contains special characters
data = (id, pymysql.escape_string(name), age)
# Update.
# NOTE(review): building SQL via % string interpolation is
# injection-prone; prefer cur.execute(sql, params) as above.
update_sql = "update tbl_name set content = '%s' where id = " + str(id)
cur.execute(update_sql % (pymysql.escape_string(content)))
conn.commit()
# Batch update, flushed every 500 rows.
update_sql = "UPDATE tbl_recieve SET content = %s ,title = %s , is_spider = %s WHERE id = %s"
update_data = (contents, title, is_spider, one_new[0])
update_data_list.append(update_data)
if len(update_data_list) > 500:
    # Bug fix: the original `try` had no except/finally clause,
    # which is a SyntaxError; roll back on database errors.
    try:
        cur.executemany(update_sql, update_data_list)
        conn.commit()
    except pymysql.MySQLError:
        conn.rollback()
以上就是小編今天為大家總結的一些Python常用的爬蟲代碼。
機器之心編譯
參與:Ellen Han、吳攀
在深度學習中,循環神經網絡(RNN)是一系列善於從序列數據中學習的神經網絡。由於對長期依賴問題的魯棒性,長短期記憶(LSTM)是一類已經有實際應用的循環神經網絡。現在已有大量關於LSTM的文章和文獻,其中推薦如下兩篇:
Goodfellow et.al. 《深度學習》一書第十章:http://www.deeplearningbook.org/
Chris Olah:理解 LSTM:http://colah.github.io/posts/2015-08-Understanding-LSTMs/
已存在大量優(yōu)秀的庫可以幫助你基于LSTM構建機器學習應用。在GitHub中,谷歌的TensorFlow在此文成文時已有超過 50000 次星,表明了其在機器學習從業(yè)者中的流行度。
與此形成對比,相對缺乏的似乎是關于如何基于LSTM建立易于理解的TensorFlow應用的優(yōu)秀文檔和示例,這也是本文嘗試解決的問題。
假設我們想用一個樣本短故事來訓練LSTM預測下一個單詞,伊索寓言:
long ago , the mice had a general council to consider what measures they could take to outwit their common enemy , the cat . some said this , and some said that but at last a young mouse got up and said he had a proposal to make , which he thought would meet the case . you will all agree , said he , that our chief danger consists in the sly and treacherous manner in which the enemy approaches us . now , if we could receive some signal of her approach , we could easily escape from her . i venture , therefore , to propose that a small bell be procured , and attached by a ribbon round the neck of the cat . by this means we should always know when she was about , and could easily retire while she was in the neighbourhood . this proposal met with general applause , until an old mouse got up and said that is all very well , but who is to bell the cat ? the mice looked at one another and nobody spoke . then the old mouse said it is easy to propose impossible remedies .
表1.取自伊索寓言的短故事,其中有112個不同的符號。單詞和標點符號都視作符號。
如果我們將文本中的3個符號以正確的序列輸入LSTM,以1個標記了的符號作為輸出,最終神經(jīng)網(wǎng)絡將學會正確地預測下一個符號(Figure1)。
圖 1.有3個輸入和1個輸出的LSTM單元
嚴格說來,LSTM只能理解輸入的實數(shù)。一種將符號轉(zhuǎn)化為數(shù)字的方法是基于每個符號出現(xiàn)的頻率為其分配一個對應的整數(shù)。例如,上面的短文中有112個不同的符號。如列表2所示的函數(shù)建立了一個有如下條目 [ “,” : 0 ] [ “the” : 1 ], …, [ “council” : 37 ],…,[ “spoke” = 111 ]的詞典。而為了解碼LSTM的輸出,同時也生成了逆序字典。
build_dataset(words):
表 2.建立字典和逆序字典的函數(shù)
類似地,預測值也是一個唯一的整數(shù)值與逆序字典中預測符號的索引相對應。例如:如果預測值是37,預測符號便是“council”。
輸出的生成看起來似乎簡單,但實際上LSTM為下一個符號生成了一個含有112個元素的預測概率向量,并用softmax()函數(shù)歸一化。有著最高概率值的元素的索引便是逆序字典中預測符號的索引值(例如:一個 one-hot 向量)。圖2 給出了這個過程。
圖2.每一個輸入符號被分配給它的獨一無二的整數(shù)值所替代。輸出是一個表明了預測符號在反向詞典中索引的 one-hot 向量。
LSTM模型是這個應用的核心部分。令人驚訝的是,它很易于用TensorFlow實現(xiàn):
def RNN(x, weights, biases):
    """Next-symbol logits for a sequence of n_input integer-coded symbols.

    Bug fix: the scraped source had lost all body indentation; restored.
    Relies on module-level tf/rnn imports and the n_input/n_hidden constants.
    weights/biases are dicts whose 'out' entries project the last LSTM
    output to vocabulary-sized logits.
    """
    # reshape to [1, n_input]
    x = tf.reshape(x, [-1, n_input])
    # Generate a n_input-element sequence of inputs
    # (eg. [had] [a] [general] -> [20] [6] [33])
    x = tf.split(x, n_input, 1)
    # 1-layer LSTM with n_hidden units.
    rnn_cell = rnn.BasicLSTMCell(n_hidden)
    # generate prediction
    outputs, states = rnn.static_rnn(rnn_cell, x, dtype=tf.float32)
    # there are n_input outputs but we only want the last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']
表3.有512個LSTM 單元的網(wǎng)絡模型
最難部分是以正確的格式和順序完成輸入。在這個例子中,LSTM的輸入是一個有3個整數(shù)的序列(例如:1x3 的整數(shù)向量)
網(wǎng)絡的常量、權值和偏差設置如下:
vocab_size = len(dictionary)
表4.常量和訓練參數(shù)
訓練過程中的每一步,3個符號都在訓練數(shù)據(jù)中被檢索。然后3個符號轉(zhuǎn)化為整數(shù)以形成輸入向量。
symbols_in_keys = [ [dictionary[ str(training_data[i])]] for i in range(offset, offset+n_input) ]
表 5.將符號轉(zhuǎn)化為整數(shù)向量作為輸入
訓練標簽是一個位于3個輸入符號之后的 one-hot 向量
symbols_out_onehot = np.zeros([vocab_size], dtype=float)
表6.單向量作為標簽
在轉(zhuǎn)化為輸入詞典的格式后,進行如下的優(yōu)化過程:
_, acc, loss, onehot_pred = session.run([optimizer, accuracy, cost, pred], feed_dict={x: symbols_in_keys, y: symbols_out_onehot})
表 7.訓練過程中的優(yōu)化
精度和損失被累積以監(jiān)測訓練過程。通常50,000次迭代足以達到可接受的精度要求。
...
表 8.一個訓練間隔的預測和精度數(shù)據(jù)示例(間隔1000步)
代價是標簽和softmax()預測之間的交叉熵,它被RMSProp以 0.001的學習率進行優(yōu)化。在本文示例的情況中,RMSProp通常比Adam和SGD表現(xiàn)得更好。
pred = RNN(x, weights, biases)
表 9.損失和優(yōu)化器
LSTM的精度可以通過增加層來改善。
rnn_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(n_hidden),rnn.BasicLSTMCell(n_hidden)])
Listing 10. 改善的LSTM
現(xiàn)在,到了有意思的部分。讓我們通過將預測得到的輸出作為輸入中的下一個符號輸入LSTM來生成一個故事吧。示例輸入是“had a general”,LSTM給出了正確的輸出預測“council”。然后“council”作為新的輸入“a general council”的一部分輸入神經(jīng)網(wǎng)絡得到下一個輸出“to”,如此循環(huán)下去。令人驚訝的是,LSTM創(chuàng)作出了一個有一定含義的故事。
had a general council to consider what measures they could take to outwit their common enemy , the cat . some said this , and some said that but at last a young mouse got
表11.截取了樣本故事生成的故事中的前32個預測值
如果我們輸入另一個序列(例如:“mouse”, “mouse”, “mouse”)但并不一定是這個故事中的序列,那么會自動生成另一個故事。
mouse mouse mouse , neighbourhood and could receive a outwit always the neck of the cat . some said this , and some said that but at last a young mouse got up and said
表 12.并非來源于示例故事中的輸入序列
示例代碼可以在這里找到:https://github.com/roatienza/Deep-Learning-Experiments/blob/master/Experiments/Tensorflow/RNN/rnn_words.py
示例文本的鏈接在這里:https://github.com/roatienza/Deep-Learning-Experiments/blob/master/Experiments/Tensorflow/RNN/belling_the_cat.txt
小貼士:
用整數(shù)值編碼符號容易操作但會丟失單詞的意思。本文中將符號轉(zhuǎn)化為整數(shù)值是用來簡化關于用TensorFlow建立LSTM應用的討論的。更推薦采用Word2Vec將符號編碼為向量。
將輸出表達成單向量是效率較低的方式,尤其當我們有一個現(xiàn)實的單詞量大小時。牛津詞典有超過170,000個單詞,而上面的例子中只有112個單詞。再次聲明,本文中的示例只為了簡化討論。
這里采用的代碼受到了Tensorflow-Examples的啟發(fā):https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/recurrent_network.py
本文例子中的輸入大小為3,看一看當采用其它大小的輸入時會發(fā)生什么吧(例如:4,5或更多)。
每次運行代碼都可能生成不同的結果,LSTM的預測能力也會不同。這是由于精度依賴于初始參數(shù)的隨機設定。訓練次數(shù)越多(超過150,000次)精度也會相應提高。每次運行代碼,建立的詞典也會不同
Tensorboard在調(diào)試中,尤其當檢查代碼是否正確地建立了圖時很有用。
試著用另一個故事測試LSTM,尤其是用另一種語言寫的故事。
*請認真填寫需求信息,我們會在24小時內(nèi)與您取得聯(lián)系。