share-image
ESC

python 识别验证码

想调用工信部的备案接口批量判断域名是否备案,发现查询需要验证码,而工信部的验证码特别恶心,即使输入正确也有可能被判为错误。于是想着可以先通过接口获取验证码图片,自动识别之后再去查询。

我这里使用了 requests、pytesseract 和 Pillow(PIL),
另外还需要安装 tesseract 本体:

brew install  tesseract

代码

import requests
import pytesseract
from PIL import Image
import json

# One shared HTTP session, reused by get_verify_code() for its request.
requests_session = requests.Session()


def get_verify_code():
    """Download the MIIT captcha image and OCR it with Tesseract.

    The image bytes are saved as ``code.jpg`` in the current working
    directory, the same file is reopened with Pillow and passed to
    ``pytesseract.image_to_string``. The recognised text is printed.

    Returns:
        The text recognised by Tesseract, or ``None`` when the server did
        not answer with HTTP 200.
    """
    url = "http://www.miitbeian.gov.cn/getVerifyCode?50"
    # Read back exactly the file that is written below, instead of an
    # absolute path that only exists on one machine.
    image_path = 'code.jpg'

    # Browser-like headers copied from a real request to the query page.
    headers = {
        'Accept': "image/webp,image/apng,image/*,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8",
        'Cache-Control': "no-cache",
        'Connection': "keep-alive",
        'DNT': "1",
        'Host': "www.miitbeian.gov.cn",
        'Pragma': "no-cache",
        'Referer': "http://www.miitbeian.gov.cn/icp/publish/query/icpMemoInfo_showPage.action",
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    }

    # A timeout keeps the script from hanging forever on an unresponsive server.
    response = requests_session.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        return None

    with open(image_path, 'wb') as file:
        file.write(response.content)

    # Context manager closes the image file handle once OCR is done.
    with Image.open(image_path) as img:
        return_code = pytesseract.image_to_string(img)

    print(return_code)
    return return_code

# Fetch and recognise one captcha when the file is executed as a script.
if __name__ == "__main__":
    get_verify_code()

结果,默认识别率并不高,没一次是准的

于是换了个网站——知网,识别库也换成了 tesserocr,并在识别前先对图片做灰度化和二值化处理

import requests
import tesserocr
from PIL import Image
import time
import json

# One shared HTTP session, reused by get_verify_code() for its request.
requests_session = requests.Session()


def get_verify_code():
    """Download the CNKI registration captcha, binarise it and OCR it.

    The image is saved as ``code.jpg`` in the current working directory,
    converted to greyscale, thresholded to a 1-bit image, shown with the
    default image viewer and passed to ``tesserocr.image_to_text``. The
    recognised text is printed.

    Returns:
        The text recognised by tesserocr, or ``None`` when the server did
        not answer with HTTP 200.
    """
    url = "http://my.cnki.net/elibregister/CheckCode.aspx"
    # Read back exactly the file that is written below, instead of an
    # absolute path that only exists on one machine.
    image_path = 'code.jpg'
    threshold = 127

    # Millisecond timestamp sent as the ``id`` query parameter. Scale before
    # truncating so the value is not rounded to whole seconds.
    querystring = {"id": int(time.time() * 1000)}

    # NOTE(review): the Cookie and Postman-Token values were captured from a
    # single browser/Postman session and are presumably stale - confirm
    # whether the endpoint still needs them.
    headers = {
        'Accept': "image/webp,image/apng,image/*,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8",
        'Cache-Control': "no-cache",
        'Connection': "keep-alive",
        'Cookie': "Ecp_ClientId=8180828132605418310; Ecp_IpLoginFail=18082859.111.198.102; ASP.NET_SessionId=lmkarndj230wvzo2n5ntzge0; SID=020102; ImageV=2QPV",
        'DNT': "1",
        'Host': "my.cnki.net",
        'Pragma': "no-cache",
        'Referer': "http://my.cnki.net/elibregister/commonRegister.aspx",
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        'Postman-Token': "7e66b937-40bd-4de8-a221-f8a2ef8db407"
    }

    # Single request through the shared session. ``params`` takes the dict
    # itself; requests builds the query string from it.
    response = requests_session.get(
        url, headers=headers, params=querystring, timeout=10
    )
    if response.status_code != 200:
        return None

    with open(image_path, 'wb') as file:
        file.write(response.content)

    # Lookup table for the 256 grey levels: below the threshold -> 0 (black),
    # otherwise -> 1 (white).
    table = [0 if level < threshold else 1 for level in range(256)]

    with Image.open(image_path) as raw_image:
        # Greyscale first, then map through the table into a 1-bit image.
        image = raw_image.convert('L').point(table, '1')

    image.show()
    result = tesserocr.image_to_text(image)
    print(result)
    return result

# return_code = pytesseract.image_to_string(img)
# print(return_code)

# Fetch and recognise one captcha when the file is executed as a script.
if __name__ == "__main__":
    get_verify_code()

这个的识别准确率就比较高了,如图所示

文章作者:阿文
文章链接: https://www.awen.me/post/9831.html
版权声明:本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明来自 阿文的博客
本文于 2018-08-27 发布,已超过半年(2712天),请注意甄别内容是否已过期。