想调用工信部的备案接口批量判断域名是否已备案,却发现查询需要验证码,而工信部的验证码特别恶心,即使输入正确也可能被判为错误。于是想着可以调用接口获取验证码,识别后再去查询。
我这里使用了 requests、pytesseract
然后还有tesseract
brew install tesseract
代码
import requests
结果,默认识别率并不高,没一次是准的
于是换了个网站——知网
import requests
import tesserocr
from PIL import Image
import time
import json
# Single HTTP session shared by the whole script; get_verify_code() sends
# its captcha request through it.
requests_session = requests.Session()
def get_verify_code(image_path='code.jpg', *, threshold=127, show=True):
    """Download a CNKI registration captcha, binarize it and OCR it.

    The image is requested through the module-level ``requests_session``,
    written to ``image_path``, read back, converted to grayscale, reduced
    to a 1-bit image with a fixed threshold and handed to
    ``tesserocr.image_to_text``. The recognized text is printed and
    returned.

    Args:
        image_path: File the downloaded captcha is written to and then
            read back from. The same path is used for both steps.
        threshold: Grayscale cut-off (0-255). Levels below it map to 0,
            all other levels map to 1.
        show: When true, open the binarized image with ``Image.show()``,
            as the original script always did.

    Returns:
        The text recognized by tesserocr, or ``None`` when the server did
        not answer with HTTP 200.
    """
    url = "http://my.cnki.net/elibregister/CheckCode.aspx"
    # Presumably a cache-busting timestamp; how the endpoint treats `id`
    # is not visible from this script.
    querystring = {"id": int(time.time()) * 1000}
    # NOTE(review): the Cookie and Postman-Token values look like they were
    # captured from a one-off browser/Postman session and may be stale --
    # confirm whether they are still needed now that a Session is used.
    headers = {
        'Accept': "image/webp,image/apng,image/*,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8",
        'Cache-Control': "no-cache",
        'Connection': "keep-alive",
        'Cookie': "Ecp_ClientId=8180828132605418310; Ecp_IpLoginFail=18082859.111.198.102; ASP.NET_SessionId=lmkarndj230wvzo2n5ntzge0; SID=020102; ImageV=2QPV",
        'DNT': "1",
        'Host': "my.cnki.net",
        'Pragma': "no-cache",
        'Referer': "http://my.cnki.net/elibregister/commonRegister.aspx",
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        'Postman-Token': "7e66b937-40bd-4de8-a221-f8a2ef8db407",
    }

    # One request only, through the shared session. `params` takes the dict
    # itself; requests builds the query string from it.
    response = requests_session.get(url, headers=headers, params=querystring)
    if response.status_code != 200:
        print("captcha request failed, HTTP status:", response.status_code)
        return None

    with open(image_path, 'wb') as file:
        file.write(response.content)

    # Read back exactly the file that was just written.
    with Image.open(image_path) as downloaded:
        gray = downloaded.convert('L')

    # 256-entry lookup table: one output value per possible gray level.
    table = [0 if level < threshold else 1 for level in range(256)]
    image = gray.point(table, '1')
    if show:
        image.show()

    result = tesserocr.image_to_text(image)
    print(result)
    return result
if __name__ == "__main__":
    # Script entry point: fetch one captcha and run recognition on it.
    get_verify_code()
这个的准确率还挺高,如图所示