python 识别验证码

想调用工信部的备案接口去批量判断下域名是否备案,发现有验证码,而工信部的验证码是特别恶心的,正确的输入都有可能错。于是想着可以调用接口获取验证码去查询。

我这里使用了 requests、pytesseract 和 Pillow(PIL),
另外还需要在系统上安装 tesseract:

brew install  tesseract

代码:
import requests
import pytesseract
from PIL import Image
import json

# Shared HTTP session reused by get_verify_code() so cookies set by the
# server persist across requests. requests.session() is a deprecated
# alias kept only for backwards compatibility; Session() is the
# supported constructor.
requests_session = requests.Session()


def get_verify_code():
    """Download the MIIT captcha image and run OCR on it with pytesseract.

    The image is fetched through the module-level ``requests_session``,
    saved as ``code.jpg`` in the current working directory, and then passed
    to ``pytesseract.image_to_string``. The recognized text is printed.

    Returns:
        The text returned by pytesseract, or ``None`` when the server did
        not answer with HTTP 200.
    """
    url = "http://www.miitbeian.gov.cn/getVerifyCode?50"

    # Write and read the image through the same path. The original read it
    # back from an absolute path on the author's machine, which only worked
    # when the script happened to be run from that directory.
    image_path = 'code.jpg'

    headers = {
        'Accept': "image/webp,image/apng,image/*,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8",
        'Cache-Control': "no-cache",
        'Connection': "keep-alive",
        'DNT': "1",
        'Host': "www.miitbeian.gov.cn",
        'Pragma': "no-cache",
        'Referer': "http://www.miitbeian.gov.cn/icp/publish/query/icpMemoInfo_showPage.action",
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    }

    response = requests_session.get(url, headers=headers)
    if response.status_code != 200:
        return None

    with open(image_path, 'wb') as file:
        file.write(response.content)

    # Close the image file handle once OCR is done.
    with Image.open(image_path) as img:
        return_code = pytesseract.image_to_string(img)

    print(return_code)
    return return_code

# Run the captcha download + OCR only when executed as a script.
if __name__ == '__main__':
    get_verify_code()

结果,默认识别率并不高,没一次是准的

于是换了一个网站——知网(CNKI),这次改用 tesserocr,并在识别之前先把图片转成灰度图再做二值化处理:

import requests
import tesserocr
from PIL import Image
import time
import json

# Shared HTTP session reused by get_verify_code() so cookies set by the
# server persist across requests. requests.session() is a deprecated
# alias kept only for backwards compatibility; Session() is the
# supported constructor.
requests_session = requests.Session()


def get_verify_code():
    """Download the CNKI captcha, binarize it and run OCR with tesserocr.

    The image is fetched through the module-level ``requests_session``,
    saved as ``code.jpg`` in the current working directory, converted to
    grayscale, thresholded to a 1-bit image, shown in the default image
    viewer, and finally passed to ``tesserocr.image_to_text``. The
    recognized text is printed.

    Returns:
        The text returned by tesserocr, or ``None`` when the server did
        not answer with HTTP 200.
    """
    url = "http://my.cnki.net/elibregister/CheckCode.aspx"

    # Write and read the image through the same path. The original read it
    # back from an absolute path on the author's machine.
    image_path = 'code.jpg'

    # Millisecond timestamp sent as the ``id`` query parameter. Scale before
    # truncating so the sub-second part is not lost.
    querystring = {"id": int(time.time() * 1000)}

    # NOTE(review): the Cookie and Postman-Token values look like they were
    # captured from one browser/Postman session and are probably stale;
    # confirm whether the endpoint actually needs them.
    headers = {
        'Accept': "image/webp,image/apng,image/*,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8",
        'Cache-Control': "no-cache",
        'Connection': "keep-alive",
        'Cookie': "Ecp_ClientId=8180828132605418310; Ecp_IpLoginFail=18082859.111.198.102; ASP.NET_SessionId=lmkarndj230wvzo2n5ntzge0; SID=020102; ImageV=2QPV",
        'DNT': "1",
        'Host': "my.cnki.net",
        'Pragma': "no-cache",
        'Referer': "http://my.cnki.net/elibregister/commonRegister.aspx",
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        'Postman-Token': "7e66b937-40bd-4de8-a221-f8a2ef8db407"
    }

    # Issue a single request through the shared session. The original sent
    # two requests and discarded the first, which also passed the query as a
    # JSON string instead of a dict, so the ``id`` parameter never reached
    # the server in usable form.
    response = requests_session.get(url, headers=headers, params=querystring)
    if response.status_code != 200:
        return None

    with open(image_path, 'wb') as file:
        file.write(response.content)

    # Grayscale first; convert() returns a new image, so the file handle can
    # be released immediately afterwards.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('L')

    # Lookup table for the binarization: gray levels below the threshold map
    # to 0, everything else to 1.
    threshold = 127
    table = [0 if level < threshold else 1 for level in range(256)]
    image = image.point(table, '1')

    # Open the binarized image in the default viewer for visual inspection.
    image.show()

    result = tesserocr.image_to_text(image)
    print(result)
    return result

# Script entry point: fetch one captcha and print the OCR result.
if __name__ == "__main__":
    get_verify_code()

这次的识别准确率比较高,如图所示。