import json
import time

import requests
from openpyxl import Workbook
def checkarea(area):
    """Return the region code for a one-character province abbreviation.

    Args:
        area: A single-character province abbreviation, e.g. the first
            character of an ICP filing number such as "京".

    Returns:
        "N", "E" or "S" when *area* is listed in the corresponding group,
        otherwise ``None``.
    """
    regions = (
        # "陇" is the abbreviation Gansu ICP filing numbers use; the
        # original "甘" entry is kept so existing callers see no change.
        ("N", ("京", "津", "冀", "晋", "蒙", "辽", "吉", "黑",
               "陕", "甘", "陇", "青", "宁", "新")),
        ("E", ("沪", "苏", "浙", "皖", "赣", "鲁", "豫", "闽")),
        ("S", ("粤", "桂", "琼", "鄂", "湘", "渝", "蜀", "黔", "滇")),
    )
    for code, abbreviations in regions:
        if area in abbreviations:
            return code
    # Anything outside the three groups is left unclassified.
    return None
# Display label written to the sheet for each code returned by checkarea().
_REGION_LABELS = {"N": "华北", "E": "华东", "S": "华南"}


def _fetch_beian_record(domain, timeout=10):
    """Fetch the filing record for *domain* and return the decoded JSON.

    Returns ``None`` after printing a diagnostic when the request fails or
    the response body is not valid JSON, so the caller can skip the domain
    instead of crashing or reusing a previous response.
    """
    api = "http://www.sojson.com/api/beian/"
    headers = {
        "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'cache-control': 'no-cache',
    }
    try:
        # A timeout keeps one unresponsive request from hanging the batch.
        response = requests.get(api + domain, headers=headers, timeout=timeout)
    except requests.HTTPError:
        # Must precede RequestException, which is its base class.
        print('请求异常,HTTP错误')
        return None
    except (requests.RequestException, IndexError, UnicodeEncodeError,
            TimeoutError):
        print("请求异常,无法连接服务器")
        return None
    try:
        return response.json()
    except ValueError:
        # Response.json() raises a ValueError subclass on a non-JSON body.
        print("请求异常,响应不是有效的JSON")
        return None


def _filed_row(info):
    """Build the worksheet row for a record whose ``type`` is 200.

    Reads the keys ``nature``, ``name``, ``domain``, ``nowIcp``,
    ``indexUrl`` and ``sitename`` from *info*; a missing key raises
    ``KeyError``.
    """
    nature = info['nature']
    # Individuals are listed generically; every other nature uses the name.
    name = "个人" if nature == "个人" else info['name']
    icp = info['nowIcp']
    # Guard against an empty filing number before taking its first character.
    region_code = checkarea(icp[0]) if icp else None
    return [
        info['domain'].strip(),
        nature,
        name,
        info['indexUrl'],
        info['sitename'],
        icp,
        # Unclassified regions still get a row, with an empty region cell.
        _REGION_LABELS.get(region_code, ""),
    ]


def CheckDomain(source_path="source.txt", output_path="客户信息1-2.xlsx",
                delay=3):
    """Look up the filing record of every domain listed in a text file.

    Each non-blank line of *source_path* is treated as one domain. The
    result for each domain is appended to a worksheet titled "数据", and the
    workbook is saved to *output_path* after every row so that progress
    survives an interruption.

    Args:
        source_path: Text file with one domain per line.
        output_path: Path of the ``.xlsx`` file to write.
        delay: Seconds to sleep between requests.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "数据"
    ws.append(["域名", "企业/个人", "名称", "主页地址", "网站名称", "备案号", "地区"])

    try:
        with open(source_path) as source:
            for raw_line in source:
                domain = raw_line.strip()
                if not domain:
                    # A blank line would query the bare API URL.
                    continue
                info = _fetch_beian_record(domain)
                if info is not None:
                    if info.get('type') == 200:
                        print(info)
                        row = _filed_row(info)
                        print(row[0])
                    else:
                        row = [domain, '无备案信息', '', '', '']
                    ws.append(row)
                    wb.save(output_path)
                # Pause between requests, as the original did after each line.
                time.sleep(delay)
    finally:
        wb.close()
    print("Job Done!")
# Script entry point: runs the whole lookup job. The call is not behind an
# ``if __name__ == "__main__"`` guard, so it also runs when the module is
# imported.
CheckDomain()