Some sites decrypt their data in JavaScript and build the page dynamically, so scraping them with the Requests library alone is awkward. Python, simple and direct as always, offers the selenium library, which drives a real browser: whatever the browser can render, you can capture. The example below shows this in practice.
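The core pattern the script relies on is: let Edge render the page, then hand the rendered DOM to BeautifulSoup. Here is a minimal sketch of that pattern (the URL and the "item" class are placeholders, not taken from the real site; msedgedriver must be installed for webdriver.Edge() to start):

from bs4 import BeautifulSoup
from selenium import webdriver

browser = webdriver.Edge()                     # opens a real Edge window
browser.get("https://www.example.com/page/1")  # JavaScript runs inside the browser
# page_source is the rendered DOM, not the raw HTTP response
soup = BeautifulSoup(browser.page_source, "html.parser")
for item in soup.find_all("li", class_="item"):
    print(item.get_text())
browser.quit()

The full script below applies the same pattern at scale, and additionally uses WebElement.screenshot() to capture lazy-loaded images that a plain HTTP download would miss.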
from bs4 import BeautifulSoup
from PIL import Image
import requests, pymysql, time, re, random, configparser, os, datetime, sys
from selenium import webdriver


def trim(s):
    # collapse all runs of whitespace into single spaces
    r = re.findall(r'[\S]+', s)
    return " ".join(r)


class mytool:
    ip = "127.0.0.1"
    user = "admin"
    passwd = "admin"
    database = "yyy"
    idlist = [15, 975, 978, 991, 993, 994, 995, 996, 1002, 1003, 1004, 1005, 1006,
              1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017,
              1018, 1019, 1020]
    alltypelist = []
    curIndex = 0
    browser = None
    conn = None

    def __init__(self):
        self.conn = pymysql.connect(host=self.ip, user=self.user, password=self.passwd)
        self.alltypelist = self.getallservertype3()
        self.browser = webdriver.Edge()

    def __del__(self):
        self.conn.close()

    def getallservertype3(self):
        rt = []
        self.conn.select_db(self.database)
        cur = self.conn.cursor()  # get a cursor
        cur.execute("select * from yjcode_servertype WHERE admin=3")
        while 1:
            res = cur.fetchone()
            if res is None:  # result set exhausted
                break
            rt.append(res)
        cur.close()
        self.conn.commit()
        return rt

    def getrandomuserid(self):
        i = random.randint(0, len(self.idlist) - 1)
        return self.idlist[i]

    def getservertype12id(self, type1name, type2name):
        rt1 = 0
        rt2 = 0
        self.conn.select_db(self.database)
        # parameterized queries instead of string concatenation (quote-safe)
        cur = self.conn.cursor()
        cur.execute("select id from yjcode_servertype WHERE name1=%s AND name2=%s AND name3=''",
                    (type1name, type2name))
        while 1:
            res = cur.fetchone()
            if res is None:
                break
            rt2 = res[0]
        cur.close()
        self.conn.commit()
        cur = self.conn.cursor()
        cur.execute("select id from yjcode_servertype WHERE name1=%s AND name2='' AND name3=''",
                    (type1name,))
        while 1:
            res = cur.fetchone()
            if res is None:
                break
            rt1 = res[0]
        cur.close()
        self.conn.commit()
        return (rt1, rt2)

    def checkisexist(self, tit):
        rt = 0
        self.conn.select_db(self.database)
        cur = self.conn.cursor()
        cur.execute("select id from yjcode_server WHERE tit=%s", (tit,))
        while 1:
            res = cur.fetchone()
            if res is None:
                break
            rt = res[0]
        cur.close()
        self.conn.commit()
        return rt

    def insertServerdata(self, userid, bh, ty1id, ty2id, ty3id, tit, txt, money):
        if userid < 1 or len(txt) < 1 or len(tit) < 1 or len(money) < 1:
            print("parameter is null")
            return False
        if self.checkisexist(tit) > 0:
            print("is exist")
            return False
        dtime = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        con = pymysql.connect(host=self.ip, database=self.database, charset='utf8',
                              user=self.user, password=self.passwd)
        with con.cursor() as cursor:
            result = cursor.execute(
                'INSERT INTO yjcode_server (userid,bh,ty1id,ty2id,ty3id,zt,sj,lastsj,tit,txt,xsnum,money1,ifxj) '
                'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                (userid, bh, ty1id, ty2id, ty3id, '0', dtime, dtime, tit, txt, '0', money, '0'))
            if result == 1:
                print(tit + ' added successfully!')
            else:
                print("fail")
        con.commit()
        con.close()
        return True

    def getsrcname(self, src):
        # extract the trailing "/name.ext" part of an image URL
        new_imgscrobj = re.finditer(r"/[A-Za-z0-9]+\.(gif|png|jpg|bmp)$", src)
        new_imgscr = ""
        for n in new_imgscrobj:
            new_imgscr = str(n.group())
        return new_imgscr

    def getburimg(self, filepath):
        soup = BeautifulSoup(open(filepath, 'r', encoding='UTF-8'), 'html.parser')
        huzhanurl = soup.find('a', id='huzhanurl').get_text()
        os.system("start D:\\Users\\aihao\\webimg.exe " + huzhanurl)

    def downloadimg(self, imgsrc, savepath):
        r = requests.get(url=imgsrc)
        with open(savepath, 'wb') as file:
            file.write(r.content)
        print('download img complete')
        time.sleep(5)

    def createthumbnail(self, imgsrc):
        newimg1 = imgsrc.replace(".", "-1.")
        print(sys.platform)
        if sys.platform.find("win") >= 0:
            os.system("copy " + imgsrc + " " + newimg1)
        elif sys.platform.find("linux") >= 0:
            os.system("cp " + imgsrc + " " + newimg1)
        im = Image.open(imgsrc)
        imgsize = im.size
        imgth = 0.3  # shrink to 30% for the thumbnail
        im.thumbnail((imgsize[0] * imgth, imgsize[1] * imgth))
        newimg2 = imgsrc.replace(".", "-2.")
        im.save(newimg2, 'JPEG')

    def downloadpage(self, index):
        # e.g. https://www.xxx.com/code/page/21
        print(index)
        os.system("start msedge https://www.xxx.com/code/page/" + str(index))

    def office368(self, pagestr, userid):
        # office-supplies procurement, e.g. https://www.xxx.com/category.php?id=67
        thisurl = "https://www.xxx.com/category.php?id=" + str(pagestr)
        r = requests.get(url=thisurl)
        r.encoding = 'utf-8'
        list_id = []
        list_img = []
        m_index = 0
        soup = BeautifulSoup(r.text, 'html.parser')
        ul = soup.find('ul', class_='list-grid clearfix')
        if not ul:
            print("ul error")
            return
        lis = ul.find_all('li', class_='item')
        for li in lis:
            href = str(li.find('a').get('href'))
            list_id.append(href)
            imgsrc = str(li.find('img').get('data-original'))
            list_img.append(imgsrc)
        for viewid in list_id:
            m_index += 1
            goodsidurl = "https://www.xxx.com/" + viewid
            thisurl = goodsidurl
            r = requests.get(goodsidurl)
            r.encoding = 'utf-8'
            time.sleep(3)
            print(time.asctime(time.localtime(time.time())))
            soup = BeautifulSoup(r.text, 'html.parser')
            tit = soup.find('div', class_='goods-name').get_text()
            money = str(soup.find('font', class_='market-price').get_text()).replace("¥", "").replace(",", "")
            txt = str(soup.find('div', class_='right-con')).replace('阳光易购', "")
            if len(money) > 1 and len(tit) > 1 and len(txt) > 1:
                i = list_id.index(viewid)
                t_imgsrc = list_img[i]
                k = t_imgsrc.rfind('/') + 1
                # e.g. https://img.xxx.com/imgextra/i4/682114580/TB1IIsEXyqAXuNjy1XdXXaYcVXa_!!0-item_pic.jpg_300x300.jpg
                bhname = t_imgsrc[k:].replace('-', '').replace('_!!', "").replace(".jpg_", "").replace(".png_", "")
                bh = str(int(time.time())) + "-" + str(userid)
                xsnum = 0
                if 1 == random.randint(0, 50):
                    xsnum = random.randint(1, 10)
                url = "http://xxxx/zhf/savedata.php"
                data = {"thisurl": thisurl, "xsnum": xsnum, "userid": userid, "bh": bh,
                        'tit': tit, 'money': money, "txt": txt, 'bauto': 0, 'burl': "",
                        "bh_name": bhname, "ty1id": 39, "ty2id": 0, "ty3id": 0}
                res = requests.post(url=url, data=data)
                print(res.text)
                if res.text.find("success") > 1:
                    f_path = 'd:\\upload\\bh\\' + bhname
                    inc = 0
                    # retry the image download up to three times
                    while (not os.path.isfile(f_path)) and inc < 3:
                        inc += 1
                        if not re.match(r"^http", t_imgsrc):
                            t_imgsrc = "https://www.xxx.com" + t_imgsrc
                        print("download img->" + t_imgsrc + " -> " + f_path)
                        self.downloadimg(t_imgsrc, f_path)
                        time.sleep(5)
                    if os.path.isfile(f_path):
                        self.uploadfile(bhname, "d:\\upload\\bh\\" + bhname, userid, bh)
                        time.sleep(5)
                elif res.text.find("isexist!") >= 0:
                    print(thisurl)
                    if m_index > 3:
                        print("next page")
                        time.sleep(5)
                        break

    def uploadfile(self, filename, filePath, userid, bh):
        url = "http://xxx/zhf/uploadfile.php"
        files = {'file': (filename, open(filePath, 'rb'), 'image/jpeg')}
        data = {"userid": userid, "bh": bh}
        r = requests.post(url, data=data, files=files)
        # print(requests.Request('POST', url, files=files).prepare().body.decode('ascii'))  # debug: dump the prepared request body
        print(r.text)

    def ai_getty1id(self, tit):
        # guess the level-1 category id from keywords in the title
        if re.search(r'(网站模板)|(网站源码)', tit):
            return 37
        elif re.search(r'(品牌设)', tit):
            return 152

    def haozhan(self, page, userid):
        # http://www.xxx.com/code?page=1355
        url = "http://www.xxx.com/code?page=" + str(page)
        list_id = []
        list_img = []
        r = requests.get(url)
        r.encoding = 'utf-8'
        try:
            soup = BeautifulSoup(r.text, 'html.parser')
            div = soup.find('div', class_="list_items")
            dts = div.find_all('dt')
            for dt in dts:
                a = dt.find('a')
                list_id.append(str(a.get('href')))
                # imga = dt.find('img')
                # list_img.append(str(imga.get('src')))
        except AttributeError:
            pass
        for viewid in list_id:
            try:
                goodsidurl = "http://www.xxx.com" + viewid
                r = requests.get(goodsidurl)
                r.encoding = 'utf-8'
                time.sleep(3)
                print(time.asctime(time.localtime(time.time())))
                soup = BeautifulSoup(r.text, 'html.parser')
                thisurl = goodsidurl
                tit = soup.find('span', class_='cate').get_text()
                print("-->")
                print(tit)
                time.sleep(180)
                break  # debug stub left in place: print the title, then stop
                money = str(soup.find('span', id='nowmoney').get_text())
                txt_o = soup.find('div', id='bqdiv1')
                imgs = txt_o.find_all('img')
                # rewrite relative image paths to absolute URLs
                for img in imgs:
                    imgscr = str(img.get('src'))
                    if len(imgscr) > 1 and -1 == imgscr.find('http'):
                        img['src'] = "https://www.xxx.com" + imgscr
                txt = str(soup.find('div', id='bqdiv1'))
                if len(money) > 1 and len(tit) > 1 and len(txt) > 1:
                    i = list_id.index(viewid)
                    t_imgsrc = list_img[i]
                    k = t_imgsrc.rfind('/') + 1
                    bhname = t_imgsrc[k:].replace('-', '')
                    bh = str(int(time.time())) + "-" + str(userid)
                    xsnum = 0
                    if 1 == random.randint(0, 50):
                        xsnum = random.randint(1, 10)
                    url = "http://xxx/zhf/savedata.php"
                    data = {"thisurl": thisurl, "xsnum": xsnum, "userid": userid, "bh": bh,
                            'tit': tit, 'money': money, "txt": txt, 'bauto': 0, 'burl': "",
                            "bh_name": bhname}
                    if len(txt) > 1:
                        res = requests.post(url=url, data=data)
                        print(res.text)
                        if res.text.find("success") > 1:
                            f_path = 'd:\\upload\\bh\\' + bhname
                            if not os.path.isfile(f_path):
                                print("download img->" + t_imgsrc + " -> " + f_path)
                                self.downloadimg(t_imgsrc, f_path)
                                time.sleep(5)
                            self.uploadfile(bhname, "d:\\upload\\bh\\" + bhname, userid, bh)
                            time.sleep(5)
                        elif res.text.find("isexist!") >= 0:
                            print(thisurl)
                    else:
                        print("txt is null")
            except AttributeError:
                pass
            except FileNotFoundError:
                pass
            except BaseException:
                pass

    def zhisu(self, page, userid):
        # https://www.xxx.com/product/search_j1v_p1v.html
        url = "https://www.xxx.com/product/search_j1v_p" + str(page) + "v.html"
        list_id = []
        list_img = []
        r = requests.get(url)
        r.encoding = 'gb2312'
        try:
            soup = BeautifulSoup(r.text, 'html.parser')
            div = soup.find('div', class_="biglist")
            uls = div.find_all('ul', class_='u1')
            for ul in uls:
                a = ul.find('a')
                list_id.append(str(a.get('href')))
                imga = ul.find('img')
                list_img.append(str(imga.get('src')))
        except AttributeError:
            pass
        for viewid in list_id:
            try:
                goodsidurl = "https://www.xxx.com" + viewid
                r = requests.get(goodsidurl)
                r.encoding = 'gb2312'
                time.sleep(3)
                print(time.asctime(time.localtime(time.time())))
                soup = BeautifulSoup(r.text, 'html.parser')
                thisurl = goodsidurl
                tit_p = soup.find('div', id='jbmiddle')
                tit = tit_p.find('h1').get_text()
                money = str(soup.find('span', id='nowmoney').get_text())
                txt_o = soup.find('div', id='bqdiv1')
                imgs = txt_o.find_all('img')
                # rewrite relative image paths to absolute URLs
                for img in imgs:
                    imgscr = str(img.get('src'))
                    if len(imgscr) > 1 and -1 == imgscr.find('http'):
                        img['src'] = "https://www.xxx.com" + imgscr
                txt = str(soup.find('div', id='bqdiv1'))
                if len(money) > 1 and len(tit) > 1 and len(txt) > 1:
                    i = list_id.index(viewid)
                    t_imgsrc = list_img[i]
                    k = t_imgsrc.rfind('/') + 1
                    bhname = t_imgsrc[k:].replace('-', '')
                    bh = str(int(time.time())) + "-" + str(userid)
                    xsnum = 0
                    if 1 == random.randint(0, 50):
                        xsnum = random.randint(1, 10)
                    url = "http://xxx/zhf/savedata.php"
                    data = {"thisurl": thisurl, "xsnum": xsnum, "userid": userid, "bh": bh,
                            'tit': tit, 'money': money, "txt": txt, 'bauto': 0, 'burl': "",
                            "bh_name": bhname}
                    if len(txt) > 1:
                        res = requests.post(url=url, data=data)
                        print(res.text)
                        if res.text.find("success") > 1:
                            f_path = 'd:\\upload\\bh\\' + bhname
                            inc = 0
                            while (not os.path.isfile(f_path)) and inc < 3:
                                inc += 1
                                print("download img->" + t_imgsrc + " -> " + f_path)
                                self.downloadimg(t_imgsrc, f_path)
                                time.sleep(5)
                            if os.path.isfile(f_path):
                                self.uploadfile(bhname, "d:\\upload\\bh\\" + bhname, userid, bh)
                                time.sleep(5)
                        elif res.text.find("isexist!") >= 0:
                            print(thisurl)
                    else:
                        print("txt is null")
            except AttributeError:
                pass
            except FileNotFoundError:
                pass
            except BaseException:
                pass

    def w87zx(self, page, userid):
        # https://www.xxx.com/product/search_j1v_p1v.html
        url = "https://www.xxx.com/product/search_j1v_p" + str(page) + "v.html"
        list_id = []
        list_img = []
        r = requests.get(url)
        r.encoding = 'gb2312'
        try:
            soup = BeautifulSoup(r.text, 'html.parser')
            div = soup.find('div', class_="biglist")
            uls = div.find_all('ul', class_='u1')
            for ul in uls:
                a = ul.find('a')
                list_id.append(str(a.get('href')))
                imga = ul.find('img')
                list_img.append(str(imga.get('src')))
        except AttributeError:
            print("AttributeError")
        for viewid in list_id:
            try:
                goodsidurl = "https://www.xxx.com/product/" + viewid
                r = requests.get(goodsidurl)
                r.encoding = 'gb2312'
                time.sleep(3)
                print(time.asctime(time.localtime(time.time())))
                soup = BeautifulSoup(r.text, 'html.parser')
                thisurl = goodsidurl
                tit_p = soup.find('div', id='jbmiddle')
                tit = tit_p.find('h1').get_text()
                money = str(soup.find('span', id='nowmoney').get_text())
                txt_o = soup.find('div', id='bqdiv1')
                imgs = txt_o.find_all('img')
                for img in imgs:
                    imgscr = str(img.get('src'))
                    if len(imgscr) > 1 and -1 == imgscr.find('http'):
                        img['src'] = "https://www.xxx.com" + imgscr
                txt = str(soup.find('div', id='bqdiv1'))
                if len(money) > 1 and len(tit) > 1 and len(txt) > 1:
                    i = list_id.index(viewid)
                    t_imgsrc = list_img[i]
                    k = t_imgsrc.rfind('/') + 1
                    bhname = t_imgsrc[k:].replace('-', '')
                    bh = str(int(time.time())) + "-" + str(userid)
                    xsnum = 0
                    if 1 == random.randint(0, 50):
                        xsnum = random.randint(1, 10)
                    url = "http://xxx/zhf/savedata.php"
                    data = {"thisurl": thisurl, "xsnum": xsnum, "userid": userid, "bh": bh,
                            'tit': tit, 'money': money, "txt": txt, 'bauto': 0, 'burl': "",
                            "bh_name": bhname}
                    if len(txt) > 1:
                        res = requests.post(url=url, data=data)
                        print(res.text)
                        if res.text.find("success") > 1:
                            f_path = 'd:\\upload\\bh\\' + bhname
                            inc = 0
                            while (not os.path.isfile(f_path)) and inc < 3:
                                inc += 1
                                print("download img->" + t_imgsrc + " -> " + f_path)
                                self.downloadimg(t_imgsrc, f_path)
                                time.sleep(5)
                            if os.path.isfile(f_path):
                                self.uploadfile(bhname, "d:\\upload\\bh\\" + bhname, userid, bh)
                                time.sleep(5)
                        elif res.text.find("isexist!") >= 0:
                            print(thisurl)
                    else:
                        print("txt is null")
            except AttributeError:
                print("AttributeError " + viewid)
            except FileNotFoundError:
                print("FileNotFoundError " + viewid)
            except BaseException:
                print("BaseException " + viewid)

    def suibianlu_s(self, page, userid):
        url = "https://www.xxx.com/code_" + str(page)
        list_id = []
        list_img = []
        r = requests.get(url)
        r.encoding = 'utf-8'
        try:
            soup = BeautifulSoup(r.text, 'html.parser')
            lis = soup.find_all('li', class_="clearfix")
            for li in lis:
                goodsid = li.find('a').get('href')
                if len(goodsid):
                    list_id.append(goodsid)
                imgsrc = li.find('img').get('src')
                if len(imgsrc):
                    list_img.append(imgsrc)
        except AttributeError:
            pass
        for goodsidurl in list_id:
            try:
                r = requests.get(goodsidurl)
                r.encoding = 'utf-8'
                time.sleep(3)
                print(time.asctime(time.localtime(time.time())))
                soup = BeautifulSoup(r.text, 'html.parser')
                thisurl = goodsidurl
                bh = str(int(time.time())) + "-" + str(userid)
                tit = soup.find('h1', class_="i30 mb5").get_text()
                money = str(random.randint(10, 30) * 100)  # no price on the page, so pick a random one
                txt = str(soup.find('div', class_='info-con'))
                if len(money) > 1 and len(tit) > 1 and len(txt) > 1:
                    i = list_id.index(goodsidurl)
                    t_imgsrc = list_img[i]
                    s = thisurl.split('/')
                    bhn = (s[len(s) - 1]).replace(".html", "")
                    imgsuffix = t_imgsrc.split(".")
                    bhname = bhn + "." + imgsuffix[len(imgsuffix) - 1]
                    xsnum = 0
                    if 1 == random.randint(0, 50):
                        xsnum = random.randint(1, 10)
                    url = "http://xxx/zhf/savedata.php"
                    data = {"thisurl": thisurl, "xsnum": xsnum, "userid": userid, "bh": bh,
                            'tit': tit, 'money': money, "txt": txt, 'bauto': 0, 'burl': "",
                            "bh_name": bhname}
                    if len(txt) > 1:
                        res = requests.post(url=url, data=data)
                        print(res.text)
                        if res.text.find("success") > 1:
                            f_path = 'd:\\upload\\bh\\' + bhname
                            if not os.path.isfile(f_path):
                                print("download img->" + t_imgsrc + " -> " + f_path)
                                with open(f_path, 'wb') as f:
                                    img = requests.get(t_imgsrc)
                                    f.write(img.content)
                                time.sleep(5)
                            self.uploadfile(bhname, "d:\\upload\\bh\\" + bhname, userid, bh)
                            time.sleep(5)
                        elif res.text.find("isexist!") >= 0:
                            print(thisurl)
                    else:
                        print("txt is null")
            except AttributeError:
                pass
            except FileNotFoundError:
                pass
            except BaseException:
                pass

    def upload(self, filepath, userid):
        soup = BeautifulSoup(open(filepath, 'r', encoding='UTF-8'), 'html.parser')
        thisurl = soup.find('a', id='thiseurl').get_text()
        bh = soup.find('a', id='bh').get_text() + "-" + str(userid)
        bhname = soup.find('a', id='bh_name').get_text()
        tit = soup.find('a', id='tit').get_text()
        money = str(soup.find('h1', id='money').get_text()).replace(',', '')
        # prefer the lazy-load attribute when present, then point both at the local copy
        imgs = soup.find_all('img', id="c_pic")
        for img in imgs:
            imgscr = str(img.get('src'))
            imglayscr = img.get('lay-src')
            if imglayscr and len(imglayscr) > 1:
                img['src'] = "../upload/code" + self.getsrcname(imglayscr)
                img['lay-src'] = "../upload/code" + self.getsrcname(imglayscr)
            else:
                img['src'] = "../upload/code" + self.getsrcname(imgscr)
        txt = str(soup.find('div', id='txt'))
        if len(money) > 1 and len(tit) > 1 and len(txt) > 1:
            print("---->" + time.asctime(time.localtime(time.time())))
            url = "http://xx/zhf/savedata.php"
            data = {"thisurl": thisurl, "userid": userid, "bh": bh, 'tit': tit,
                    'money': money, "txt": txt, 'bauto': 0, 'burl': "", "bh_name": bhname}
            if len(txt) > 1:
                res = requests.post(url=url, data=data)
                print(res.text)
                if res.text.find("success") > 1:
                    self.uploadfile(bhname, "d:\\upload\\bh\\" + bhname, userid, bh)
                    time.sleep(10)
                elif res.text.find("isexist!") >= 0:
                    pass
                # mark the page file as processed
                try:
                    os.rename(filepath, filepath.replace(".html", ".xxxx"))
                except FileExistsError:
                    os.remove(filepath)
        else:
            print("txt is null")

    def auto_huzan(self):
        uploaddir = "D:\\Users\\aihao\\UPLOAD GOODS"
        downloaddir = "D:\\Users\\msi\\Downloads"
        vbpagedir = "D:\\Users\\administrator\\documents\\visual studio 2015\\Projects\\webimg\\webimg\\bin\\Debug\\2020622"
        p_dir = vbpagedir
        dirs = os.listdir(p_dir)
        index = 0
        for file in dirs:
            index += 1
            print("progress " + str(index) + "/" + str(len(dirs)))
            f_p = os.path.join(p_dir, file)
            if os.path.isfile(f_p):
                if file.find(".html") > 1:
                    try:
                        self.upload(f_p, 975)
                        time.sleep(5)
                    except UnicodeDecodeError:
                        print(f_p)

    def auto_suibianlu_s(self):
        maxpage = 224
        cdir = os.path.abspath(os.path.dirname(__file__))
        file = cdir + "\\pyconfig.ini"
        config = configparser.ConfigParser()
        config.read(file)
        pagestr = int(config.get('page', 'start2'))
        print("start in ->" + str(pagestr))
        while 1:
            userid = self.getrandomuserid()
            if pagestr > maxpage:
                print("complete")
                break
            print("current page is:" + str(pagestr) + " " + time.asctime(time.localtime(time.time())))
            time.sleep(5)
            self.suibianlu_s(pagestr, userid)
            pagestr += 1
            # persist the next page so an interrupted run resumes where it left off
            config.set('page', 'start2', str(pagestr))
            with open(file, 'w') as configfile:
                config.write(configfile)

    def auto_w87zx(self):
        maxpage = 1444
        cdir = os.path.abspath(os.path.dirname(__file__))
        file = cdir + "\\pyconfig.ini"
        config = configparser.ConfigParser()
        config.read(file)
        pagestr = int(config.get('page', 'start3'))
        print("start in ->" + str(pagestr))
        while True:
            userid = self.getrandomuserid()
            if pagestr > maxpage:
                print("complete")
                break
            print("current page is:" + str(pagestr) + " " + time.asctime(time.localtime(time.time())))
            time.sleep(5)
            self.w87zx(pagestr, userid)
            pagestr += 1
            config.set('page', 'start3', str(pagestr))
            with open(file, 'w') as configfile:
                config.write(configfile)

    def auto_zhisu(self):
        maxpage = 1316
        cdir = os.path.abspath(os.path.dirname(__file__))
        file = cdir + "\\pyconfig.ini"
        config = configparser.ConfigParser()
        config.read(file)
        pagestr = int(config.get('page', 'start4'))
        print("start in ->" + str(pagestr))
        while True:
            userid = self.getrandomuserid()
            if pagestr > maxpage:
                print("complete")
                break
            print("current page is:" + str(pagestr) + " " + time.asctime(time.localtime(time.time())))
            time.sleep(5)
            self.zhisu(pagestr, userid)
            pagestr += 1
            config.set('page', 'start4', str(pagestr))
            with open(file, 'w') as configfile:
                config.write(configfile)

    def auto_haozan(self):
        maxpage = 1355
        cdir = os.path.abspath(os.path.dirname(__file__))
        file = cdir + "\\pyconfig.ini"
        config = configparser.ConfigParser()
        config.read(file)
        pagestr = int(config.get('page', 'start5'))
        print("start in ->" + str(pagestr))
        while 1:
            userid = self.getrandomuserid()
            if pagestr > maxpage:
                print("complete")
                break
            print("current page is:" + str(pagestr) + " " + time.asctime(time.localtime(time.time())))
            time.sleep(5)
            self.haozhan(pagestr, userid)
            pagestr += 1
            config.set('page', 'start5', str(pagestr))
            with open(file, 'w') as configfile:
                config.write(configfile)

    def auto_office368(self):
        maxpage = 999
        cdir = os.path.abspath(os.path.dirname(__file__))
        file = cdir + "\\pyconfig.ini"
        config = configparser.ConfigParser()
        config.read(file)
        pagestr = int(config.get('page', 'start6'))
        print("start in ->" + str(pagestr))
        while True:
            userid = self.getrandomuserid()
            if pagestr > maxpage:
                print("complete")
                break
            print("current page is:" + str(pagestr) + " " + time.asctime(time.localtime(time.time())))
            time.sleep(5)
            self.office368(pagestr, userid)
            pagestr += 1
            config.set('page', 'start6', str(pagestr))
            with open(file, 'w') as configfile:
                config.write(configfile)

    def zbj_seach(self):
        a3 = self.alltypelist[0]
        id12 = self.getservertype12id(a3[3], a3[4])
        id3 = a3[0]
        name3 = a3[8]
        userid = 975
        searchkey = name3
        url = "https://xxx.zbj.com/search/f/?kw=" + searchkey
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        divlistbox = soup.find('div', class_="witkey-list-grid j-service-provider-wrap")
        a_s = divlistbox.find_all("a", class_='desc', target="_blank")
        alist = []
        for a in a_s:
            a_href = a.get("href")
            if a_href not in alist:
                if not re.match(r'^http', a_href):
                    a_href = "https:" + a_href
                alist.append(a_href)
                r2 = requests.get(url=a_href)
                soup2 = BeautifulSoup(r2.text, "html.parser")
                tit = str(soup2.find('h2', class_="yahei").get_text())
                bh_imgsrc = soup2.find("img", class_="service-case-img").get('src')
                money = str(soup2.find("span", class_="price").get_text())
                txt = str(soup2.find(id="J-description"))
                bh = str(int(time.time())) + "-" + str(userid)
                self.insertServerdata(userid, bh, id12[0], id12[1], id3, tit, txt, money)
                break

    def web_scrollTo_end(self, tm=5):
        # scroll to the bottom step by step so lazy-loaded images get triggered
        js = "return action=document.body.scrollHeight"
        height = 0                                    # current scroll offset
        new_height = self.browser.execute_script(js)  # total page height
        while height < new_height:
            for i in range(height, new_height, 100):
                self.browser.execute_script('window.scrollTo(0, {})'.format(i))
                time.sleep(0.3)
            height = new_height
            time.sleep(0.3)
            new_height = self.browser.execute_script(js)

    def gettitissexist(self, tit):
        # ask the server whether a product with this title already exists
        data = {"tit": tit}
        url = "http://xxx/zhf/savedata.php?method=gettitisexist"
        r = requests.post(url, data)
        return r.text.find("1") >= 0

    def huzan2(self, page):
        # requires msedgedriver on PATH, e.g. D:\Program Files\Python36\Scripts\msedgedriver.exe
        url = "https://www.xxx.com/code/page/" + str(page)
        self.browser.get(url)
        self.browser.maximize_window()
        html = BeautifulSoup(self.browser.page_source, "html.parser")
        list_items = html.find("div", class_="list_items")
        if list_items:
            lias = list_items.find_all("a", class_="pic")
            for a in lias:
                userid = self.getrandomuserid()
                aurl = "https://www.xxx.com" + a.get("href")
                thisurl = aurl
                self.browser.get(aurl)
                html2 = BeautifulSoup(self.browser.page_source, "html.parser")
                if html2.find("div", class_="layui-layer-btn"):
                    print("is error product")
                    continue
                if html2.find("div", class_="layui-layer-content"):
                    # close the layui popup so it does not cover the page
                    self.browser.execute_script("document.getElementsByClassName('layui-layer-ico layui-layer-close layui-layer-close2').valueOf()[0].click()")
                tit = trim(html2.find("div", class_="c_g_tit").get_text())  # title
                print(tit)
                if self.gettitissexist(tit):
                    print("is exist000")
                    continue
                titimg = self.browser.find_element_by_class_name("G-image")
                titimgnamesrc = titimg.get_attribute("src")
                i = titimgnamesrc.rfind('/')
                titimgnam = titimgnamesrc[i:].replace("/", "")
                bhimgname = re.sub(r"(.jpg)|(.png)|(\.*)", "", titimgnam) + ".png"  # showcase image name
                print(bhimgname)
                dirp = os.path.join("D:\\upload\\bh", bhimgname)
                titimg.screenshot(dirp)  # save the showcase image as a screenshot
                time.sleep(2)
                price = html2.find("div", class_="price").get_text()
                money = re.sub(r"¥|,|,", "", price)  # price
                ysweb = ""
                c_g_spe = html2.find("a", class_="demo")
                if c_g_spe:
                    ysweb = c_g_spe.get("href")  # demo site URL
                else:
                    print("no ysweb, continue")
                    continue
                spatt = html2.find("ul", class_="c_r_par")
                lis = spatt.find_all("li")
                tysx = ""
                for li in lis:
                    em = li.find("em").get_text()
                    em = re.sub(r'(\(.*\))|(\+.*)|((.*))|(其他)|(无)', "", em)
                    tysx += str(em) + ","  # attribute description
                self.web_scrollTo_end(1)
                # blank out the fixed side menu so it does not appear in the screenshots
                self.browser.execute_script("if(document.getElementsByClassName('c_r_menu fixed').length){document.getElementsByClassName('c_r_menu fixed').valueOf()[0].innerText=''}")
                ishidden = "<a id='delatttrue' hidde></a>"
                txt = html2.find("div", id="c_aa")
                cpics = self.browser.find_elements_by_id("c_pic")
                drt = str(datetime.datetime.now().strftime("%Y%m%d"))
                ddrt = "D:\\upload\\code" + drt
                if not os.path.isdir(ddrt):
                    os.mkdir(ddrt)
                cpicnamelist = []
                for cpic in cpics:
                    cpici = cpic.get_attribute("src")
                    i = cpici.rfind('/') + 1
                    cpciname = cpici[i:]
                    cpciname = re.sub(r"(.jpg)|(.png)|(\.*)", "", cpciname) + ".png"
                    cpicnamelist.append(cpciname)
                    # screenshot each description image instead of downloading it
                    cpic.screenshot(ddrt + "\\" + cpciname)
                    time.sleep(2.5)
                imgs = txt.find_all("img")
                index = 0
                for img in imgs:
                    del img['lay-src']
                    img['src'] = "/upload/code" + drt + "/" + cpicnamelist[index]
                    index += 1
                txt = ishidden + str(txt)  # product description
                if len(money) > 1 and len(tit) > 1 and len(txt) > 1:
                    bhname = bhimgname
                    bh = str(int(time.time())) + "-" + str(userid)
                    xsnum = 0
                    if 1 == random.randint(0, 50):
                        xsnum = random.randint(1, 10)
                    url = "http://xxx/zhf/savedata.php"
                    data = {"thisurl": thisurl, "xsnum": xsnum, "userid": userid, "bh": bh,
                            'tit': tit, 'money': money, "txt": txt, 'bauto': 0, 'burl': "",
                            "bh_name": bhname, "ysweb": ysweb, "tysx": tysx}
                    if len(txt) > 1:
                        res = requests.post(url=url, data=data)
                        print("requests--->" + res.text)
                        if res.text.find("success") > 1:
                            f_path = 'd:\\upload\\bh\\' + bhname
                            if os.path.isfile(f_path):
                                self.uploadfile(bhname, "d:\\upload\\bh\\" + bhname, userid, bh)
                                time.sleep(5)
                        elif res.text.find("isexist001!") >= 0:
                            print(thisurl)
                    else:
                        print("txt is null")
                print("slp 5 s")
                time.sleep(5)

    def auto_huzan2(self):
        cdir = os.path.abspath(os.path.dirname(__file__))
        file = cdir + "\\pyconfig.ini"
        config = configparser.ConfigParser()
        config.read(file)
        pagestr = int(config.get('page', 'start8'))
        maxpage = 9999
        while 1:
            if pagestr > maxpage:
                print("complete")
                break
            print("--------------------------------------------------------------")
            print("current page is:" + str(pagestr) + " " + time.asctime(time.localtime(time.time())), "slp 5 s")
            time.sleep(5)
            self.huzan2(pagestr)
            pagestr += 1
            config.set('page', 'start8', str(pagestr))
            with open(file, 'w') as configfile:
                config.write(configfile)


if __name__ == "__main__":
    cfun = mytool()
    func = 8
    try:
        if 0:
            cfun.uploadfile("1563516747922.png", "d:\\upload\\bh\\1563516747922.png", 15, '1592801704-975')
        elif 0:
            url = "https://www.xxx.com/upload/26076/1589429413-26076/0034519001589429461tp26076-1.jpg"
            f_path = "C:\\Users\\msi\\Desktop\\picture.jpg"
            cfun.downloadimg(url, f_path)
            cfun.createthumbnail(f_path)
        elif 0:
            pass  # suibianlu_s(105, 975)
        elif 8 == func:
            cfun.auto_huzan2()
        elif 7 == func:
            cfun.zbj_seach()
        elif 6 == func:
            cfun.auto_office368()  # office procurement
        elif 5 == func:
            pass  # haozhan: cfun.auto_haozan()
        elif 4 == func:
            cfun.auto_zhisu()
        elif 3 == func:
            cfun.auto_w87zx()
        elif 2 == func:
            cfun.auto_suibianlu_s()
        elif 1 == func:
            cfun.auto_huzan()  # huzan
        else:
            print("please select func")
    except BaseException:
        pass