Using Python with Selenium WebDriver to scrape data from JS-decrypted, dynamically rendered pages

        Some sites decrypt their data with JavaScript and build the page in the browser, which makes fetching the data directly with the Requests library awkward. Python, simple and blunt as ever, offers the selenium library, which drives a real browser: whatever you can see, you can get. The example below shows this in practice.
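
A minimal sketch of the pattern first (assumptions: msedgedriver is installed and on PATH, and the URL is a placeholder): start a real browser, give the page's JavaScript time to decrypt and render, then hand the finished DOM to BeautifulSoup.

from bs4 import BeautifulSoup
from selenium import webdriver
import time

browser = webdriver.Edge()                      # real Edge window, driven by selenium
browser.get("https://www.example.com/js-page")  # placeholder URL
time.sleep(3)                                   # crude wait for the page's JS to finish
soup = BeautifulSoup(browser.page_source, "html.parser")
print(soup.title.get_text() if soup.title else "no title")
browser.quit()

The full script below applies the same idea at scale, mixing plain Requests for static pages with selenium for the JS-rendered ones.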

# Full script: one crawl method per source site, plus MySQL and upload helpers.
from bs4 import BeautifulSoup
from PIL import Image
import requests,pymysql,time,re,random,configparser,os,datetime,sys
from selenium import webdriver


def trim(s):
    # Collapse every run of whitespace into a single space.
    r = re.findall(r'[\S]+', s)
    return " ".join(r)

class mytool:
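    # Scraper toolbox: MySQL lookups, image download/upload, and one crawl method per source site.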
    ip="127.0.0.1"
    user="admin"
    passwd="admin"
    database="yyy"
    idlist=[15,975,978,991,993,994,995,996,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020]
    alltypelist=[]
    curIndex=0
    browser =  None
    conn=None
    def __init__(self):
        # Open the DB connection, cache the level-3 category list, and start an Edge browser.
        self.conn = pymysql.connect(host=self.ip, user=self.user, password=self.passwd)
        self.alltypelist=self.getallservertype3()
        self.browser=webdriver.Edge()
    def __del__(self):
        self.conn.close()    
    def getallservertype3(self):  
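        # Fetch every yjcode_servertype row with admin=3.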
        rt=[]        
        self.conn.select_db(self.database)
        cur=self.conn.cursor()  # get a cursor
        cur.execute("select * from yjcode_servertype WHERE admin=3")
        while 1:
            res=cur.fetchone()
            if res is None:
                # result set exhausted
                break
            rt.append(res)
        cur.close()
        self.conn.commit()
        
        return rt
    def getrandomuserid(self):
        i=random.randint(0,len(self.idlist)-1)
        return self.idlist[i]          
    def getservertype12id(self,type1name,type2name):
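        # Resolve the level-1 and level-2 category ids for the given names; returns (ty1id, ty2id).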
        rt1=0
        rt2=0     
       
        self.conn.select_db(self.database)      

        cur=self.conn.cursor()  # get a cursor
        cur.execute("select id from yjcode_servertype WHERE name1=%s AND name2=%s AND name3=''", (type1name, type2name))  # parameterized query avoids SQL injection
        while 1:
            res=cur.fetchone()
            if res is None:
                # result set exhausted
                break
            rt2=res[0]
        cur.close()
        self.conn.commit()

        cur=self.conn.cursor()  # get a cursor
        cur.execute("select id from yjcode_servertype WHERE name1=%s AND name2='' AND name3=''", (type1name,))
        while 1:
            res=cur.fetchone()
            if res is None:
                # result set exhausted
                break
            rt1=res[0]
        cur.close()
        self.conn.commit()


        
        return (rt1,rt2)

    def checkisexist(self,tit):
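        # Return the id of an existing yjcode_server row with this title, or 0 if there is none.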
        rt=0
        
        self.conn.select_db(self.database)
        cur=self.conn.cursor()  # get a cursor
        cur.execute("select id from yjcode_server WHERE tit=%s", (tit,))
        while 1:
            res=cur.fetchone()
            if res is None:
                # result set exhausted
                break
            rt=res[0] 
        cur.close()
        self.conn.commit()
       
        return rt
    def insertServerdata(self,userid,bh,ty1id,ty2id,ty3id,tit,txt,money):      
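        # Insert one product row; bails out when a required field is empty or the title already exists.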
        if userid<1 or len(txt)<1 or len(tit)<1 or len(money)<1:
            print("parament is null")
            return False
        if self.checkisexist(tit)>0:
            print("is exist")
            return False
        dtime=str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        con = pymysql.connect(host=self.ip,database=self.database, charset='utf8',user=self.user, password=self.passwd)
               
 
        with con.cursor() as cursor:               
            result = cursor.execute(
                'INSERT INTO yjcode_server (userid,bh,ty1id,ty2id,ty3id,zt,sj,lastsj,tit,txt,xsnum,money1,ifxj) \
                                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s )' ,
                (userid,bh,ty1id,ty2id,ty3id,'0',dtime,dtime,tit,txt,'0',money,'0')
            )                
        if result == 1:
            print(tit + ' inserted successfully!')
        else:
            print("fail")
        con.commit()
        con.close()
        return True
    
    def getsrcname(self,src):
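        # Extract the trailing "/name.ext" part of an image URL.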
        new_imgscrobj=re.finditer(r"/[A-Za-z0-9]+\.(gif|png|jpg|bmp)$",src)
        new_imgscr=""
        for n in new_imgscrobj:
            new_imgscr=str(n.group())
        return  new_imgscr
    def getburimg(self,filepath) :
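        # Read a saved page, pull out its source URL, and open it with the external webimg.exe helper.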
        soup = BeautifulSoup(open(filepath,'r',encoding='UTF-8'), 'html.parser')
        #print (soup.prettify())
        huzhanurl=soup.find('a',id='huzhanurl').get_text()
        os.system("start D:\\Users\\aihao\\webimg.exe "+huzhanurl)
    def downloadimg(self,imgsrc,savepath):
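        # Fetch an image over HTTP and write it to savepath.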
        #print(imgsrc,savepath)
        r=requests.get(url=imgsrc)
        with open(savepath, 'wb') as file:
            file.write(r.content)
        print('download img complete')
        time.sleep(5)
        #file.close()
    def createthumbnail(self,imgsrc):
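        # Keep a copy of the image as *-1.* and save a 30%-scale JPEG thumbnail as *-2.*.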
        newimg1=imgsrc.replace(".","-1.")
        print(sys.platform)
        if sys.platform.find("win")>=0:
            sysstr="copy "+imgsrc+" "+newimg1
            os.system(sysstr)
        elif  sys.platform.find("linux")>=0:
            sysstr="cp "+imgsrc+" "+newimg1
            os.system(sysstr)    
        im=Image.open(imgsrc)   
        imgsize=im.size 
        imgth=0.3    
        im.thumbnail((imgsize[0]*imgth,imgsize[1]*imgth))
        #print(im.format,im.size,im.mode)
        newimg2=imgsrc.replace(".","-2.")
        im.save(newimg2,'JPEG')
    def downloadpage(self,index):
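        # Open a listing page in Edge via the shell (for manual capturing).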
        #https://www.xxx.com/code/page/21
        print(index)
        os.system("start msedge https://www.xxx.com/code/page/"+str(index))

    def office368(self,pagestr,userid):
        # Office-procurement site: grab the listing page, then fetch each product page and post it to the backend.
        #https://www.xxx.com/category.php?id=67
        thisurl="https://www.xxx.com/category.php?id="+str(pagestr)
        r=requests.get(url=thisurl)
        r.encoding='utf-8'
        #print(r.text)
        list_id=[]
        list_img=[]
        m_index=0
        soup = BeautifulSoup(r.text,'html.parser')
        ul=soup.find('ul',class_='list-grid clearfix')
        if not ul:
            print("ul error ")
            return
        lis=ul.find_all('li',class_='item')
        for li in lis:
            href=str(li.find('a').get('href'))
            list_id.append(href)
            imgsrc=str(li.find('img').get('data-original'))
            list_img.append(imgsrc)
            #print(href,imgsrc)
        for viewid in list_id:
            m_index+=1            
            goodsidurl="https://www.xxx.com/"+viewid
            thisurl=goodsidurl
            r=requests.get(goodsidurl)
            r.encoding='utf-8'
            time.sleep(3)
            print( time.asctime( time.localtime(time.time()) ))
            soup = BeautifulSoup(r.text,'html.parser')

            tit=soup.find('div',class_='goods-name').get_text()
            #print(tit)
            #print("..")
            money=str(soup.find('font',class_='market-price').get_text()).replace("¥","").replace(",","")
            #print(money)
            #print(tit.get_text())
            #txt_o=soup.find('div',id='bqdiv1')
            #imgs=txt_o.find_all('img')
            #for img in imgs:
            #    imgscr=str(img.get('src'))
            #    if len(imgscr)>1 and -1==imgscr.find('http') :
            #        img['src']="https://www.xxx.com"+imgscr

            txt=str(soup.find('div',class_='right-con')).replace('阳光易购',"")

            #time.sleep(10000000)
            if(len(money)>1 and len(tit)>1 and len(txt)>1 ):
                i=list_id.index(viewid)
                t_imgsrc=list_img[i]
                k=t_imgsrc.rfind('/')+1
                #https://img.xxx.com/imgextra/i4/682114580/TB1IIsEXyqAXuNjy1XdXXaYcVXa_!!0-item_pic.jpg_300x300.jpg
                bhname=t_imgsrc[k:].replace('-','').replace('_!!',"").replace(".jpg_","").replace(".png_","")
                bh=str(int(time.time()))+"-"+str(userid)
                xsnum=0
                if(1==random.randint(0,50)):
                    xsnum=random.randint(1,10)
                url = "http://xxxx/zhf/savedata.php"
                data = {"thisurl":thisurl,"xsnum":xsnum,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname,"ty1id":39,"ty2id":0,"ty3id":0}
                res = requests.post(url=url,data=data)
                print(res.text)
                if(res.text.find("success")>1):
                    f_path='d:\\upload\\bh\\'+bhname
                    inc=0
                    while((not os.path.isfile(f_path)) and inc<3 ):
                        inc+=1
                        if not re.match(r"^http",t_imgsrc):
                            t_imgsrc="https://www.xxx.com"+t_imgsrc
                        print("download img->"+t_imgsrc+" -> "+f_path)
                        self.downloadimg(t_imgsrc,f_path)
                        time.sleep(5)
                    if  os.path.isfile(f_path):
                        self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
                    time.sleep(5)

                    # os.rename(filepath,filepath.replace(".html",".xxxx"))
                elif (res.text.find("isexist!")>=0):
                    print(thisurl)

                if m_index>3:
                    print("next page")
                    time.sleep(5)
                    break


    def uploadfile(self,filename,filePath,userid,bh) :
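        # POST the image file plus its owning userid/bh to the backend upload endpoint.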
        url="http://xxx/zhf/uploadfile.php"
        files = {'file': (filename, open(filePath, 'rb'),'image/jpeg')}
        data={"userid":userid,"bh":bh}
        r= requests.post(url, data=data, files=files)
        #print(requests.Request('POST', url,files=files).prepare().body.decode('ascii'))  # debug: dump the prepared multipart body
        print(r.text)

    def ai_getty1id(self,tit):     
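        # Crude keyword-to-category mapping on the title; returns None when nothing matches.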
        if re.search(r'(网站模板)|(网站源码)',tit):
            return 37
        elif re.search(r'(品牌设)',tit):
            return 152    


    def haozhan(self,page,userid):
        #http://www.xxx.com/code?page=1355
        url="http://www.xxx.com/code?page="+str(page)
        list_id=[]
        list_img=[]
        r=requests.get(url)
        r.encoding='utf-8'
        try:
            soup = BeautifulSoup(r.text,'html.parser')
            div=soup.find('div',class_="list_items" )
            #print (div.get_text())
            dts=div.find_all('dt')
            for dt in dts:
                a=dt.find('a')
                #print(a.get('href'))
                list_id.append(str(a.get('href')))
                #print(bhname)
                # NOTE: image collection is commented out, so list_img stays empty and the
                # list_img[i] lookup below would raise IndexError (silenced by the blanket
                # except at the end of this method).
                #imga=ul.find('img')
                #list_img.append(str(imga.get('src')))
                #print(imga)
                #print(imga.get('src'))

                #print("-->"+li.get_text())
        except AttributeError:
            pass

        for viewid in list_id:
            #print(goodsidurl)
            try:
                goodsidurl="http://www.xxx.com"+viewid
                #print(goodsidurl)
                r=requests.get(goodsidurl)
                r.encoding='utf-8'
                time.sleep(3)
                print( time.asctime( time.localtime(time.time()) ))
                soup = BeautifulSoup(r.text,'html.parser')
                thisurl=goodsidurl


                tit=soup.find('span',class_='cate').get_text()

                print("-->")
                print(tit)
                time.sleep(180)

                break
                #print("..")
                money=str(soup.find('span',id='nowmoney').get_text())
                #print(money)
                #print(tit.get_text())
                txt_o=soup.find('div',id='bqdiv1')
                imgs=txt_o.find_all('img')
                for img in imgs:
                    imgscr=str(img.get('src'))
                    if len(imgscr)>1 and -1==imgscr.find('http') :
                        img['src']="https://www.xxx.com"+imgscr

                txt=str(soup.find('div',id='bqdiv1'))

                #time.sleep(10000000)
                if(len(money)>1 and len(tit)>1 and len(txt)>1 ):
                    i=list_id.index(viewid)
                    t_imgsrc=list_img[i]
                    k=t_imgsrc.rfind('/')+1
                    bhname=t_imgsrc[k:].replace('-','')
                    bh=str(int(time.time()))+"-"+str(userid)
                    xsnum=0
                    if(1==random.randint(0,50)):
                        xsnum=random.randint(1,10)
                    url = "http://xxx/zhf/savedata.php"
                    data = {"thisurl":thisurl,"xsnum":xsnum,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname}
                    if len(txt)>1:
                        res = requests.post(url=url,data=data)
                        print(res.text)
                        if(res.text.find("success")>1):
                            f_path='d:\\upload\\bh\\'+bhname
                            if not os.path.isfile(f_path):
                                print("download img->"+t_imgsrc+" -> "+f_path)
                                self.downloadimg(t_imgsrc,f_path)
                                time.sleep(5)
                            self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
                            time.sleep(5)

                            # os.rename(filepath,filepath.replace(".html",".xxxx"))
                        elif (res.text.find("isexist!")>=0):
                            print(thisurl)
                    else:
                        print("txt is null")
            except AttributeError:
                pass
            except FileNotFoundError:
                pass
            except BaseException:
                # blanket catch keeps the crawl alive but hides real errors
                pass

    def zhisu(self,page,userid):
        #https://www.xxx.com/product/search_j1v_p1v.html
        url="https://www.xxx.com/product/search_j1v_p"+str(page)+"v.html"
        list_id=[]
        list_img=[]
        r=requests.get(url)
        r.encoding='gb2312'
        try:
            soup = BeautifulSoup(r.text,'html.parser')
            div=soup.find('div',class_="biglist" )
            #print (div.get_text())
            uls=div.find_all('ul',class_='u1')
            for ul in uls:
                a=ul.find('a')
                #print(a.get('href'))
                list_id.append(str(a.get('href')))
                #print(bhname)
                imga=ul.find('img')
                list_img.append(str(imga.get('src')))
                #print(imga)
                #print(imga.get('src'))

                #print("-->"+li.get_text())
        except AttributeError:
            pass

        for viewid in list_id:
            #print(goodsidurl)
            try:
                goodsidurl="https://www.xxx.com"+viewid
                #print(goodsidurl)
                r=requests.get(goodsidurl)
                r.encoding='gb2312'
                time.sleep(3)
                print( time.asctime( time.localtime(time.time()) ))
                soup = BeautifulSoup(r.text,'html.parser')
                thisurl=goodsidurl

                tit_p=soup.find('div',id='jbmiddle')
                tit=tit_p.find('h1').get_text()
                #print(tit)
                #print("..")
                money=str(soup.find('span',id='nowmoney').get_text())
                #print(money)
                #print(tit.get_text())
                txt_o=soup.find('div',id='bqdiv1')
                imgs=txt_o.find_all('img')
                for img in imgs:
                    imgscr=str(img.get('src'))
                    if len(imgscr)>1 and -1==imgscr.find('http') :
                        img['src']="https://www.xxx.com"+imgscr

                txt=str(soup.find('div',id='bqdiv1'))

                #time.sleep(10000000)
                if(len(money)>1 and len(tit)>1 and len(txt)>1 ):
                    i=list_id.index(viewid)
                    t_imgsrc=list_img[i]
                    k=t_imgsrc.rfind('/')+1
                    bhname=t_imgsrc[k:].replace('-','')
                    bh=str(int(time.time()))+"-"+str(userid)
                    xsnum=0
                    if(1==random.randint(0,50)):
                        xsnum=random.randint(1,10)
                    url = "http://xxx/zhf/savedata.php"
                    data = {"thisurl":thisurl,"xsnum":xsnum,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname}
                    if len(txt)>1:
                        res = requests.post(url=url,data=data)
                        print(res.text)
                        if(res.text.find("success")>1):
                            f_path='d:\\upload\\bh\\'+bhname
                            inc=0
                            while((not os.path.isfile(f_path)) and inc<3 ):
                                inc+=1
                                print("download img->"+t_imgsrc+" -> "+f_path)
                                self.downloadimg(t_imgsrc,f_path)
                                time.sleep(5)
                            if  os.path.isfile(f_path):
                                self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
                            time.sleep(5)

                            # os.rename(filepath,filepath.replace(".html",".xxxx"))
                        elif (res.text.find("isexist!")>=0):
                            print(thisurl)
                    else:
                        print("txt is null")
            except AttributeError:
                pass
            except FileNotFoundError:
                pass
            except BaseException:
                pass

    def w87zx(self,page,userid):
        #https://www.xxx.com/product/search_j1v_p1v.html
        url="https://www.xxx.com/product/search_j1v_p"+str(page)+"v.html"
        list_id=[]
        list_img=[]
        r=requests.get(url)
        r.encoding='gb2312'
        try:
            soup = BeautifulSoup(r.text,'html.parser')
            div=soup.find('div',class_="biglist" )
            #print (div.get_text())
            uls=div.find_all('ul',class_='u1')
            for ul in uls:
                a=ul.find('a')
                #print(a.get('href'))
                list_id.append(str(a.get('href')))
                #print(bhname)
                imga=ul.find('img')
                list_img.append(str(imga.get('src')))
                #print(imga)
                #print(imga.get('src'))

                #print("-->"+li.get_text())
        except AttributeError:
            print("AttributeError")

        for viewid in list_id:
            #print(goodsidurl)
            try:
                goodsidurl="https://www.xxx.com/product/"+viewid
                r=requests.get(goodsidurl)
                r.encoding='gb2312'
                time.sleep(3)
                print( time.asctime( time.localtime(time.time()) ))
                soup = BeautifulSoup(r.text,'html.parser')
                thisurl=goodsidurl

                tit_p=soup.find('div',id='jbmiddle')
                tit=tit_p.find('h1').get_text()
                #print(tit)
                #print("..")
                money=str(soup.find('span',id='nowmoney').get_text())
                #print(money)
                txt_o=soup.find('div',id='bqdiv1')
                imgs=txt_o.find_all('img')
                for img in imgs:
                    imgscr=str(img.get('src'))
                    if len(imgscr)>1 and -1==imgscr.find('http') :
                        img['src']="https://www.xxx.com"+imgscr
                txt=str(soup.find('div',id='bqdiv1'))
                #time.sleep(10000000)
                if(len(money)>1 and len(tit)>1 and len(txt)>1 ):
                    i=list_id.index(viewid)
                    t_imgsrc=list_img[i]
                    k=t_imgsrc.rfind('/')+1
                    bhname=t_imgsrc[k:].replace('-','')
                    bh=str(int(time.time()))+"-"+str(userid)
                    xsnum=0
                    if(1==random.randint(0,50)):
                        xsnum=random.randint(1,10)
                    url = "http://xxx/zhf/savedata.php"
                    data = {"thisurl":thisurl,"xsnum":xsnum,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname}
                    if len(txt)>1:
                        res = requests.post(url=url,data=data)
                        print(res.text)
                        if(res.text.find("success")>1):
                            f_path='d:\\upload\\bh\\'+bhname  # match the backslash path used by uploadfile below
                            inc=0
                            while((not os.path.isfile(f_path)) and inc<3 ):
                                inc+=1
                                print("download img->"+t_imgsrc+" -> "+f_path)
                                self.downloadimg(t_imgsrc,f_path)
                                time.sleep(5)
                            if  os.path.isfile(f_path):
                                self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
                            time.sleep(5)

                            # os.rename(filepath,filepath.replace(".html",".xxxx"))
                        elif (res.text.find("isexist!")>=0):
                            print(thisurl)
                    else:
                        print("txt is null")
            except AttributeError:
                print("AttributeError "+viewid)
            except FileNotFoundError:
                print("FileNotFoundError "+viewid)
            except BaseException:
                print("BaseException "+viewid)

    def suibianlu_s(self,page,userid):
        url="https://www.xxx.com/code_"+str(page)
        list_id=[]
        list_img=[]
        r=requests.get(url)
        r.encoding='utf-8'
        try:
            soup = BeautifulSoup(r.text,'html.parser')
            lis=soup.find_all('li',class_="clearfix" )
            for li in lis:
                #time.sleep(1)
                #print(li.find('a').get('href'))
                goodsid=li.find('a').get('href')
                if len(goodsid):
                    list_id.append(goodsid)
                    s=goodsid.split('/')
                    bhn=(s[len(s)-1]).replace(".html","")

                imgsrc=li.find('img').get('src')
                if len(imgsrc):
                    list_img.append(imgsrc)
                    #print(bhname)

                #print("-->"+li.get_text())
        except AttributeError:
            pass

        for goodsidurl in list_id:
            #print(goodsidurl)
            try:
                r=requests.get(goodsidurl)
                r.encoding='utf-8'
                time.sleep(3)
                print( time.asctime( time.localtime(time.time()) ))
                soup = BeautifulSoup(r.text,'html.parser')
                thisurl=goodsidurl
                bh=str(int(time.time()))+"-"+str(userid)
                tit=soup.find('h1',class_="i30 mb5").get_text()
                money=str(random.randint(10,30)*100)
                #print(tit.get_text())
                txt=str(soup.find('div',class_='info-con'))


                #time.sleep(10000000)
                if(len(money)>1 and len(tit)>1 and len(txt)>1 ):
                    i=list_id.index(goodsidurl)
                    #s=thisurl.split('/')
                    t_imgsrc=list_img[i]
                    s=thisurl.split('/')
                    bhn=(s[len(s)-1]).replace(".html","")
                    imgsuffix=t_imgsrc.split(".")
                    bhname=bhn+"."+imgsuffix[len(imgsuffix)-1]
                    xsnum=0
                    if(1==random.randint(0,50)):
                        xsnum=random.randint(1,10)
                    url = "http://xxx/zhf/savedata.php"
                    data = {"thisurl":thisurl,"xsnum":xsnum,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname}
                    if len(txt)>1:
                        res = requests.post(url=url,data=data)
                        print(res.text)
                        if(res.text.find("success")>1):
                            f_path='d:\\upload\\bh\\'+bhname  # match the backslash path used by uploadfile below
                            if not os.path.isfile(f_path):
                                print("download img->"+t_imgsrc+" -> "+f_path)
                                with open(f_path,'wb') as f:
                                    img=requests.get(t_imgsrc)
                                    f.write(img.content)
                                # the with block closes the file; no explicit close needed
                                time.sleep(5)
                            self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
                            time.sleep(5)

                            # os.rename(filepath,filepath.replace(".html",".xxxx"))
                        elif (res.text.find("isexist!")>=0):
                            print(thisurl)
                    else:
                        print("txt is null")
            except AttributeError:
                pass
            except FileNotFoundError:
                pass
            except BaseException:
                pass

    def  upload(self,filepath,userid):
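        # Parse a locally saved product page, rewrite its image paths, and push the fields to the backend.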
        #FILE_OBJECT= open('order.log','r', encoding='UTF-8')
        soup = BeautifulSoup(open(filepath,'r',encoding='UTF-8'), 'html.parser')
        #print (soup.prettify())
        thisurl=soup.find('a',id='thiseurl').get_text()

        bh= soup.find('a',id='bh').get_text()+"-"+str(userid)


        bhname=soup.find('a',id='bh_name').get_text()
        #burl="../upload/code/"+nbh;
        #print(bh)
        #ty1id=soup.find('a',id='ty1').get_text()
        #ty2id=soup.find('a',id='ty2').get_text()
        tit=soup.find('a',id='tit').get_text()
        money=str(soup.find('h1',id='money').get_text()).replace(',','')

        #txt=str(soup.find('div',id='txt'));
        # for (var i=0;i<document.getElementsByClassName('c_r_des lazyload').valueOf(0)[0].getElementsByTagName('img').length;i++){
          #var imgi=document.getElementsByClassName('c_r_des lazyload').valueOf(0)[0].getElementsByTagName('img').valueOf(0)[i].src
          #alert(imgi)
        #}
        imgs=soup.find_all('img',id="c_pic")
        for img in imgs:
            imgscr=str(img.get('src'))
            imglayscr=img.get('lay-src')
            if imglayscr and len(imglayscr)>1:  # lay-src may be missing (None)
                img['src']="../upload/code"+self.getsrcname(imglayscr)
                img['lay-src']="../upload/code"+self.getsrcname(imglayscr)
            else:
                img['src']="../upload/code"+self.getsrcname(imgscr)

        txt=str(soup.find('div',id='txt'))
        #print (txt )
        #http://xx/savedata.php


        if(len(money)>1 and len(tit)>1 and len(txt)>1 ):
            print("---->" + time.asctime( time.localtime(time.time()) ))
            url = "http://xx/zhf/savedata.php"
            data = {"thisurl":thisurl,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname}
            if len(txt)>1:
                res = requests.post(url=url,data=data)
                print(res.text)
                if(res.text.find("success")>1):
                    self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
                    time.sleep(10)
                    # os.rename(filepath,filepath.replace(".html",".xxxx"))
                elif (res.text.find("isexist!")>=0):
                    pass
                try :
                    os.rename(filepath,filepath.replace(".html",".xxxx"))
                except FileExistsError:
                    os.remove(filepath)

                    #os.system("start D:\\Users\\aihao\\webimg.exe "+huzhanurl)
            else:
                print("txt is null")

    def auto_huzan(self):
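        # Walk a directory of saved .html pages and upload each one as user 975.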
        uploaddir="D:\\Users\\aihao\\UPLOAD GOODS"
        downloaddir="D:\\Users\\msi\\Downloads"
        vbpagedir="D:\\Users\\administrator\\documents\\visual studio 2015\\Projects\\webimg\\webimg\\bin\\Debug\\2020622"
        p_dir=vbpagedir
        dirs = os.listdir(p_dir  )
        index=0
        for file in dirs:
            index+=1
            print("progress"+str(index)+"/"+str(len(dirs)))
            f_p=os.path.join(p_dir,file)        
            if os.path.isfile(f_p):           
                if file.find(".html")>1:
                    #print(file)
                    try:
                        #getburimg(f_p)
                        self.upload(f_p,975)
                        #break                               
                        time.sleep(5)
                        pass
                    except   UnicodeDecodeError:
                        print(f_p)
                       
    def auto_suibianlu_s(self):            
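        # Resume from pyconfig.ini, crawl page after page until maxpage, persisting progress;
        # the other auto_* methods below follow the same pattern.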
        maxpage=224
        cdir=os.path.abspath(os.path.dirname(__file__))
        file=cdir+"\\pyconfig.ini"   
        config = configparser.ConfigParser()        
        config.read(file)
        pagestr = int(config.get('page','start2') )    
        print("start in ->"+str(pagestr))    
        while 1:
            i=random.randint(0,len(self.idlist)-1)
            userid=self.idlist[i]
            if pagestr>maxpage:
                print("complete")
                break
            print ("current page is:"+str(pagestr)+"  "+ time.asctime( time.localtime(time.time()) ))
            time.sleep(5)
            self.suibianlu_s(pagestr,userid)           
            pagestr+=1
            config.set('page','start2', str(pagestr))  # save the next page to process; writing pagestr+1 skipped a page after a restart
            with open(file,'w') as configfile:
                config.write(configfile)
   
    def auto_w87zx(self):
         #w87zx(1,991)       
        maxpage=1444
        cdir=os.path.abspath(os.path.dirname(__file__))
        file=cdir+"\\pyconfig.ini"   
        config = configparser.ConfigParser()        
        config.read(file)
        pagestr = int(config.get('page','start3') )    
        print("start in ->"+str(pagestr))    
        while True:
            i=random.randint(0,len(self.idlist)-1)
            userid=self.idlist[i]
            if pagestr>maxpage:
                print("complete")
                break
            print ("current page is:"+str(pagestr)+"  "+ time.asctime( time.localtime(time.time()) ))
            time.sleep(5)
            self.w87zx(pagestr,userid)           
            pagestr+=1
            config.set('page','start3', str(pagestr))  # save the next page to process
            with open(file,'w') as configfile:
                config.write(configfile)    

    def auto_zhisu(self):
        maxpage=1316  
        cdir=os.path.abspath(os.path.dirname(__file__))
        file=cdir+"\\pyconfig.ini"   
        config = configparser.ConfigParser()        
        config.read(file)
        pagestr = int(config.get('page','start4') )    
        print("start in ->"+str(pagestr))    
        while True:
            i=random.randint(0,len(self.idlist)-1)
            userid=self.idlist[i]
            if pagestr>maxpage:
                print("complete")
                break
            print ("current page is:"+str(pagestr)+"  "+ time.asctime( time.localtime(time.time()) ))
            time.sleep(5)
            self.zhisu(pagestr,userid)           
            pagestr+=1
            config.set('page','start4', str(pagestr))  # save the next page to process
            with open(file,'w') as configfile:
                config.write(configfile)    

    def auto_haozan(self):
        maxpage=1355  
        cdir=os.path.abspath(os.path.dirname(__file__))
        file=cdir+"\\pyconfig.ini"   
        config = configparser.ConfigParser()        
        config.read(file)
        pagestr = int(config.get('page','start5') )    
        print("start in ->"+str(pagestr))    
        while 1:
            i=random.randint(0,len(self.idlist)-1)
            userid=self.idlist[i]
            if pagestr>maxpage:
                print("complete")
                break
            print ("current page is:"+str(pagestr)+"  "+ time.asctime( time.localtime(time.time()) ))
            time.sleep(5)
            self.haozhan(pagestr,userid)           
            pagestr+=1
            config.set('page','start5', str(pagestr))  # save the next page to process
            with open(file,'w') as configfile:
                config.write(configfile)        
  
    def auto_office368(self):    
         #office368(2,975)  
        maxpage=999  
        cdir=os.path.abspath(os.path.dirname(__file__))
        file=cdir+"\\pyconfig.ini"   
        config = configparser.ConfigParser()        
        config.read(file)
        pagestr = int(config.get('page','start6') )    
        print("start in ->"+str(pagestr))    
        while True:
            i=random.randint(0,len(self.idlist)-1)
            userid=self.idlist[i]
            if pagestr>maxpage:
                print("complete")
                break
            print ("current page is:"+str(pagestr)+"  "+ time.asctime( time.localtime(time.time()) ))
            time.sleep(5)
            self.office368(pagestr,userid)           
            pagestr+=1
            config.set('page','start6', str(pagestr))  # save the next page to process
            with open(file,'w') as configfile:
                config.write(configfile)    
    def zbj_search(self):
        a3=self.alltypelist[0]
        id12=self.getservertype12id(a3[3],a3[4])
        id3=a3[0]
        name3=a3[8]
        userid=975
        #print(id12[0],id12[1],id3,name3)
        searchkey=name3
        url="https://xxx.zbj.com/search/f/?kw="+searchkey
        r=requests.get(url)
        soup = BeautifulSoup(r.text,'html.parser')
        divlistbox=soup.find('div',class_="witkey-list-grid j-service-provider-wrap")
        #print(divlistbox)
        a_s=divlistbox.find_all("a",class_='desc',target="_blank")
        alist=[]
        for a in a_s:
            a_href=a.get("href")
            if a_href not in alist:
                #print(a_href)
                if not re.match(r'^http',a_href):
                    a_href="https:"+a_href
                # record every visited link; originally only scheme-relative URLs were
                # appended, so deduplication never worked for absolute ones
                alist.append(a_href)
                r2=requests.get(url=a_href)
                soup2= BeautifulSoup(r2.text,"html.parser")
                tit=str(soup2.find('h2',class_="yahei").get_text()) 
                #print(tit)
                bh_imgsrc=soup2.find("img",class_="service-case-img").get('src')
                #print(bh_imgsrc)
                money=str(soup2.find("span",class_="price").get_text())
                #print(money)
                txt=str(soup2.find(id="J-description"))
                #print(txt)
                bh=str(int(time.time()))+"-"+str(userid)
                #print(userid,bh,id12[0],id12[1],id3,tit,txt,money,money)
                self.insertServerdata(userid,bh,id12[0],id12[1],id3,tit,txt,money)
                break
    def web_scrollTo_end(self,tm=5):
        # Scroll to the bottom in 100px steps so lazy-loaded content gets rendered.
        js = "return action=document.body.scrollHeight"
        # current scroll position starts at 0
        height = 0
        # total height of the page right now
        new_height = self.browser.execute_script(js)
        while height < new_height:
            # step the scrollbar down to the bottom of the page
            for i in range(height, new_height, 100):
                self.browser.execute_script('window.scrollTo(0, {})'.format(i))
                time.sleep(0.3)
            height = new_height
            time.sleep(0.3)
            new_height = self.browser.execute_script(js)
    def gettitissexist(self,tit):
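        # Ask the backend whether a record with this title already exists.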
        data={"tit":tit}
        url="http://xxx/zhf/savedata.php?method=gettitisexist"      
        r=requests.post(url,data)
        if r.text.find("1")>=0:
            return True
        else:
            return False    
    def huzan2(self,page):
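        # Selenium-driven crawl: render each product page in Edge, screenshot its images,
        # and post everything to the backend.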
        # msedgedriver.exe must be reachable by selenium, e.g. D:\\Program Files\\Python36\\Scripts\\msedgedriver.exe
        url="https://www.xxx.com/code/page/"+str(page)  
        self.browser.get(url) 
        self.browser.maximize_window()
        
        #self.browser.minimize_window() 
        #self.web_scrollTo_end(1)
        html = BeautifulSoup(self.browser.page_source, "html.parser")   
        list_items=html.find("div",class_="list_items")
        if list_items:
            lias= list_items.find_all("a",class_="pic") 
            for a in lias:
                #print("slp 10 s")
                #time.sleep(10) 
                userid=self.getrandomuserid()
                aurl="https://www.xxx.com"+a.get("href")
                thisurl=aurl
                self.browser.get(aurl)
                html2 = BeautifulSoup(self.browser.page_source, "html.parser")
                if html2.find("div",class_="layui-layer-btn"):
                    print("is error product")
                    continue 
                if html2.find("div",class_="layui-layer-content"):
                    self.browser.execute_script("document.getElementsByClassName('layui-layer-ico layui-layer-close layui-layer-close2').valueOf()[0].click()")                              
                tit=trim(html2.find("div",class_="c_g_tit").get_text())
                # title
                print(tit)  
                if self.gettitissexist(tit):
                    print("is exist000")
                    continue       
                titimg=self.browser.find_element_by_class_name("G-image")
                titimgnamesrc=titimg.get_attribute("src")
                i=titimgnamesrc.rfind('/')
                titimgnam=titimgnamesrc[i:].replace("/","")
                bhimgname=re.sub(r"\.(jpg|png)$","",titimgnam)+".png"  # strip the image extension, then save as .png
                # display image
                print(bhimgname)
                dirp=os.path.join("D:\\upload\\bh",bhimgname)
                titimg.screenshot(dirp)  # save the display image by screenshotting the element
                time.sleep(2)
                price=html2.find("div",class_="price").get_text()
                money=re.sub(r"¥|,|,","",price)
                # price
                #print(money)
                ysweb=""
                c_g_spe=html2.find("a",class_="demo")
                if c_g_spe:
                    ysweb=c_g_spe.get("href") 
                else:
                    print("no ywsem continue") 
                    continue                      
                # demo URL
                #print(ysweb)
                #self.web_scrollTo_end(1) 
                spatt=html2.find("ul",class_="c_r_par")
                lis=spatt.find_all("li")
                tysx=""
                for li in lis:
                    #cite=li.find("cite").get_text()
                    em=li.find("em").get_text()
                    # strip bracketed notes (half- and full-width), "+..." tails, and the filler values 其他/无
                    em=re.sub(r'(\(.*\))|(\+.*)|(（.*）)|(其他)|(无)',"",em)
                    tysx+=str(em)+","
                # descriptive attributes
                #self.browser.execute_script("document.body.style.zoom='0.8'")
                self.web_scrollTo_end(1)
                self.browser.execute_script("if(document.getElementsByClassName('c_r_menu fixed').length){document.getElementsByClassName('c_r_menu fixed').valueOf()[0].innerText=''}")
                #print(tysx)    
                ishidden="<a id='delatttrue' hidde></a>"
                txt=html2.find("div",id="c_aa")
                cpics=self.browser.find_elements_by_id("c_pic")
                drt=str(datetime.datetime.now().strftime("%Y%m%d"))
                ddrt="D:\\upload\\code"+drt
                if not os.path.isdir(ddrt):
                    os.mkdir(ddrt)
                cpicnamelist=[]
                for cpic in cpics:
                    cpici=cpic.get_attribute("src")
                    i=cpici.rfind('/')+1
                    cpciname=cpici[i:]
                    cpciname=re.sub(r"\.(jpg|png)$","",cpciname)+".png"  # strip the image extension, then save as .png
                    cpicnamelist.append(cpciname)
                    #print(cpciname)
                    cpic.screenshot(ddrt+"\\"+cpciname)
                    time.sleep(2.5)
                imgs=txt.find_all("img")
                index=0
                for img in imgs:                     
                    del img['lay-src']                    
                    img['src']="/upload/code"+drt+"/"+ cpicnamelist[index]  
                    index+=1
                    # alternative: delete the img tag entirely with img.decompose()
                txt= ishidden+str(txt)
                # product description
                #print(txt)                                  
                if(   len(money)>1 and len(tit)>1 and len(txt)>1 ):  
                    bhname=bhimgname
                    bh=str(int(time.time()))+"-"+str(userid)
                    xsnum=0
                    if(1==random.randint(0,50)):
                        xsnum=random.randint(1,10)
                    url = "http://xxx/zhf/savedata.php"
                    data = {"thisurl":thisurl,"xsnum":xsnum,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname,"ysweb":ysweb,"tysx":tysx}
                    if len(txt)>1:
                        res = requests.post(url=url,data=data)
                        print("requests--->"+res.text)
                        if(res.text.find("success")>1):
                            f_path='d:\\upload\\bh\\'+bhname
                            if  os.path.isfile(f_path):                              
                                self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
                                time.sleep(5)
                            # os.rename(filepath,filepath.replace(".html",".xxxx"))
                        elif (res.text.find("isexist001!")>=0):
                            print(thisurl)
                    else:
                        print("txt is null")  
                             
                    print("slp 5 s")
                    time.sleep(5)      
                   

    def auto_huzan2(self):
        cdir=os.path.abspath(os.path.dirname(__file__))
        file=cdir+"\\pyconfig.ini"   
        config = configparser.ConfigParser()        
        config.read(file)
        pagestr = int(config.get('page','start8') )    
        maxpage=9999
        #print("start in ->"+str(pagestr))    
        while 1:           
            if pagestr>maxpage:
                print("complete")
                break
            print("-------------------------------------------------------------------------------------------")
            print ("current page is:"+str(pagestr)+"  "+ time.asctime( time.localtime(time.time()) ),"slp 5 s")
            time.sleep(5)
            self.huzan2(pagestr)           
            pagestr+=1
            config.set('page','start8', str(pagestr))  # save the next page to process
            with open(file,'w') as configfile:
                config.write(configfile)
if __name__ == "__main__":        
    cfun = mytool()
    func=8
    try:
        if 0:
            cfun.uploadfile("1563516747922.png","d:\\upload\\bh\\1563516747922.png",15,'1592801704-975')
        elif 0:
            url="https://www.xxx.com/upload/26076/1589429413-26076/0034519001589429461tp26076-1.jpg"
            f_path="C:\\Users\\msi\\Desktop\\picture.jpg"
            cfun.downloadimg(url,f_path)
            cfun.createthumbnail(f_path)
        elif 0:
            #suibianlu_s(105,975)
            pass
        elif 8==func:
            cfun.auto_huzan2()
            pass 
        elif 7==func:
            #print(str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))    
            cfun.zbj_search()
        elif 6==func:
            # office procurement
            cfun.auto_office368()
        elif 5==func:
            #haozhan  
            pass
            #cfun.auto_haozan()       
        elif 4==func:
            cfun.auto_zhisu()            
        elif 3==func:
            cfun.auto_w87zx()
        elif 2==func:
            cfun.auto_suibianlu_s()        
        elif 1==func:
            #huzan
            cfun.auto_huzan()
        else:
            print("please select func")  
    except BaseException:
        # swallow everything so an unattended run never dies; real errors are silently lost
        pass

