1. Code
import requests, bs4, csv, os, re, time

'''Collect product URLs from one collection page'''
def shopifylist(url, site):
    while True:
        try:
            res = requests.get(url, timeout=30)
            res.encoding = res.apparent_encoding
            print('Requesting', url, 'status', res.status_code)
            res.raise_for_status()  # raise an exception if the status is not 200
            break
        except Exception:
            wait = 3
            print('Connection failed, retrying in', wait, 'seconds')
            time.sleep(wait)
            print('Reconnecting...')
    print('Connection OK, collecting product links')
    noStarchSoup = bs4.BeautifulSoup(res.text, 'html.parser')  # html.parser is the built-in parser
    cards = noStarchSoup.select('.product-card.sc-pb-element')
    for card in cards:
        produrl = 'https://tribalhollywood.com' + card.get('href')
        print('Got product url')
        shopify(produrl, site)  # scrape the product page
    return noStarchSoup  # reuse the soup so the caller can check for a "next page" link
'''End of URL collection'''

'''Scrape a single product page'''
def shopify(url, site):
    print('Requesting product page', url)
    while True:
        try:
            res = requests.get(url, timeout=30)
            res.encoding = res.apparent_encoding
            print('Product page fetched, status:', res.status_code)
            res.raise_for_status()  # raise an exception if the download failed
            break
        except Exception:
            print('Request for product page', url, 'failed, reconnecting')
            time.sleep(3)  # brief pause so a persistent failure does not busy-loop
    noStarchSoup = bs4.BeautifulSoup(res.text, 'html.parser')
    # this Shopify theme puts the title under class "product-single__title"
    name = noStarchSoup.select('.product-single__title')[0].getText()
    price = noStarchSoup.select('.product-single__price')[0].getText()
    price = re.sub(' ', '', price)
    price = re.sub('\n', '', price)
    # note: for class="rte product-single__description" the selector only needs product-single__description
    des = noStarchSoup.select('.product-single__description')[0].getText()
    des = re.sub('Hollywood', 'customadd.com', des)  # replace the brand name in the description
    img = noStarchSoup.select('#ProductThumbs-product-template img')
    if not img:
        # single-image product: fall back to the main image
        img = noStarchSoup.select('.sc-pb-element1')
        img_urls = 'http:' + img[0].get('src')
        img_urls = re.sub('_960x', '', img_urls)  # strip the size suffix to get the full-size image
    else:
        urls = []
        for tag in img:
            imgurl = tag.get('src')
            imgurl = re.sub('_160x160', '', imgurl)  # strip the thumbnail size suffix
            urls.append('https:' + imgurl)
        img_urls = ','.join(urls)
    fileHeader = ['title', 'product url', 'price', 'description', 'images']
    row = [name, url, price, des, img_urls]
    # output file: the folder must already exist, and it is written as UTF-8;
    # newline='' stops the csv module from inserting blank rows on Windows
    while True:
        try:
            csvFile = open(site, 'a', encoding='utf-8', newline='')
            break
        except Exception:
            print(site, 'could not be opened for writing, retrying...')
            time.sleep(5)
    size = os.path.getsize(site)  # a size of 0 means the file is empty and still needs a header row
    writer = csv.writer(csvFile)
    if size == 0:
        writer.writerow(fileHeader)
    writer.writerow(row)
    csvFile.close()
    print('Item scraped!')
'''End of product scraping'''

# urlpro = str(input('Enter the collection URL to scrape'))
urlpro = 'https://www.tribalhollywood.com/collections/mens-necklaces'
site = 'C:\\Users\\Kevin\\Desktop\\mouse\\mens-necklaces1.csv'
nt = ['not empty yet']  # sentinel so the loop runs at least once
n = 1
while nt != []:
    url = urlpro + '?page=' + str(n)
    soup = shopifylist(url, site)  # scrape one collection page
    print('Finished page', n)
    n = n + 1
    nt = soup.select('.next')  # an empty result means there is no next page
print('All pages scraped!')
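The hand-rolled while/try retry loops in the script can also be delegated to the retry machinery that requests already ships via urllib3. This is an optional sketch, not part of the original post; the retry count, backoff factor, and status list are arbitrary example values:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    # retry up to 5 times with exponential backoff on connection errors
    # and on 429/5xx responses; the numbers here are illustrative defaults
    retry = Retry(total=5, backoff_factor=1,
                  status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

session = make_session()
res = session.get('https://www.tribalhollywood.com/collections/mens-necklaces', timeout=30)
res.raise_for_status()

With a session like this, each requests.get in shopifylist and shopify could become a single session.get call and the surrounding retry loops could be dropped.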
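For Shopify stores specifically, there is often a shortcut that avoids HTML parsing and CSS selectors altogether: many storefronts expose a collection as JSON at /collections/<handle>/products.json. This is a rough alternative sketch under the assumption that the store has left that public endpoint enabled, which not every shop does:

import requests

base = 'https://www.tribalhollywood.com/collections/mens-necklaces/products.json'
page = 1
while True:
    res = requests.get(base, params={'limit': 250, 'page': page}, timeout=30)
    res.raise_for_status()
    products = res.json().get('products', [])
    if not products:
        break  # an empty list means we are past the last page
    for p in products:
        title = p['title']
        price = p['variants'][0]['price']  # price of the first variant
        images = ','.join(img['src'] for img in p['images'])
        print(title, price, images)
    page += 1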
Source: https://zhuanlan.zhihu.com/p/101377117