添加链接
link管理
链接快照平台
  • 输入网页链接,自动生成快照
  • 标签化管理网页链接

python多线程下载文件

Kaggle 前一段时间公布了 imaterialist-product 竞赛,闲着无事就下载了数据集看一下。它的数据集以 json 文件的形式给出了每张图片的类别、id 与 url 链接地址,因此需要先从 json 文件中读取这些信息,再编写程序逐张下载。

首先就直接读出url地址,利用python的urllib包,单线程进行下载,速度特别慢,然后就思考采用多线程下载提高速度

单线程读取json文件并下载

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Single-threaded downloader.

Reads image records (``url``, ``id``, ``class``) from a JSON file and saves
each image into a per-class sub-directory, one request at a time.
"""
import os
import json as js
import time
import urllib.error
import urllib.request
import socket

# NOTE: the Python 2 only ``import urllib2`` was removed. This script relies
# on ``urllib.request``, which exists only on Python 3, where ``urllib2``
# cannot be imported at all.

# Abort a stalled connection after 30 s so the loop can move on to the next file.
socket.setdefaulttimeout(30)

json_path = "json/train.json"
image_save = "imertialist_image/train"

# The data set has 2019 classes; create one sub-directory per class up front.
os.makedirs(image_save, exist_ok=True)
for img_cla in range(2019):
    os.makedirs(os.path.join(image_save, str(img_cla)), exist_ok=True)

# Load the whole index, then release the file handle before downloading.
with open(json_path, encoding="utf-8") as f:
    setting = js.load(f)
images = setting["images"]

for img in images:
    img_url = img["url"]
    img_id = img["id"]
    img_class = str(img["class"])
    # Only classes above 36 are fetched.
    # NOTE(review): presumably a manual "resume" threshold from an earlier
    # interrupted run -- confirm before reusing this script.
    if img["class"] <= 36:
        continue
    print(img_url)
    try:
        urllib.request.urlretrieve(
            img_url,
            # str() keeps the join valid even if the id is numeric.
            os.path.join(image_save, img_class, str(img_id)),
        )
    except (urllib.error.URLError, socket.timeout):
        # HTTPError is a subclass of URLError; skip any image that cannot
        # be fetched and carry on with the next one.
        continue
    print(img_class)
print("end")

单线程下载速度特别慢,因此改用 python 多线程下载,速度明显加快

# -*- coding: utf-8 -*-
import os
from contextlib import closing
import threading
import requests
import json as js
# Request headers sent with every download; a browser-like User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
}
# JSON file that lists the image URLs
json_path = "json/train.json"
# Output directory
out_dir = "train"
# NOTE(review): presumably the number of worker threads to start; the code
# that consumes it is not visible here.
thread_num = 400
# HTTP request timeout, in seconds
timeout = 30

# 2019 classes in total, each saved into its own sub-directory.
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(out_dir, exist_ok=True)
for img_cla in range(2019):
    os.makedirs(os.path.join(out_dir, str(img_cla)), exist_ok=True)
def download(img_url, img_name, img_class):
    """Download one image into ``out_dir/<img_class>/<img_name>``.

    Args:
        img_url: URL to fetch.
        img_name: File name to store the image under.
        img_class: Class label; selects the sub-directory of ``out_dir``.

    Returns:
        None. Does nothing if the target file already exists. A non-2xx
        status, a missing/zero Content-Length, or a failure while saving is
        reported on stdout and the image is skipped.
    """
    target = os.path.join(out_dir, str(img_class), str(img_name))
    if os.path.isfile(target):
        return  # already downloaded on a previous run
    with closing(requests.get(img_url, stream=True, headers=headers, timeout=timeout)) as r:
        rc = r.status_code
        if rc < 200 or rc > 299:
            print('returnCode%s\t%s' % (rc, img_url))
            return
        content_length = int(r.headers.get('content-length', '0'))
        if content_length == 0:
            print('size0\t%s' % img_url)
            return
        # Stream into a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file that the
        # "already downloaded" check above would accept on the next run.
        partial = target + '.part'
        try:
            with open(partial, 'wb') as f:
                for data in r.iter_content(1024):
                    f.write(data)
            os.replace(partial, target)
        except Exception:
            print('savefail\t%s' % img_url)
            try:
                os.remove(partial)
            except OSError:
                pass  # nothing was written, or it is already gone
def get_img_url_generate(path=None):
    """Yield ``[url, id, class]`` for every image record that has a URL.

    Args:
        path: JSON file to read; defaults to the module-level ``json_path``.
            The file must contain an ``"images"`` list whose entries have
            ``"url"``, ``"id"`` and ``"class"`` keys.

    Yields:
        A three-element list ``[img_url, img_id, img_class]``. Records whose
        URL is empty are skipped.
    """
    if path is None:
        path = json_path
    # Parse the index fully and close the file before yielding anything, so
    # the handle is not held open for the lifetime of the generator.
    with open(path, 'r') as f:
        setting = js.load(f)
    for img in setting["images"]:
        img_url = img['url']
        if img_url:
            yield [img_url, img['id'], img['class']]
# Serialises access to the shared record iterator: a generator must not be
# advanced by two threads at the same time.
lock = threading.Lock()


def loop(imgs):
    """Worker body: pull records from ``imgs`` and download them until exhausted.

    Args:
        imgs: Iterator shared between worker threads, yielding
            ``(img_url, img_id, img_class)`` triples.

    Returns:
        None. A failure while downloading one image is reported on stdout
        and the worker continues with the next record.
    """
    print('thread %s is running...' % threading.current_thread().name)
    while True:
        try:
            with lock:
                img_url, img_id, img_class = next(imgs)
                print(img_class)
        except StopIteration:
            break  # no records left; let this worker finish
        try:
            download(img_url, img_id, img_class)
        except Exception:
            print('exceptfail\t%s' % img_url)
    print('thread %s is end...' % threading.current_thread().name)