Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1#!/usr/local/bin/python 

2# encoding: utf-8 

3""" 

4*Download resources from a list of URLs.  

5 

6There are options to rename all the downloaded resource, index the files, set differing download locations and pass basic authentication credentials.* 

7 

8:Author: 

9 David Young 

10""" 

11from __future__ import print_function 

12from future import standard_library 

13standard_library.install_aliases() 

14from builtins import zip 

15from builtins import str 

16import sys 

17import os 

18os.environ['TERM'] = 'vt100' 

19from fundamentals import tools 

20import urllib 

21 

def multiobject_download(
        urlList,
        downloadDirectory,
        log,
        timeStamp=True,
        timeout=180,
        concurrentDownloads=10,
        resetFilename=False,
        credentials=False,
        longTime=False,
        indexFilenames=False
):
    """
    *get multiple url documents and place them in specified download directory/directories*

    **Key Arguments**

    - ``urlList`` -- list of document urls
    - ``downloadDirectory`` -- directory(ies) to download the documents to - can be one directory path or a list of paths the same length as urlList
    - ``log`` -- the logger
    - ``timeStamp`` -- append a timestamp to the name of the URL (ensure unique filenames)
    - ``longTime`` -- use a longer timestamp when appending to the filename (greater uniqueness)
    - ``timeout`` -- the timeout limit for downloads (secs)
    - ``concurrentDownloads`` -- the number of concurrent downloads allowed at any one time
    - ``resetFilename`` -- filename(s) to reset the downloaded filenames to (a list when downloadDirectory is a single path)
    - ``credentials`` -- basic http credentials { 'username' : "...", 'password' : "..." }
    - ``indexFilenames`` -- prepend filenames with index (where url appears in urllist)

    **Return**

    - list of timestamped documents (same order as the input urlList; ``None`` entries for failed downloads)

    **Usage**

    ```python
    # download the pages linked from the main list page
    from fundamentals.download import multiobject_download
    localUrls = multiobject_download(
        urlList=["https://www.python.org/dev/peps/pep-0257/","https://en.wikipedia.org/wiki/Docstring"],
        downloadDirectory="/tmp",
        log="log",
        timeStamp=True,
        timeout=180,
        concurrentDownloads=2,
        resetFilename=False,
        credentials=False,  # { 'username' : "...", 'password' : "..." }
        longTime=True
    )

    print(localUrls)
    # OUT: ['/tmp/untitled_20160316t160650610780.html', '/tmp/Docstring_20160316t160650611136.html']
    ```

    .. image:: https://i.imgur.com/QYoMm24.png width=600px
    """
    import sys
    import eventlet
    import socket
    import base64
    from fundamentals.download import _fetch, _dump_files_to_local_drive, append_now_datestamp_to_filename, extract_filename_from_url

    ## >SETTINGS ##
    # TIMEOUT IN SECONDS
    timeout = float(timeout)
    socket.setdefaulttimeout(timeout)

    def _authorize(request):
        # ADD A BASIC-AUTH HEADER WHEN CREDENTIALS ARE SUPPLIED.
        # FIX: `base64.encodestring` was removed in Python 3.9 and always
        # required bytes input on py3 -- use b64encode with explicit
        # encode/decode so auth works on modern interpreters
        if credentials != False:
            username = credentials["username"]
            password = credentials["password"]
            token = base64.b64encode(
                ('%s:%s' % (username, password)).encode('utf-8')).decode('ascii')
            request.add_header("Authorization", "Basic %s" % token)
        return request

    ###########################################################
    # >ACTION(S)                                              #
    ###########################################################
    # BUILD THE 2D ARRAY FOR MULTI_THREADED DOWNLOADS
    thisArray = []
    bodies = []
    localUrls = []
    theseUrls = []
    requestList = []

    totalCount = len(urlList)

    # IF ONLY ONE DOWNLOAD DIRECTORY
    if isinstance(downloadDirectory, ("".__class__, u"".__class__)):
        for i, url in enumerate(urlList):
            # EXTRACT THE FILENAME FROM THE URL
            if resetFilename and len(resetFilename):
                filename = resetFilename[i]
            else:
                filename = extract_filename_from_url(log, url)
                if indexFilenames:
                    filename = """%(i)03d_%(filename)s""" % locals()

            if not filename:
                # NO FILENAME COULD BE EXTRACTED -- FALL BACK TO A
                # TIMESTAMP-ONLY FILENAME SO THE DOWNLOAD IS NOT SKIPPED
                from datetime import datetime
                now = datetime.now()
                filename = now.strftime("%Y%m%dt%H%M%S%f")

            if(timeStamp):
                # APPEND TIMESTAMP TO THE FILENAME
                filename = append_now_datestamp_to_filename(
                    log, filename, longTime=longTime)
            # GENERATE THE LOCAL FILE URL
            localFilepath = downloadDirectory + "/" + filename
            thisArray.extend([[url, localFilepath]])

            # GENERATE THE REQUESTS
            request = _authorize(urllib.request.Request(url))
            requestList.append(request)

    elif isinstance(downloadDirectory, list):

        for u, d in zip(urlList, downloadDirectory):
            # EXTRACT THE FILENAME FROM THE URL
            if resetFilename:
                filename = resetFilename
            else:
                # FIX: previously referenced `url`, which is undefined in
                # this branch (the loop variable is `u`) -- raised NameError
                filename = extract_filename_from_url(log, u)

            if not filename:
                continue

            if(timeStamp):
                # APPEND TIMESTAMP TO THE FILENAME
                # FIX: pass `longTime` through so this branch honours the
                # argument like the single-directory branch does
                filename = append_now_datestamp_to_filename(
                    log, filename, longTime=longTime)
            # GENERATE THE LOCAL FILE URL
            localFilepath = d + "/" + filename
            thisArray.extend([[u, localFilepath]])
            log.debug(" about to download %s" % (u,))

            # GENERATE THE REQUESTS
            request = _authorize(urllib.request.Request(u))
            requestList.append(request)

    pool = eventlet.GreenPool(concurrentDownloads)
    i = 0

    try:
        log.debug(
            "starting mutli-threaded download batch - %s concurrent downloads" %
            (concurrentDownloads,))
        log.debug('len(requestList): %s' % (len(requestList),))
        for url, body in pool.imap(_fetch, requestList):
            urlNum = i + 1
            if urlNum > 1:
                # CURSOR UP ONE LINE AND CLEAR LINE
                sys.stdout.write("\x1b[1A\x1b[2K")
            percent = (float(urlNum) / float(totalCount)) * 100.
            print(
                " %(urlNum)s / %(totalCount)s (%(percent)1.1f%%) URLs downloaded" % locals())

            if(body):
                bodies.extend([body])
                theseUrls.extend([thisArray[i][1]])
            else:
                # FAILED DOWNLOAD -- KEEP PLACEHOLDERS SO THE RETURNED LIST
                # STAYS ALIGNED WITH THE INPUT urlList
                theseUrls.extend([None])
                bodies.extend([None])

            # DUMP THE FILES FROM MEMORY EVERY CONCURRENT DOWNLOAD CYCLE
            if i % concurrentDownloads == 0:
                _dump_files_to_local_drive(bodies, theseUrls, log)
                localUrls.extend(theseUrls)
                # RESET THE TMP ARRAYS
                bodies = []
                theseUrls = []
            i += 1
    except Exception as e:
        log.error(
            "something went wrong with the mutli-threaded download : " + str(e) + "\n")

    # DUMP REMAINING FILES TO THE LOCAL DRIVE
    _dump_files_to_local_drive(bodies, theseUrls, log)
    localUrls.extend(theseUrls)

    return localUrls