Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1#!/usr/local/bin/python 

2# encoding: utf-8 

3""" 

4*Download resources from a list of URLs.  

5 

6There are options to rename all the downloaded resource, index the files, set differing download locations and pass basic authentication credentials.* 

7 

8:Author: 

9 David Young 

10""" 

11from __future__ import print_function 

12from future import standard_library 

13standard_library.install_aliases() 

14from builtins import zip 

15from builtins import str 

16import sys 

17import os 

18os.environ['TERM'] = 'vt100' 

19from fundamentals import tools 

20import urllib 

21 

def multiobject_download(
        urlList,
        downloadDirectory,
        log,
        timeStamp=True,
        timeout=180,
        concurrentDownloads=10,
        resetFilename=False,
        credentials=False,
        longTime=False,
        indexFilenames=False
):
    """
    *get multiple url documents and place them in specified download directory/directories*

    **Key Arguments**

    - ``urlList`` -- list of document urls
    - ``downloadDirectory`` -- directory(ies) to download the documents to - can be one directory path or a list of paths the same length as urlList
    - ``log`` -- the logger
    - ``timeStamp`` -- append a timestamp to the name of the URL (ensure unique filenames)
    - ``longTime`` -- use a longer timestamp when appending to the filename (greater uniqueness)
    - ``timeout`` -- the timeout limit for downloads (secs)
    - ``concurrentDownloads`` -- the number of concurrent downloads allowed at any one time
    - ``resetFilename`` -- filename(s) to reset the downloaded filenames to (a list when downloadDirectory is a single path)
    - ``credentials`` -- basic http credentials { 'username' : "...", 'password' : "..." }
    - ``indexFilenames`` -- prepend filenames with index (where url appears in urllist)

    **Return**

    - list of timestamped documents (same order as the input urlList; ``None`` entries for failed downloads)

    **Usage**

    ```python
    # download the pages linked from the main list page
    from fundamentals.download import multiobject_download
    localUrls = multiobject_download(
        urlList=["https://www.python.org/dev/peps/pep-0257/","https://en.wikipedia.org/wiki/Docstring"],
        downloadDirectory="/tmp",
        log="log",
        timeStamp=True,
        timeout=180,
        concurrentDownloads=2,
        resetFilename=False,
        credentials=False,  # { 'username' : "...", 'password' : "..." }
        longTime=True
    )

    print(localUrls)
    # OUT: ['/tmp/untitled_20160316t160650610780.html', '/tmp/Docstring_20160316t160650611136.html']
    ```

    .. image:: https://i.imgur.com/QYoMm24.png width=600px
    """
    import sys
    import eventlet
    import socket
    import base64
    from fundamentals.download import _fetch, _dump_files_to_local_drive, append_now_datestamp_to_filename, extract_filename_from_url

    ## >SETTINGS ##
    # TIMEOUT IN SECONDS
    timeout = float(timeout)
    socket.setdefaulttimeout(timeout)

    def _authorize(request):
        # ADD A BASIC-AUTH HEADER WHEN CREDENTIALS ARE SUPPLIED.
        # FIX: `base64.encodestring` was removed in Python 3.9 and always
        # required bytes input on py3 -- use b64encode with explicit
        # encode/decode so auth works on modern interpreters
        if credentials != False:
            username = credentials["username"]
            password = credentials["password"]
            token = base64.b64encode(
                ('%s:%s' % (username, password)).encode('utf-8')).decode('ascii')
            request.add_header("Authorization", "Basic %s" % token)
        return request

    ###########################################################
    # >ACTION(S)                                              #
    ###########################################################
    # BUILD THE 2D ARRAY FOR MULTI_THREADED DOWNLOADS
    thisArray = []
    bodies = []
    localUrls = []
    theseUrls = []
    requestList = []

    totalCount = len(urlList)

    # IF ONLY ONE DOWNLOAD DIRECTORY
    if isinstance(downloadDirectory, ("".__class__, u"".__class__)):
        for i, url in enumerate(urlList):
            # EXTRACT THE FILENAME FROM THE URL
            if resetFilename and len(resetFilename):
                filename = resetFilename[i]
            else:
                filename = extract_filename_from_url(log, url)
                if indexFilenames:
                    filename = """%(i)03d_%(filename)s""" % locals()

            if not filename:
                # NO FILENAME COULD BE EXTRACTED -- FALL BACK TO A
                # TIMESTAMP-ONLY FILENAME SO THE DOWNLOAD IS NOT SKIPPED
                from datetime import datetime
                now = datetime.now()
                filename = now.strftime("%Y%m%dt%H%M%S%f")

            if(timeStamp):
                # APPEND TIMESTAMP TO THE FILENAME
                filename = append_now_datestamp_to_filename(
                    log, filename, longTime=longTime)
            # GENERATE THE LOCAL FILE URL
            localFilepath = downloadDirectory + "/" + filename
            thisArray.extend([[url, localFilepath]])

            # GENERATE THE REQUESTS
            request = _authorize(urllib.request.Request(url))
            requestList.append(request)

    elif isinstance(downloadDirectory, list):

        for u, d in zip(urlList, downloadDirectory):
            # EXTRACT THE FILENAME FROM THE URL
            if resetFilename:
                filename = resetFilename
            else:
                # FIX: previously referenced `url`, which is undefined in
                # this branch (the loop variable is `u`) -- raised NameError
                filename = extract_filename_from_url(log, u)

            if not filename:
                continue

            if(timeStamp):
                # APPEND TIMESTAMP TO THE FILENAME
                # FIX: pass `longTime` through so this branch honours the
                # argument like the single-directory branch does
                filename = append_now_datestamp_to_filename(
                    log, filename, longTime=longTime)
            # GENERATE THE LOCAL FILE URL
            localFilepath = d + "/" + filename
            thisArray.extend([[u, localFilepath]])
            log.debug(" about to download %s" % (u,))

            # GENERATE THE REQUESTS
            request = _authorize(urllib.request.Request(u))
            requestList.append(request)

    pool = eventlet.GreenPool(concurrentDownloads)
    i = 0

    try:
        log.debug(
            "starting mutli-threaded download batch - %s concurrent downloads" %
            (concurrentDownloads,))
        log.debug('len(requestList): %s' % (len(requestList),))
        for url, body in pool.imap(_fetch, requestList):
            urlNum = i + 1
            if urlNum > 1:
                # CURSOR UP ONE LINE AND CLEAR LINE
                sys.stdout.write("\x1b[1A\x1b[2K")
            percent = (float(urlNum) / float(totalCount)) * 100.
            print(
                " %(urlNum)s / %(totalCount)s (%(percent)1.1f%%) URLs downloaded" % locals())

            if(body):
                bodies.extend([body])
                theseUrls.extend([thisArray[i][1]])
            else:
                # FAILED DOWNLOAD -- KEEP PLACEHOLDERS SO THE RETURNED LIST
                # STAYS ALIGNED WITH THE INPUT urlList
                theseUrls.extend([None])
                bodies.extend([None])

            # DUMP THE FILES FROM MEMORY EVERY CONCURRENT DOWNLOAD CYCLE
            if i % concurrentDownloads == 0:
                _dump_files_to_local_drive(bodies, theseUrls, log)
                localUrls.extend(theseUrls)
                # RESET THE TMP ARRAYS
                bodies = []
                theseUrls = []
            i += 1
    except Exception as e:
        log.error(
            "something went wrong with the mutli-threaded download : " + str(e) + "\n")

    # DUMP REMAINING FILES TO THE LOCAL DRIVE
    _dump_files_to_local_drive(bodies, theseUrls, log)
    localUrls.extend(theseUrls)

    return localUrls