Original article; reposting is welcome. Please credit the source when reposting: http://blog.csdn.net/jmppok/article/details/16847697
A few days ago, on a whim, I wrote a Python program for downloading images from a forum. What exactly you use it for is up to you (you know what I mean)...
Written for Python 2.7.
Usage: httpdown_p.py url
For example, to download the "Android forum wallpapers" board, run: httpdown_p.py http://bbs.hiapk.com/forum-37-1.html
1. How it works
The program consists of three .py files: httpdown_p.py, analyseurl.py and analysepic.py.
httpdown_p.py is the main file. Given a URL, it calls analyseurl to extract the URLs of all sub-pages linked from that page (for example, the thread URLs listed on a forum's front page);
analyseurl.py currently only follows links one level deep, so the starting URL must be a page from which the image pages are reachable with a single click.
analysepic.py then takes the sub-page URLs produced by analyseurl.py and extracts all image URLs from each of those pages.
Finally, httpdown_p.py downloads the images. The overall flow is sketched below.
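To make the flow concrete, here is a condensed sketch; crawl() is a hypothetical wrapper used only for illustration, and the real entry point is main() in httpdown_p.py, listed in section 3:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Condensed sketch of the pipeline; the real logic lives in the three files below.
import analyseurl, analysepic, httpdown_p

def crawl(start_url):
    sub_pages = analyseurl.analyse_url(start_url)    # step 1: collect sub-page URLs
    all_pics = []
    for page in sub_pages:
        for pic in analysepic.analyse_pic(page):     # step 2: collect image URLs
            if pic not in all_pics:                  # skip duplicates
                all_pics.append(pic)
    httpdown_p.http_down_p(all_pics)                 # step 3: download the images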
2. Known issues
1) Downloads are not multi-threaded yet, so they are fairly slow (a possible threaded variant is sketched after httpdown_p.py in section 3);
2) Images are saved in the current directory; no sub-directories are created;
3) Only images larger than 40 KB are downloaded (this threshold is the size_limit constant in httpdown_p.py and can be changed there);
4) Duplicate images are not downloaded again (they are filtered out before downloading);
5) When parsing page URLs, only absolute URLs of the form "http://xx.html" or "http://xx.htm" are recognized; sites that use relative URLs are not handled. You can modify analyseurl.py yourself (one possible approach is sketched after this list);
6) When parsing image URLs, only addresses that start with "http://" and end with one of ['.jpg','jpeg','.png','.gif','.bmp'] are recognized; other formats and relative addresses cannot be parsed either.
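For issue 5, one possible direction (not part of the original scripts) is to resolve links with urlparse.urljoin instead of requiring an absolute "http://" prefix. The helper below, extract_page_urls, is a hypothetical sketch of that idea for Python 2.7:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Hypothetical helper, not part of the original scripts: resolve relative links.
import re
from urlparse import urljoin    # standard library in Python 2.7

def extract_page_urls(base_url, html):
    # Return absolute .html/.htm links found in one page of HTML.
    urls = []
    for href in re.findall(r'href="([^"]+\.html?)"', html, re.IGNORECASE):
        full = urljoin(base_url, href)   # turns "/forum-37-2.html" into an absolute URL
        if full not in urls:
            urls.append(full)
    return urls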
3. Code
httpdown_p.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib2, re, string, sys
from time import sleep
import md5
import json
import analysepic, analyseurl

out_dir = '.'          # images are saved to the current directory
size_limit = 40960     # only download images larger than 40 KB

def http_down(url):
    # Download a single image, skipping anything smaller than size_limit.
    try:
        # use proxy
        """
        proxy_support = urllib2.ProxyHandler({'http':'http://1.179.128.3:8080'})
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)
        """
        #request = urllib2.Request(url)
        http_file = urllib2.urlopen(url)
        info = http_file.info()
        #for key,value in info.items():
        #    print ("%s = %s" % (key, value))
        size = string.atoi(info['content-length'])
        #print 'size=%d' % (size)
        if size >= size_limit:
            filename = url[url.rindex('/')+1:]
            f = open(out_dir + '/' + filename, 'wb')
            content = http_file.read()
            f.write(content)
            f.close()
            print 'Down: %s' % (url)
        else:
            print 'Igno: %s' % (url)
        http_file.close()
    except Exception, e:
        print 'download error: ', e, url

def http_down_p(urls):
    # Download every image URL in turn (single-threaded for now).
    for url in urls:
        http_down(url)

def main(url):
    #pics = analysepic.analyse_pic("http://discovery.163.com/photoview/4T8F0001/39413.html?from=tj_xgtj#p=9C6GI6EJ4T8F0001")
    urls = analyseurl.analyse_url(url)           # step 1: sub-page URLs
    print "%d urls found!" % (len(urls))
    all_pics = []
    i = 0
    for u in urls:
        i += 1
        pics = analysepic.analyse_pic(u)         # step 2: image URLs per page
        print "[%3d]%d pics found in '%s'" % (i, len(pics), u)
        for pic in pics:
            if not pic in all_pics:              # skip duplicates
                all_pics.append(pic)
    print "total pics %d" % (len(all_pics))
    http_down_p(all_pics)                        # step 3: download

if __name__ == '__main__':
    if (len(sys.argv) < 2):
        print "Usage: httpdown_p.py url"
    else:
        main(sys.argv[1])
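http_down_p above downloads the images one by one (issue 1 in section 2). A quick way to parallelize it with the standard threading module might look like the sketch below; http_down_threaded is a hypothetical name and is not part of the original script:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Hypothetical multi-threaded variant of http_down_p (not in the original script).
import threading
import httpdown_p

def http_down_threaded(urls, workers=4):
    lock = threading.Lock()
    pending = list(urls)

    def worker():
        while True:
            with lock:                       # take the next URL under a lock
                if not pending:
                    return
                url = pending.pop()
            httpdown_p.http_down(url)        # reuse the original single-URL downloader

    threads = [threading.Thread(target=worker) for _ in range(workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()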
analyseurl.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib2, re, string, sys
from time import sleep
import md5
import json

sufixs = ['.html', 'htm']

def analyse_url(url):
    # Scan the page line by line and collect absolute sub-page URLs
    # of the form "http://...html" or "http://...htm".
    pic_urls = []
    try:
        http_file = urllib2.urlopen(url)
        lines = http_file.readlines()
        for line in lines:
            new_line = string.lower(line)
            begin = new_line.find("http://")
            if (begin > 0):
                end_index = new_line.find("\"", begin)
                end_str = ''
                end = -1
                for suf in sufixs:
                    end = new_line.find(suf, begin, end_index)
                    if (end > 0):
                        end_str = suf
                        break
                end = end + len(end_str)
                if (begin > 0 and end > 0 and begin < end):
                    pic_url = line[begin:end]
                    pic_urls.append(pic_url)
        http_file.close()
    except Exception, e:
        print 'error: ', e
    return pic_urls

if __name__ == '__main__':
    if (len(sys.argv) < 2):
        print "Usage: analyseurl.py url"
    else:
        result = analyse_url(sys.argv[1])
        for item in result:
            print item
analysepic.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib2, re, string, sys
from time import sleep
import md5
import json

sufixs = ['.jpg', 'jpeg', '.png', '.gif', '.bmp']

def get_one_pic(l):
    # Find the first image URL in the string l.
    # Returns (url, end_index) on success, ("", -1) if nothing is found.
    new_line = string.lower(l)
    index = -1
    end_str = ''
    for suf in sufixs:
        index = new_line.find(suf)
        if (index > 0):
            end_str = suf
            break
    if (index > 0):
        begin = l.rfind("http:", l.rfind("\"", 0, index), index)
        end = index + len(end_str)
        if (begin > 0 and end > 0 and begin < end):
            pic = l[begin:end]
            return (pic, end)
    return ("", -1)

def analyse_pic(url):
    # Collect all image URLs found in the page, one line at a time.
    pic_urls = []
    try:
        http_file = urllib2.urlopen(url)
        lines = http_file.readlines()
        for line in lines:
            pic_url, end_index = get_one_pic(line)
            while (len(pic_url) > 0):
                pic_urls.append(pic_url)
                print pic_url
                line = line[end_index:]
                pic_url, end_index = get_one_pic(line)
        http_file.close()
    except Exception, e:
        print 'error: ', e
    return pic_urls

if __name__ == '__main__':
    if (len(sys.argv) < 2):
        print "Usage: analysepic.py url"
    else:
        result = analyse_pic(sys.argv[1])
        for item in result:
            print item
4. Run results