Yahoo Search APIで画像取得(もっと綺麗に)
前のエントリで書いた画像取得スクリプトを改善。
- マルチバイト文字に対応。
- 保存先ディレクトリが区切り文字で終わっていなければ補完する
- 最大で950の画像ファイルを取得する
- 前よりソースが綺麗
#!/usr/bin/env python
"""Download images found through the Yahoo! JAPAN image search web API.

Usage: <script> query dir

The script pages through the search results (50 per request, start offsets
1, 51, ... 901, i.e. at most 950 hits) and saves every hit into ``dir``.
It runs on Python 2 and Python 3: imports are resolved for either module
layout and ``print`` is only used in its single-argument form.
"""
import os
import sys

try:  # Python 2 module layout
    import urllib2
    import urlparse
    from urllib import quote as _quote, urlretrieve as _urlretrieve
    _urlopen = urllib2.urlopen
    _urlsplit = urlparse.urlparse
except ImportError:  # Python 3 module layout
    from urllib.parse import quote as _quote, urlparse as _urlsplit
    from urllib.request import urlopen as _urlopen
    from urllib.request import urlretrieve as _urlretrieve

try:
    from xml.etree.cElementTree import XML, ElementTree
except ImportError:  # cElementTree no longer exists on recent Python 3
    from xml.etree.ElementTree import XML, ElementTree

# Placeholder: replace with your own application id, or call setappid().
_appid = "yourid"

_BASE_URL = "http://api.search.yahoo.co.jp/ImageSearchService/V1/imageSearch"
_PAGE_SIZE = 50     # results requested per API call by main()
_LAST_START = 901   # highest "start" offset that is requested

_default = {"appid": _appid, "query": "", "type": "all", "results": 10,
            "start": 1, "format": "any", "adult_ok": 0, "coloration": "any"}


def _makeurl(dic):
    """Build the request URL from the parameter dict.

    Values are inserted as-is, so the caller must already have URL-quoted
    anything that needs it (main() quotes the query).
    """
    follow = "&".join("%s=%s" % (key, dic[key]) for key in dic)
    return _BASE_URL + "?" + follow


def default():
    """Return a fresh copy of the default parameters with the current appid.

    A copy is returned so that callers can modify it without altering the
    module-level defaults.
    """
    params = dict(_default)
    params["appid"] = _appid
    return params


def setappid(id):
    """Set the application id used for subsequent requests."""
    global _appid  # without this the assignment would only bind a local
    _appid = id
    _default["appid"] = id


def _getxml(dic):
    """Perform one search request and return the raw response body."""
    response = _urlopen(_makeurl(dic))
    try:
        return response.read()
    finally:
        response.close()


def _fetch_urls(dic):
    """Return the text of the third child of every element in the response.

    NOTE(review): the original comment calls this value the "ClickUrl";
    confirm against the API's response schema.
    """
    etree = XML(_getxml(dic))
    return [x[2].text for x in etree]


def _filter_fname(url):
    """Return the last path component of ``url`` (may be empty)."""
    return os.path.basename(_urlsplit(url)[2])


def fetch_all_url(dic):
    """Collect result URLs for every page of the search described by ``dic``.

    ``dic["results"]`` is used as the page size; "start" runs from 1 up to
    901 in steps of that size. Paging stops early at the first empty page.
    The caller's dict is not modified.
    """
    params = dict(dic)
    params.setdefault("appid", _appid)
    step = int(params.get("results", _default["results"]))
    lst = []
    for start in range(1, _LAST_START + 1, step):
        params["start"] = start
        page = _fetch_urls(params)
        if not page:
            # No more hits: further pages would only waste requests.
            break
        lst += page
    return lst


def main(query="python", dir="./"):
    """Search for ``query`` and download every hit into directory ``dir``.

    Prints progress to stdout. Returns None; a failed download is reported
    and skipped rather than aborting the run.
    """
    if not os.path.isdir(dir):
        print("%s is not directory" % dir)
        return

    dic = default()
    dic["query"] = _quote(query)
    dic["results"] = _PAGE_SIZE

    sys.stdout.write("filling urls... ")
    sys.stdout.flush()
    urls = fetch_all_url(dic)
    print("done")
    sys.stdout.flush()

    if not urls:
        print("no image found")
        return
    print(urls[0])

    for url in urls:
        if not url:
            # Result element without URL text: nothing to download.
            continue
        # os.path.join adds the separator only when ``dir`` lacks one.
        target = os.path.join(dir, _filter_fname(url))
        try:
            _urlretrieve(url, target)
        except Exception:
            # Best effort: report and go on, but let Ctrl-C through.
            print("failure to get %s" % url)
        else:
            print("get %s" % url)


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("usage %s query dir" % os.path.basename(sys.argv[0]))
        sys.exit(1)
    main(sys.argv[1], sys.argv[2])