yahoo search apiで画像取得(もっと綺麗に)

前のエントリで書いた画像取得スクリプトを改善。

  • マルチバイト文字に対応。
  • 保存先ディレクトリが区切り文字で終わっていなければ補完する
  • 最大で950の画像ファイルを取得する
  • 前よりソースが綺麗
#!/usr/bin/env python
import os, sys, urllib, urllib2, urlparse
from xml.etree.cElementTree import XML, ElementTree

# Yahoo! application ID.  The original post published this as the bare
# name `yourid`, which raises NameError on import; use a string
# placeholder instead and call setappid() with a real ID before use.
_appid = "yourid"

# Default parameter set for the image-search API request.  `results`
# is the page size, `start` the 1-based index of the first hit.
_default = {"appid":_appid, "query":"", "type":"all", "results":10,
            "start":1, "format":"any", "adult_ok": 0,
            "coloration":"any"}

def _makeurl(dic):
    """Build the image-search request URL from a parameter dict."""
    base = "http://api.search.yahoo.co.jp/ImageSearchService/V1/imageSearch"
    # %s applies str() to each value, so ints and strings both work.
    query = "&".join("%s=%s" % (key, dic[key]) for key in dic)
    return base + "?" + query

def default():
    """Return a fresh copy of the default API parameters.

    The original returned the module-level ``_default`` dict itself,
    so every caller mutated the shared defaults in place (``main``
    writes ``query``/``results`` into it).  A shallow copy is enough
    because all values are immutable strings and ints.
    """
    return dict(_default)

def setappid(id):
    """Register the Yahoo! application ID used for API requests.

    The original assignment created a function-local ``_appid`` and
    silently discarded the value; ``global`` makes it actually update
    the module-level ID.
    """
    global _appid
    _appid = id

def _getxml(dic):
    """Fetch and return the raw XML response for the given parameters.

    The original never closed the urlopen response (a socket leak when
    called ~19 times per search); py2 urlopen objects are not context
    managers, so close explicitly in a finally block.
    """
    response = urllib2.urlopen(_makeurl(dic))
    try:
        return response.read()
    finally:
        response.close()

def _fetch_urls(dic):
    """Return the ClickUrl text of every Result in one API response."""
    root = XML(_getxml(dic))
    urls = []
    for result in root:
        # Child index 2 of each Result element is ClickUrl (positional).
        urls.append(result[2].text)
    return urls

def _filter_fname(url):
    """Return the basename of the path component of *url*."""
    # .path is the same field as index [2] of the parse result.
    return os.path.basename(urlparse.urlparse(url).path)

def fetch_all_url(dic):
    """Collect image URLs page by page, for start indices 1..901.

    The page size is taken from ``dic["results"]``; with results=50 the
    starts are 1, 51, ..., 901 (19 pages, at most 950 images).  The
    original wrote the appid into the shared module dict ``_default``,
    which only worked because callers happened to pass that very dict;
    write it into the dict actually used for the requests instead.
    """
    dic["appid"] = _appid
    urls = []
    for start in range(1, 902, dic["results"]):
        dic["start"] = start
        urls += _fetch_urls(dic)
    return urls

def main(query = "python", dir="./"):
    if not os.path.isdir(dir):
        print dir, "is not directory"
        return
    
    if dir[-1] != os.sep:
        dir += os.sep
    
    dic = default()
    dic["query"] = urllib.quote(query)
    dic["result"] = 50

    print "filling urls...",
    sys.stdout.flush()
    urls = fetch_all_url(dic)
    print "done"
    sys.stdout.flush()

    print urls[0]
    for url in urls:
        base = urlparse.urlparse(url)[2]
        fname = os.path.basename(base)
        try:
            urllib.urlretrieve(url, dir + fname)
            print "get %s" % url
        except:
            print "failure to get", url
    
        
if __name__ == '__main__':
    if len(sys.argv) == 1:
        print "usage %s query dir" % os.path.basename(sys.argv[0])
    main(sys.argv[1], sys.argv[2])