-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathflickr_scraper.py
More file actions
87 lines (68 loc) · 3.04 KB
/
flickr_scraper.py
File metadata and controls
87 lines (68 loc) · 3.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# -*- coding: utf-8 -*-
import StringIO
import urllib

import numpy
import scipy
from PIL import Image

import flickr
class FlickrScraper(object):
    """Fetch photo thumbnail URLs from the Flickr API and normalise the
    downloaded images to uint8 arrays of shape (1, 75, 75, 3).

    Depends on the ``flickr`` wrapper module and ``urllib.urlretrieve``
    (both imported at module level).
    """

    def __init__(self):
        # SECURITY NOTE(review): API credentials are hard-coded in source;
        # they should come from the environment or a config file instead.
        api_key = 'f49f4273470a5bb1504142d20aeee4d6'
        api_secret = '0156db5ea3a6a21b'
        flickr.API_KEY = api_key
        flickr.API_SECRET = api_secret
        # Required attribution per the Flickr API terms of service.
        print("This product uses the Flickr API but is not endorsed or certified by Flickr.")

    def get_url_proper(self, photo):
        """Return the authoritative square-thumbnail URL.

        Accurate but slow: one extra API call per photo.
        """
        return photo.getURL(size='Square', urlType='source')

    def get_url(self, photo):
        """Build the square-thumbnail URL directly from the photo's fields.

        Fast (no API call). NOTE(review): the farm number ``6`` is
        hard-coded; photos hosted on another farm may yield a dead URL --
        confirm against the photo's farm attribute if available.
        """
        return u'https://farm6.staticflickr.com/{:s}/{:s}_{:s}_s.jpg'.format(photo.server, photo.id, photo.secret)

    def scrapeTag(self, tags, per_page, page=1, sort='interestingness=desc'):
        """Search one page of photos by tag; return their thumbnail URLs."""
        photos = flickr.photos_search(tags=tags, per_page=per_page, page=page, sort=sort)
        return [self.get_url(photo) for photo in photos]

    def fetchFiles(self, urls):
        """Download each URL to a temporary file; return the local paths.

        (Local renamed from ``file``, which shadowed the builtin.)
        """
        paths = []
        for url in urls:
            local_path, _headers = urllib.urlretrieve(url)
            paths.append(local_path)
        return paths

    def imageToArray(self, im):
        """Convert an image (or any array-like) to uint8 shape (1, 75, 75, 3).

        Greyscale input is replicated across three channels; an alpha
        channel is dropped; anything that is not exactly 75x75x3 after
        that is rejected and replaced by an all-zero array.
        (Uses ``numpy`` directly: ``scipy.array``/``scipy.zeros`` were
        deprecated NumPy re-exports, removed in SciPy 1.9.)
        """
        arr = numpy.array(im)
        if len(arr.shape) == 2:
            # Greyscale: replicate the single channel into R, G and B so
            # downstream code never has to special-case channel count.
            arr = arr.reshape((arr.shape[0], arr.shape[1], 1))
            arr = numpy.concatenate((arr, arr, arr), axis=2)
        elif arr.shape[2] == 4:
            # Image with an alpha channel: keep RGB only.
            arr = arr[:, :, :3]
        if arr.size != 75 * 75 * 3:  # the exact size downstream code expects
            print("Array rejected with shape %s" % (arr.shape,))
            arr = numpy.zeros((75, 75, 3), dtype=numpy.uint8)
        return arr.reshape((1, 75, 75, 3))

    def fetchFileData(self, url, filename=None, max_retries=None):
        """Download ``url`` and return it as a (1, 75, 75, 3) uint8 array.

        Retries on failure. ``max_retries=None`` keeps the original
        retry-forever behaviour; a positive int bounds the attempts, in
        which case None is returned once the budget is exhausted.
        If ``filename`` is given the download is stored there instead of
        a temporary file.
        """
        attempts = 0
        arr = None
        while arr is None:
            try:
                if filename is None:
                    im = Image.open(urllib.urlretrieve(url)[0])
                else:
                    im = Image.open(urllib.urlretrieve(url, filename)[0])
                arr = self.imageToArray(im)
            except Exception:
                # Narrowed from a bare ``except`` so KeyboardInterrupt and
                # SystemExit propagate instead of being swallowed.
                arr = None
                attempts += 1
                if max_retries is not None and attempts >= max_retries:
                    break
        return arr
class FlickrScraperDummy(FlickrScraper):
    """Offline stand-in for FlickrScraper that serves images from disk.

    Every network-facing method raises NotImplementedError; only
    ``fetchFileData`` works, reading the file ``self.path + url``.
    """

    def __init__(self, path):
        # Deliberately does NOT call FlickrScraper.__init__: no API keys
        # are set and no flickr connection is made.
        self.path = path
        print("Using stored images from location : %s" % self.path)

    def get_url_proper(self, photo):
        raise NotImplementedError("Dummy scraper: no flickr connection!")

    def scrapeTag(self, tags, per_page, page=1, sort='interestingness=desc'):
        raise NotImplementedError("Dummy scraper: no flickr connection!")

    def fetchFiles(self, urls):
        raise NotImplementedError("Dummy scraper: no flickr connection!")

    def fetchFileData(self, url, filename=None):
        # ``url`` is treated as a file name appended to ``self.path``
        # (plain string concatenation, so ``path`` must carry its own
        # trailing separator); ``filename`` is accepted for interface
        # compatibility but unused.
        im = Image.open(self.path + url)
        return self.imageToArray(im)