You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
122 lines
5.0 KiB
122 lines
5.0 KiB
# -*- coding: utf-8 -*-
|
|
##############################################################################
|
|
#
|
|
# Cybrosys Technologies Pvt. Ltd.
|
|
# Copyright (C) 2025-TODAY Cybrosys Technologies(<http://www.cybrosys.com>).
|
|
# Author: Vishnu kp(<https://www.cybrosys.com>)
|
|
# you can modify it under the terms of the GNU AFFERO GENERAL
|
|
# PUBLIC LICENSE (AGPL v3), Version 3.
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU AFFERO GENERAL PUBLIC LICENSE (AGPL v3) for more details.
|
|
#
|
|
# You should have received a copy of the GNU AFFERO GENERAL PUBLIC
|
|
# LICENSE (AGPL v3) along with this program.
|
|
# If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
##############################################################################
|
|
import imghdr
|
|
import posixpath
|
|
import re
|
|
import urllib.request
|
|
import urllib
|
|
|
|
|
|
class Bing:
|
|
"""Download images from bing"""
|
|
def __init__(self, query, limit, output_dir, adult, timeout, filter='',
|
|
verbose=True):
|
|
self.download_count = 0
|
|
self.query = query
|
|
self.output_dir = output_dir
|
|
self.adult = adult
|
|
self.filter = filter
|
|
self.verbose = verbose
|
|
self.seen = set()
|
|
assert isinstance(limit, int), "limit must be integer"
|
|
self.limit = limit
|
|
assert isinstance(timeout, int), "timeout must be integer"
|
|
self.timeout = timeout
|
|
|
|
self.page_counter = 0
|
|
self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
|
|
'AppleWebKit/537.11 (KHTML, like Gecko) '
|
|
'Chrome/23.0.1271.64 Safari/537.11',
|
|
'Accept': 'text/html,application/xhtml+xml,'
|
|
'application/xml;q=0.9,*/*;q=0.8',
|
|
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
|
|
'Accept-Encoding': 'none',
|
|
'Accept-Language': 'en-US,en;q=0.8',
|
|
'Connection': 'keep-alive'}
|
|
|
|
def get_filter(self, shorthand):
|
|
"""Get the filter from bing"""
|
|
if shorthand in ["line", "linedrawing"]:
|
|
return "+filterui:photo-linedrawing"
|
|
elif shorthand == "photo":
|
|
return "+filterui:photo-photo"
|
|
elif shorthand == "clipart":
|
|
return "+filterui:photo-clipart"
|
|
elif shorthand in ["gif", "animatedgif"]:
|
|
return "+filterui:photo-animatedgif"
|
|
elif shorthand == "transparent":
|
|
return "+filterui:photo-transparent"
|
|
else:
|
|
return ""
|
|
|
|
def save_image(self, link, file_path):
|
|
"""Save image to directory"""
|
|
request = urllib.request.Request(link, None, self.headers)
|
|
image = urllib.request.urlopen(request, timeout=self.timeout).read()
|
|
if not imghdr.what(None, image):
|
|
raise ValueError('Invalid image, not saving {}\n'.format(link))
|
|
with open(str(file_path), 'wb') as path_string:
|
|
path_string.write(image)
|
|
|
|
def download_image(self, link):
|
|
"""Download the images using the url obtained"""
|
|
self.download_count += 1
|
|
# Get the image link
|
|
try:
|
|
path = urllib.parse.urlsplit(link).path
|
|
filename = posixpath.basename(path).split('?')[0]
|
|
file_type = filename.split(".")[-1]
|
|
if file_type.lower() not in ["jpe", "jpeg", "jfif", "exif", "tiff",
|
|
"gif", "bmp", "png", "webp", "jpg"]:
|
|
file_type = "jpg"
|
|
|
|
self.save_image(link, self.output_dir.joinpath("Image_{}.{}".format(
|
|
str(self.download_count), file_type)))
|
|
if self.verbose:
|
|
return link
|
|
|
|
except Exception:
|
|
self.download_count -= 1
|
|
self.seen.remove(link)
|
|
|
|
def run(self):
|
|
"""run the download function"""
|
|
while self.download_count < self.limit:
|
|
request_url = 'https://www.bing.com/images/async?q=' \
|
|
+ urllib.parse.quote_plus(self.query) \
|
|
+ '&first=' + str(self.page_counter) + '&count=' \
|
|
+ str(self.limit) \
|
|
+ '&adlt=' + self.adult + '&qft=' + (
|
|
'' if self.filter is None else self.get_filter(
|
|
self.filter))
|
|
request = urllib.request.Request(request_url, None,
|
|
headers=self.headers)
|
|
response = urllib.request.urlopen(request)
|
|
html = response.read().decode('utf8')
|
|
if html == "":
|
|
break
|
|
links = re.findall('murl":"(.*?)"', html)
|
|
|
|
for link in links:
|
|
if self.download_count < self.limit and link not in self.seen:
|
|
self.seen.add(link)
|
|
self.download_image(link)
|
|
self.page_counter += 1
|
|
return self.seen
|
|
|