# -*- coding: utf-8 -*-
import sys
import time
import urllib
from random import choice, randint

import requesocks
from bs4 import BeautifulSoup
from stem import Signal
from stem.control import Controller


# Return a random user agent to initiate HTTP requests with.
def getUserAgent():
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
        'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    ]
    return choice(user_agents)


# Ask the local Tor controller for a new circuit, and therefore a new exit IP.
def restart_Con():
    with Controller.from_port(port=9051) as controller:
        controller.authenticate(password='SUPERPASSWORD')
        controller.signal(Signal.NEWNYM)
        # The 'with' block closes the controller automatically; no explicit
        # close() is needed.


# Crawl the search page of ucaku.com through Tor.
def check_UCAKU(searchstring):
    print("Crawling Ucaku......")
    session = requesocks.session()
    response = session.get('http://httpbin.org/ip')
    print("Original IP is: " + response.text.replace("\n", ""))

    # Route all traffic through the local Tor SOCKS listener.
    session.proxies = {'http': 'socks5://localhost:9050',
                       'https': 'socks5://localhost:9050'}
    response = session.get('http://httpbin.org/ip')
    print("(+) Tor IP is: " + response.text.replace("\n", ""))

    q_search = urllib.quote(searchstring)
    weblink = "http://www.ucaku.com/?s=" + q_search
    print(weblink)

    headers = {'User-Agent': getUserAgent()}
    try:
        # verify=False skips TLS certificate checks; the timeout keeps the
        # crawler from hanging on a dead circuit.
        r = session.get(weblink, headers=headers, verify=False, timeout=30)
        soup = BeautifulSoup(r.text, 'html.parser')
        # attrs must be a dict ({"class": ...}), not a set ({"class", ...}).
        search_result_data = soup.find_all("div", {"class": "post_text_inner"})
        for post in search_result_data:
            heading = post.find("h2")
            if heading and heading.find("a"):
                print(heading.find("a").text)
    except Exception:
        print("No item found")


# Python 2 workaround so printing non-ASCII titles does not raise
# UnicodeEncodeError.
reload(sys)
sys.setdefaultencoding('utf8')

search_items = ['crawl', 'data', 'application']
for item in search_items:
    restart_Con()
    check_UCAKU(item)
    time.sleep(randint(3, 8))  # make it look more like human searching
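
# ------------------------------------------------------------------
# Setup note: the script assumes a local Tor daemon with the default
# SOCKS listener on port 9050 and the ControlPort open on 9051 with
# password authentication. A minimal torrc sketch (the hashed value
# is elided; generate your own with `tor --hash-password SUPERPASSWORD`):
#
#     ControlPort 9051
#     HashedControlPassword 16:...
#
# 'SUPERPASSWORD' is the placeholder used in restart_Con() above, not
# a real secret.
# ------------------------------------------------------------------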
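
# Tor rate-limits NEWNYM signals (roughly one accepted circuit change
# every ten seconds or so), so back-to-back restart_Con() calls may
# silently reuse the same exit IP. A minimal sketch of a more patient
# variant, assuming a recent stem release that provides
# is_newnym_available() and get_newnym_wait():
def restart_Con_patient():
    with Controller.from_port(port=9051) as controller:
        controller.authenticate(password='SUPERPASSWORD')
        if not controller.is_newnym_available():
            # Wait until Tor is willing to build a fresh circuit.
            time.sleep(controller.get_newnym_wait())
        controller.signal(Signal.NEWNYM)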
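
# requesocks is unmaintained; modern requests (2.10+) speaks SOCKS
# directly once installed as `pip install requests[socks]`. A minimal
# sketch of the same proxied session; note the socks5h:// scheme,
# which also resolves DNS through Tor and so avoids DNS leaks. The
# function name is illustrative, not part of any library.
def tor_session():
    # Lazy import so the Python 2 script above still runs without requests.
    import requests
    session = requests.Session()
    session.proxies = {'http': 'socks5h://localhost:9050',
                       'https': 'socks5h://localhost:9050'}
    return session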