博客
关于我
强烈建议你试试无所不能的 ChatGPT，快点击我
切换窗口
阅读量:6039 次
发布时间:2019-06-20

本文共 4249 字，大约阅读时间需要 14 分钟。

#!/usr/bin/env python

# -*- coding:utf-8 -*-
from multiprocessing import Pool
import os, time, random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from pyquery import PyQuery as pq
import pymysql
import asdl

class class_name(object):
    """Holder for one lazily opened MySQL connection shared via a class variable."""

    # Class variable: the string 'null' until the first connection is opened,
    # afterwards the pymysql connection object returned by pymysql.connect().
    class_var = 'null'

    @staticmethod
    def static_method():
        """Return the shared connection, opening it on the first call.

        Prints a marker line on every call: one marker when a new
        connection has just been created, another when the cached one is
        reused.
        """
        if class_name.class_var != 'null':
            # A connection was already stored by an earlier call; reuse it.
            print('------------------')
            return class_name.class_var

        # NOTE(review): host, user and password are hard-coded here;
        # consider moving them to configuration.
        connection = pymysql.connect(
            host="47.94.36.26",
            user="seo",
            passwd='djAcfKNHxF',
            db='seo',
            charset='utf8',
        )
        class_name.class_var = connection
        print('11111111111111111111111111')
        return class_name.class_var
# Update the stored ranking data for one task
def save_num(num, i):
    """Store the latest rank for a task row and increment its click counter.

    Updates the ``ganen_seo_task`` row whose ``id`` equals ``i['id']``:
    ``new_rank`` is set to ``num``, ``rang`` to the previously stored
    ``new_rank`` minus ``num``, ``total_rang`` to ``initial_rank - num`` and
    ``click`` is incremented. When the task's ``initial_rank`` is 0 (or
    missing) it is set to ``num`` as well.

    Args:
        num: The rank to record (callers in this file pass 0 when the
            domain was not found).
        i: Mapping describing the task; the keys ``id`` and
            ``initial_rank`` are read.

    Database errors are rolled back and reported on stdout; they are not
    re-raised, so the update stays best-effort.
    """
    rank = int(num)
    task_id = i.get('id')
    # A missing/NULL initial_rank is treated like 0 ("not recorded yet")
    # instead of crashing in int().
    initial = int(i.get('initial_rank') or 0)

    db = class_name.static_method()

    # MySQL evaluates single-table UPDATE assignments from left to right,
    # so `rang` has to be computed before `new_rank` is overwritten;
    # otherwise `new_rank - rank` is always 0.
    # NOTE(review): assumes `rang` is meant to be "previous rank minus
    # current rank" -- confirm against how the column is consumed.
    assignments = ["rang = new_rank - %s", "new_rank = %s"]
    params = [rank, rank]
    if initial == 0:
        # First recorded rank: it also becomes the baseline, and must be
        # assigned before total_rang reads initial_rank.
        assignments.append("initial_rank = %s")
        params.append(rank)
    assignments.append("total_rang = initial_rank - %s")
    params.append(rank)
    assignments.append("click = click+1")
    params.append(task_id)

    sql = "update ganen_seo_task set " + " , ".join(assignments) + " where id = %s"
    print(sql, params)

    cursor = db.cursor(cursor=pymysql.cursors.DictCursor)
    try:
        # Let the driver escape the values instead of concatenating them.
        cursor.execute(sql, params)
        db.commit()
        print('ok')
    except Exception as exc:
        # Roll back on failure, but say what went wrong instead of
        # swallowing the error silently.
        db.rollback()
        print('save_num failed:', exc)
    finally:
        cursor.close()
def find(wait, url, page_limit, i_data, driver, results_timeout=8):
    """Page through the search results looking for ``url`` and record its rank.

    For every result page the direct ``div`` children of ``.results`` are
    scanned for a ``cite`` element containing ``url``. On a match with a
    non-empty ``h3`` title, the link with that title is clicked, the rank
    ``(page - 1) * 10 + position`` is saved through ``save_num``, and the
    newest window is scrolled to the bottom and back to the top. If nothing
    matches, the ``.np`` element is clicked to load the next page until the
    page number shown in ``#pagebar_container > span`` reaches
    ``page_limit``; then rank 0 is saved.

    Args:
        wait: ``WebDriverWait`` bound to ``driver``.
        url: Text expected inside the result's ``cite`` element.
        page_limit: Highest page number to inspect.
        i_data: Task mapping passed through to ``save_num``.
        driver: Selenium driver; it is always quit before returning.
        results_timeout: Seconds to wait for the result list of a page to
            appear before giving up.

    Raises:
        TimeoutException: If the result list or an awaited element does not
            appear in time.
    """
    try:
        # One iteration per result page (the original recursed per page).
        while True:
            # Poll the page source until the result list is rendered, but
            # never forever: a page without `.results` would hang the worker.
            deadline = time.monotonic() + results_timeout
            while True:
                time.sleep(0.001)
                doc = pq(driver.page_source)
                div = doc(doc.html()).find('.results').children('div')
                if div.length:
                    break
                if time.monotonic() >= deadline:
                    raise TimeoutException(
                        'search results did not appear within %s seconds'
                        % results_timeout
                    )

            page = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#pagebar_container > span')))
            p = int(page.text)

            # Look for the first result whose <cite> mentions the domain and
            # which has a non-empty title; n is its 1-based position.
            title = ''
            n = 0
            for item in div:
                n += 1
                if doc(item).find('cite:contains("' + url + '")'):
                    title = doc(item).find('h3').text()
                    if title:
                        break

            if title:
                content = wait.until(EC.presence_of_element_located(
                    (By.PARTIAL_LINK_TEXT, title)))
                content.click()
                # The rank formula uses 10 results per page.
                save_num((p - 1) * 10 + n, i_data)
                time.sleep(3)
                # Switch to the most recently opened window and scroll it.
                driver.switch_to.window(driver.window_handles[-1])
                driver.execute_script(
                    'window.scrollTo(0,document.body.scrollHeight)')
                time.sleep(2)
                driver.execute_script('window.scrollTo(0,0)')
                time.sleep(1)
                return

            if p < page_limit:
                # Not on this page: move on to the next one.
                but = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '.np')))
                but.click()
                continue

            # Page limit reached without a match: record rank 0.
            save_num(0, i_data)
            return
    finally:
        # Release the browser on every exit path, including exceptions.
        driver.quit()

def find_yzm(wait, driver):
    """Check that the results page is usable by waiting for the ``.np`` element.

    Args:
        wait: ``WebDriverWait`` used to wait for the element.
        driver: Selenium driver; it is quit when the wait fails.

    Returns:
        1 when the ``.np`` element became clickable, otherwise ``None``
        (after quitting the driver).

    NOTE(review): "yzm" presumably abbreviates the Chinese word for captcha,
    i.e. a failed wait is taken to mean a captcha page -- confirm.
    """
    try:
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.np')))
        return 1
    except Exception:
        # Best effort: any failure means "do not continue", but unlike a bare
        # `except:` this lets KeyboardInterrupt/SystemExit propagate.
        driver.quit()
        return None
def long_time_task(key, i, n):
    """Run one search task in its own Chrome instance.

    Opens https://www.sogou.com, submits ``key`` in the ``query`` input and,
    if ``find_yzm`` reports the result page as usable, hands over to ``find``
    with the task's ``domain`` and ``page_limit``.

    Args:
        key: Search keywords typed into the search box.
        i: Task mapping; ``domain`` and ``page_limit`` are read here and the
            mapping itself is passed on to ``find``.
        n: Index of the task in the current batch.
    """
    # IP control: on every 2000th task index (including index 0) call
    # asdl.main() and pause briefly.
    # NOTE(review): asdl.main() presumably switches the outgoing IP -- confirm.
    if n % 2000 == 0:
        print(n)
        asdl.main()
        time.sleep(1)

    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    # `options=` replaces the deprecated `chrome_options=` keyword.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get('https://www.sogou.com')
        # find_element(By.ID, ...) replaces find_element_by_id, which newer
        # Selenium releases no longer provide.
        search_box = driver.find_element(By.ID, 'query')
        search_box.send_keys(key)
        search_box.send_keys(Keys.ENTER)
    except Exception:
        # Do not leave a stray browser behind if the search cannot be made.
        driver.quit()
        raise

    wait = WebDriverWait(driver, 8)
    # From here on find_yzm / find are responsible for quitting the driver.
    if find_yzm(wait, driver):
        find(wait, i.get('domain'), i.get('page_limit'), i, driver)

 

if __name__ == '__main__':
    # Open the database connection.
    db = class_name.static_method()
    cursor = db.cursor(cursor=pymysql.cursors.DictCursor)

    # Find the groups configured for this search engine.
    sql = 'select * from ganen_seo_group where item = "搜狗PC搜索" order by id asc'
    cursor.execute(sql)
    groups = cursor.fetchall()
    if groups:
        group_ids = [row.get('id') for row in groups]
        placeholders = ','.join(['%s'] * len(group_ids))
        # Tasks of those groups that are active and still below their click
        # limit, at most 100 per run.
        sql = ("select id,keywords,domain,initial_rank,page_limit from ganen_seo_task where gro_id in ("
               + placeholders
               + ") and status = 0 and click < click_limit limit 100")
        cursor.execute(sql, group_ids)
        results = cursor.fetchall()
        if results:
            # Close the parent's connection and reset the shared holder before
            # the pool is created: forked workers would otherwise inherit and
            # concurrently use the same MySQL socket. Each worker then opens
            # its own connection on first use.
            cursor.close()
            db.close()
            class_name.class_var = 'null'

            p = Pool(8)
            pending = [
                p.apply_async(long_time_task, args=(row.get('keywords'), row, n))
                for n, row in enumerate(results)
            ]
            p.close()
            p.join()

            # apply_async stores worker exceptions in the result object;
            # fetch them so failures are visible instead of silently lost.
            for res in pending:
                try:
                    res.get()
                except Exception as exc:
                    print('task failed:', exc)
        else:
            print('没有需要执行的任务')
    else:
        print('搜狗PC搜索没有值')

 

转载于:https://www.cnblogs.com/simadongyang/p/8971253.html

你可能感兴趣的文章
php配置文件php.ini中文详解
查看>>
关于Tomcat配置相关总结
查看>>
安装PDO_MYSQL遇到的问题:error: Cannot find MySQL header files under
查看>>
CocoaPods最新安装及跳过pod setup快速安装教程
查看>>
必须用C模拟OS?
查看>>
JavaScript引入
查看>>
ARM9代码分析启动MAIN.C
查看>>
JSON
查看>>
4.3 Verilog练习(2)
查看>>
浅谈html5某些新元素的用途
查看>>
csv文件的操作
查看>>
快排序算法
查看>>
坑爹的Mysql
查看>>
每天进步一点点-->函数fseek() 使用方法
查看>>
Unity Editor类常用方法
查看>>
Code Review中的几个提示
查看>>
AUC(Area Under roc Curve)学习笔记
查看>>
Flash和js交互的效率分析
查看>>
Linux高性能server编程——Linux网络基础API及应用
查看>>
office编程必不可少
查看>>